Diffstat (limited to 'usr/src/uts/common')
167 files changed, 30998 insertions, 2882 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index baeb7b0015..fa9a3a4bf4 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -21,6 +21,7 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 by Delphix. All rights reserved. # @@ -1145,8 +1146,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o +HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ @@ -1385,6 +1391,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1739,6 +1746,8 @@ IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o +IPD_OBJS += ipd.o + IBD_OBJS += ibd.o ibd_cm.o EIBNX_OBJS += enx_main.o enx_hdlrs.o enx_ibt.o enx_log.o enx_fip.o \ @@ -1995,7 +2004,12 @@ MEGA_SAS_OBJS = megaraid_sas.o # # MR_SAS module # -MR_SAS_OBJS = mr_sas.o +MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o + +# +# DR_SAS module +# +DR_SAS_OBJS = dr_sas.o # # ISCSI_INITIATOR module diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 28f7ddefae..27478a210d 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -21,6 +21,10 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# + +# +# Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. 
# @@ -242,10 +246,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -505,6 +517,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipf/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/ipd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/net/patricia/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -717,6 +733,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1744,9 +1764,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1891,6 +1917,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipnet/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/iptun/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipd/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/ipf/%.c @($(LHEAD) $(LINT.c) $(IPFFLAGS) $< $(LTAIL)) @@ -2062,6 +2091,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index d72cfb0b8f..06e7810f07 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. */ @@ -565,8 +566,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -578,7 +579,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. 
@@ -592,8 +593,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index efaf5c37d1..a11098326b 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -71,6 +71,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 @@ -900,7 +901,8 @@ rnd_handler(void *arg) int len = 0; if (!rng_prov_found && rng_ok_to_log) { - cmn_err(CE_WARN, "No randomness provider enabled for " + (void) strlog(0, 0, 0, SL_NOTE, + "No randomness provider enabled for " "/dev/random. Use cryptoadm(1M) to enable a provider."); rng_ok_to_log = B_FALSE; } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index f461fe048c..8b2760b237 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. */ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..68be78a84f 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. 
When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. + * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -345,7 +382,7 @@ cap_disable(list_t *l, cpucap_t *cap) cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; if (cap->cap_kstat != NULL) { @@ -487,6 +524,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +533,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, its + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. 
+ */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there is unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +745,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +862,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +873,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. 
+ */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1120,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1128,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1166,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. 
* @@ -1045,7 +1283,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1297,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1133,6 +1371,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1384,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index be92ba108b..9afcd81239 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -105,7 +109,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. */ int only_intr_kpreempt; @@ -252,7 +256,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitive). 
+ */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; diff --git a/usr/src/uts/common/disp/fss.c b/usr/src/uts/common/disp/fss.c index 62301d65d8..c1c7da06ec 100644 --- a/usr/src/uts/common/disp/fss.c +++ b/usr/src/uts/common/disp/fss.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -54,6 +55,179 @@ #include <sys/cpucaps.h> /* + * The fair share scheduling class ensures that collections of processes + * (zones and projects) each get their configured share of CPU. This is in + * contrast to the TS class which considers individual processes. + * + * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on + * projects using the project.cpu-shares rctl. By default the value is 1 + * and it can range from 0 - 64k. A value of 0 means that processes in the + * collection will only get CPU resources when there are no other processes + * that need CPU. The cpu-share is used as one of the inputs to calculate a + * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls + * in the range 0-59. FSS calculates other, internal, priorities which are not + * visible outside of the FSS class. + * + * The FSS class should approximate TS behavior when there are excess CPU + * resources. When there is a backlog of runnable processes, then the share + * is used as input into the runnable process's priority calculation, where + * the final umdpri is used by the scheduler to determine when the process runs. + * + * Projects in a zone compete with each other for CPU time, receiving CPU + * allocation within a zone proportional to the project's share; at a higher + * level zones compete with each other, receiving allocation in a pset + * proportional to the zone's share. + * + * The FSS priority calculation consists of several parts. + * + * 1) Once per second the fss_update function runs. The first thing it does is + * call fss_decay_usage. This function does three things. + * + * a) fss_decay_usage first decays the maxfsspri value for the pset. This + * value is used in the per-process priority calculation described in step + * (2b). The maxfsspri is decayed using the following formula: + * + * maxfsspri * fss_nice_decay[NZERO]) + * maxfsspri = ------------------------------------ + * FSS_DECAY_BASE + * + * + * - NZERO is the default process priority (i.e. 20) + * + * The fss_nice_decay array is a fixed set of values used to adjust the + * decay rate of processes based on their nice value. Entries in this + * array are initialized in fss_init using the following formula: + * + * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i + * FSS_DECAY_MIN + ------------------------------------- + * FSS_NICE_RANGE - 1 + * + * - FSS_DECAY_MIN is 82 = approximates 65% (82/128) + * - FSS_DECAY_MAX is 108 = approximates 85% (108/128) + * - FSS_NICE_RANGE is 40 (range is 0 - 39) + * + * b) The second thing fss_decay_usage does is update each project's "usage" + * for the last second and then recalculates the project's "share usage". + * + * The usage value is the recent CPU usage for all of the threads in the + * project. 
It is decayed and updated this way: + * + * (usage * FSS_DECAY_USG) + * usage = ------------------------- + ticks; + * FSS_DECAY_BASE + * + * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide + * - FSS_DECAY_USG is 96 - approximates 75% (96/128) + * - ticks is updated whenever a process in this project is running + * when the scheduler's tick processing fires. This is not a simple + * counter, the values are based on the entries in the fss_nice_tick + * array (see section 3 below). ticks is then reset to 0 so it can track + * the next seconds worth of nice-adjusted time for the project. + * + * c) The third thing fss_decay_usage does is update each project's "share + * usage" (shusage). This is the normalized usage value for the project and + * is calculated this way: + * + * pset_shares^2 zone_int_shares^2 + * usage * ------------- * ------------------ + * kpj_shares^2 zone_ext_shares^2 + * + * - usage - see (1b) for more details + * - pset_shares is the total of all *active* zone shares in the pset (by + * default there is only one pset) + * - kpj_shares is the individual project's share (project.cpu-shares rctl) + * - zone_int_shares is the sum of shares of all active projects within the + * zone (the zone-internal total) + * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl) + * + * The shusage is used in step (2b) to calculate the thread's new internal + * priority. A larger shusage value leads to a lower priority. + * + * 2) The fss_update function then calls fss_update_list to update the priority + * of all threads. This does two things. + * + * a) First the thread's internal priority is decayed using the following + * formula: + * + * fsspri * fss_nice_decay[nice_value]) + * fsspri = ------------------------------------ + * FSS_DECAY_BASE + * + * - FSS_DECAY_BASE is 128 as described above + * + * b) Second, if the thread is runnable (TS_RUN or TS_WAIT) calls fss_newpri + * to update the user-mode priority (umdpri) of the runnable thread. + * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP) + * are not updated at this time. The updated user-mode priority can cause + * threads to change their position in the run queue. + * + * The process's new internal fsspri is calculated using the following + * formula. All runnable threads in the project will use the same shusage + * and nrunnable values in their calculation. + * + * fsspri += shusage * nrunnable * ticks + * + * - shusage is the project's share usage, calculated in (1c) + * - nrunnable is the number of runnable threads in the project + * - ticks is the number of ticks this thread ran since the last fss_newpri + * invocation. + * + * Finally the process's new user-mode priority is calculated using the + * following formula: + * + * (fsspri * umdprirange) + * umdpri = maxumdpri - ------------------------ + * maxfsspri + * + * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59) + * - umdprirange is maxumdpri - 1 (i.e. 58) + * - maxfsspri is the largest fsspri seen so far, as we're iterating all + * runnable processes + * + * Thus, a higher internal priority (fsspri) leads to a lower user-mode + * priority which means the thread runs less. The fsspri is higher when + * the project's normalized share usage is higher, when the project has + * more runnable threads, or when the thread has accumulated more run-time. + * + * This code has various checks to ensure the resulting umdpri is in the + * range 1-59. See fss_newpri for more details. 
+ * + * To reiterate, the above processing is performed once per second to recompute + * the runnable thread user-mode priorities. + * + * 3) The final major component in the priority calculation is the tick + * processing which occurs on a thread that is running when the clock + * calls fss_tick. + * + * A thread can run continuously in user-land (compute-bound) for the + * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties). + * The fss_quantum defaults to 11 (i.e. 11 ticks). + * + * Once the quantum has been consumed, the thread will call fss_newpri to + * recompute its umdpri priority, as described above in (2b). Threads that + * were T_ONPROC at the one second interval when runnable thread priorities + * were recalculated will have their umdpri priority recalculated when their + * quanta expires. + * + * To ensure that runnable threads within a project see the expected + * round-robin behavior, there is a special case in fss_newpri for a thread + * that has run for its quanta within the one second update interval. See + * the handling for the quanta_up parameter within fss_newpri. + * + * Also of interest, the fss_tick code increments the project's tick value + * using the fss_nice_tick array entry for the thread's nice value. The idea + * behind the fss_nice_tick array is that the cost of a tick is lower at + * positive nice values (so that it doesn't increase the project's usage + * as much as normal) with a 50% drop at the maximum level and a 50% + * increase at the minimum level. See (1b). The fss_nice_tick array is + * initialized in fss_init using the following formula: + * + * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i) + * -------------------------------------------------- + * FSS_NICE_RANGE + * + * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0 + * * FSS Data Structures: * * fsszone @@ -72,7 +246,6 @@ * ----- ----- ----- * fssproj * - * * That is, fsspsets contain a list of fsszone's that are currently active in * the pset, and a list of fssproj's, corresponding to projects with runnable * threads on the pset. fssproj's in turn point to the fsszone which they @@ -81,12 +254,6 @@ * An fssproj_t is removed when there are no threads in it. * * An fsszone_t is removed when there are no projects with threads in it. - * - * Projects in a zone compete with each other for cpu time, receiving cpu - * allocation within a zone proportional to fssproj->fssp_shares - * (project.cpu-shares); at a higher level zones compete with each other, - * receiving allocation in a pset proportional to fsszone->fssz_shares - * (zone.cpu-shares). See fss_decay_usage() for the precise formula. */ static pri_t fss_init(id_t, int, classfuncs_t **); @@ -186,7 +353,7 @@ static time_t fss_minrun = 2; /* t_pri becomes 59 within 2 secs */ static time_t fss_minslp = 2; /* min time on sleep queue for hardswap */ static int fss_quantum = 11; -static void fss_newpri(fssproc_t *); +static void fss_newpri(fssproc_t *, boolean_t); static void fss_update(void *); static int fss_update_list(int); static void fss_change_priority(kthread_t *, fssproc_t *); @@ -718,17 +885,55 @@ fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp) } /* - * Calculate the new cpupri based on the usage, the number of shares and - * the number of active threads. Reset the tick counter for this thread. + * Calculate the new fss_umdpri based on the usage, the normalized share usage + * and the number of active threads. Reset the tick counter for this thread. 
+ * + * When calculating the new priority using the standard formula we can hit + * a scenario where we don't have good round-robin behavior. This would be + * most commonly seen when there is a zone with lots of runnable threads. + * In the bad scenario we will see the following behavior when using the + * standard formula and these conditions: + * + * - there are multiple runnable threads in the zone (project) + * - the fssps_maxfsspri is a very large value + * - (we also know all of these threads will use the project's + * fssp_shusage) + * + * Under these conditions, a thread with a low fss_fsspri value is chosen + * to run and the thread gets a high fss_umdpri. This thread can run for + * its full quanta (fss_timeleft) at which time fss_newpri is called to + * calculate the thread's new priority. + * + * In this case, because the newly calculated fsspri value is much smaller + * (orders of magnitude) than the fssps_maxfsspri value, if we used the + * standard formula the thread will still get a high fss_umdpri value and + * will run again for another quanta, even though there are other runnable + * threads in the project. + * + * For a thread that is runnable for a long time, the thread can continue + * to run for many quanta (totaling many seconds) before the thread's fsspri + * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back + * down to 1. This behavior also keeps the fssps_maxfsspri at a high value, + * so that the next runnable thread might repeat this cycle. + * + * This leads to the case where we don't have round-robin behavior at quanta + * granularity, but instead, runnable threads within the project only run + * at several second intervals. + * + * To prevent this scenario from occurring, when a thread has consumed its + * quanta and there are multiple runnable threads in the project, we + * immediately cause the thread to hit fssps_maxfsspri so that it gets + * reset back to 1 and another runnable thread in the project can run. */ static void -fss_newpri(fssproc_t *fssproc) +fss_newpri(fssproc_t *fssproc, boolean_t quanta_up) { kthread_t *tp; fssproj_t *fssproj; fsspset_t *fsspset; fsszone_t *fsszone; fsspri_t fsspri, maxfsspri; + uint32_t n_runnable; pri_t invpri; uint32_t ticks; @@ -751,25 +956,43 @@ fss_newpri(fssproc_t *fssproc) fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); + ticks = fssproc->fss_ticks; + fssproc->fss_ticks = 0; + if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) { /* * Special case: threads with no shares. */ fssproc->fss_umdpri = fss_minglobpri; - fssproc->fss_ticks = 0; disp_lock_exit_high(&fsspset->fssps_displock); return; } - /* - * fsspri += shusage * nrunnable * ticks - */ - ticks = fssproc->fss_ticks; - fssproc->fss_ticks = 0; - fsspri = fssproc->fss_fsspri; - fsspri += fssproj->fssp_shusage * fssproj->fssp_runnable * ticks; + maxfsspri = fsspset->fssps_maxfsspri; + n_runnable = fssproj->fssp_runnable; + + if (quanta_up && n_runnable > 1) { + fsspri = maxfsspri; + } else { + /* + * fsspri += fssp_shusage * nrunnable * ticks + * If all three values are non-0, this typically calculates to + * a large number (sometimes > 1M, sometimes > 100B) due to + * fssp_shusage which can be > 1T. + */ + fsspri = fssproc->fss_fsspri; + fsspri += fssproj->fssp_shusage * n_runnable * ticks; + } + fssproc->fss_fsspri = fsspri; + /* + * fss_maxumdpri is normally 59, since FSS priorities are 0-59. + * If the previous calculation resulted in 0 (e.g. 
was 0 and added 0 + * because ticks == 0), then instead of 0, we use the largest priority, + * which is still small in comparison to the large numbers we typically + * see. + */ if (fsspri < fss_maxumdpri) fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */ @@ -783,12 +1006,16 @@ fss_newpri(fssproc_t *fssproc) * If this thread's fsspri is greater than the previous largest * fsspri, then record it as the new high and priority for this * thread will be one (the lowest priority assigned to a thread - * that has non-zero shares). + * that has non-zero shares). Because of this check, maxfsspri can + * change as this function is called via the + * fss_update -> fss_update_list -> fss_newpri code path to update + * all runnable threads. See the code in fss_update for how we + * mitigate this issue. + * * Note that this formula cannot produce out of bounds priority - * values; if it is changed, additional checks may need to be + * values (0-59); if it is changed, additional checks may need to be * added. */ - maxfsspri = fsspset->fssps_maxfsspri; if (fsspri >= maxfsspri) { fsspset->fssps_maxfsspri = fsspri; disp_lock_exit_high(&fsspset->fssps_displock); @@ -801,8 +1028,9 @@ fss_newpri(fssproc_t *fssproc) } /* - * Decays usages of all running projects and resets their tick counters. - * Called once per second from fss_update() after updating priorities. + * Decays usages of all running projects, resets their tick counters and + * calcluates the projects normalized share usage. Called once per second from + * fss_update(). */ static void fss_decay_usage() @@ -814,6 +1042,7 @@ fss_decay_usage() fsszone_t *fsszone; fsspri_t maxfsspri; int psetid; + struct zone *zp; mutex_enter(&fsspsets_lock); /* @@ -824,6 +1053,8 @@ fss_decay_usage() fsspset = &fsspsets[psetid]; mutex_enter(&fsspset->fssps_lock); + fsspset->fssps_gen++; + if (fsspset->fssps_cpupart == NULL || (fssproj = fsspset->fssps_list) == NULL) { mutex_exit(&fsspset->fssps_lock); @@ -836,6 +1067,8 @@ fss_decay_usage() */ disp_lock_enter(&fsspset->fssps_displock); + pset_shares = fsspset->fssps_shares; + maxfsspri = (fsspset->fssps_maxfsspri * fss_nice_decay[NZERO]) / FSS_DECAY_BASE; if (maxfsspri < fss_maxumdpri) @@ -843,16 +1076,31 @@ fss_decay_usage() fsspset->fssps_maxfsspri = maxfsspri; do { + fsszone = fssproj->fssp_fsszone; + zp = fsszone->fssz_zone; + /* - * Decay usage for each project running on - * this cpu partition. + * Reset zone's FSS stats if they are from a + * previous cycle. + */ + if (fsspset->fssps_gen != zp->zone_fss_gen) { + zp->zone_fss_gen = fsspset->fssps_gen; + zp->zone_run_ticks = 0; + } + + /* + * Decay project usage, then add in this cycle's + * nice tick value. */ fssproj->fssp_usage = (fssproj->fssp_usage * FSS_DECAY_USG) / - FSS_DECAY_BASE + fssproj->fssp_ticks; + FSS_DECAY_BASE + + fssproj->fssp_ticks; + fssproj->fssp_ticks = 0; + zp->zone_run_ticks += fssproj->fssp_tick_cnt; + fssproj->fssp_tick_cnt = 0; - fsszone = fssproj->fssp_fsszone; /* * Readjust the project's number of shares if it has * changed since we checked it last time. @@ -871,18 +1119,55 @@ fss_decay_usage() * Readjust the zone's number of shares if it * has changed since we checked it last time. 
*/ - zone_ext_shares = fsszone->fssz_zone->zone_shares; + zone_ext_shares = zp->zone_shares; if (fsszone->fssz_rshares != zone_ext_shares) { if (fsszone->fssz_runnable != 0) { fsspset->fssps_shares -= fsszone->fssz_rshares; fsspset->fssps_shares += zone_ext_shares; + pset_shares = fsspset->fssps_shares; } fsszone->fssz_rshares = zone_ext_shares; } zone_int_shares = fsszone->fssz_shares; - pset_shares = fsspset->fssps_shares; + + /* + * If anything is runnable in the project, track the + * overall project share percent for monitoring usage. + */ + if (fssproj->fssp_runnable > 0) { + uint32_t zone_shr_pct; + uint32_t int_shr_pct; + + /* + * Times 1000 to get tenths of a percent + * + * zone_ext_shares + * zone_shr_pct = --------------- + * pset_shares + * + * kpj_shares + * int_shr_pct = --------------- + * zone_int_shares + */ + if (pset_shares == 0 || zone_int_shares == 0) { + fssproj->fssp_shr_pct = 0; + } else { + zone_shr_pct = + (zone_ext_shares * 1000) / + pset_shares; + int_shr_pct = (kpj_shares * 1000) / + zone_int_shares; + fssproj->fssp_shr_pct = + (zone_shr_pct * int_shr_pct) / + 1000; + } + } else { + DTRACE_PROBE1(fss__prj__norun, fssproj_t *, + fssproj); + } + /* * Calculate fssp_shusage value to be used * for fsspri increments for the next second. @@ -890,10 +1175,22 @@ fss_decay_usage() if (kpj_shares == 0 || zone_ext_shares == 0) { fssproj->fssp_shusage = 0; } else if (FSSPROJ2KPROJ(fssproj) == proj0p) { + uint32_t zone_shr_pct; + /* * Project 0 in the global zone has 50% - * of its zone. + * of its zone. See calculation above for + * the zone's share percent. */ + if (pset_shares == 0) + zone_shr_pct = 1000; + else + zone_shr_pct = + (zone_ext_shares * 1000) / + pset_shares; + + fssproj->fssp_shr_pct = zone_shr_pct / 2; + fssproj->fssp_shusage = (fssproj->fssp_usage * zone_int_shares * zone_int_shares) / (zone_ext_shares * zone_ext_shares); @@ -925,6 +1222,10 @@ fss_decay_usage() * pset_shares^2 * shusage = usage * ---------------------- * zone_ext_shares^2 + * + * shusage is one input to calculating fss_pri + * in fss_newpri(). Larger values tend toward + * lower priorities for processes in the proj. */ fssproj->fssp_shusage = fssproj->fssp_usage * pset_shares * zone_int_shares; @@ -996,6 +1297,10 @@ fss_change_priority(kthread_t *t, fssproc_t *fssproc) * thread pointer. Each list has its own lock. This avoids blocking all * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs. * fss_update traverses each list in turn. + * + * Each time we're run (once/second) we may start at the next list and iterate + * through all of the lists. By starting with a different list, we mitigate any + * effects we would see updating the fssps_maxfsspri value in fss_newpri. */ static void fss_update(void *arg) @@ -1021,7 +1326,7 @@ fss_update(void *arg) do { /* * If this is the first list after the current marker to have - * threads with priorities updates, advance the marker to this + * threads with priority updates, advance the marker to this * list for the next time fss_update runs. */ if (fss_update_list(i) && @@ -1050,6 +1355,7 @@ fss_update_list(int i) fssproc_t *fssproc; fssproj_t *fssproj; fsspri_t fsspri; + pri_t fss_umdpri; kthread_t *t; int updated = 0; @@ -1073,6 +1379,7 @@ fss_update_list(int i) fssproj = FSSPROC2FSSPROJ(fssproc); if (fssproj == NULL) goto next; + if (fssproj->fssp_shares != 0) { /* * Decay fsspri value. 
@@ -1091,16 +1398,21 @@ fss_update_list(int i) */ t->t_trapret = 1; aston(t); + if (t->t_state == TS_ONPROC) + DTRACE_PROBE1(fss__onproc, fssproc_t *, + fssproc); goto next; } - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); updated = 1; + fss_umdpri = fssproc->fss_umdpri; + /* * Only dequeue the thread if it needs to be moved; otherwise * it should just round-robin here. */ - if (t->t_pri != fssproc->fss_umdpri) + if (t->t_pri != fss_umdpri) fss_change_priority(t, fssproc); next: thread_unlock(t); @@ -1624,7 +1936,7 @@ fss_forkret(kthread_t *t, kthread_t *ct) thread_lock(t); fssproc = FSSPROC(t); - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); fssproc->fss_timeleft = fss_quantum; t->t_pri = fssproc->fss_umdpri; ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri); @@ -1725,7 +2037,7 @@ fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp) fssproc->fss_uprilim = reqfssuprilim; fssproc->fss_upri = reqfssupri; fssproc->fss_nice = nice; - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); if ((fssproc->fss_flags & FSSKPRI) != 0) { thread_unlock(t); @@ -2180,6 +2492,7 @@ fss_tick(kthread_t *t) fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj); disp_lock_enter_high(&fsspset->fssps_displock); fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice]; + fssproj->fssp_tick_cnt++; fssproc->fss_ticks++; disp_lock_exit_high(&fsspset->fssps_displock); } @@ -2223,7 +2536,7 @@ fss_tick(kthread_t *t) } fssproc->fss_flags &= ~FSSRESTORE; - fss_newpri(fssproc); + fss_newpri(fssproc, B_TRUE); new_pri = fssproc->fss_umdpri; ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri); @@ -2262,7 +2575,7 @@ fss_tick(kthread_t *t) * queue so that it gets charged for the CPU time from its * quantum even before that quantum expires. */ - fss_newpri(fssproc); + fss_newpri(fssproc, B_FALSE); if (t->t_pri != fssproc->fss_umdpri) fss_change_priority(t, fssproc); diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 5ed9110251..63a08483f8 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -1049,6 +1049,8 @@ installctx( ctx->free_op = free; ctx->arg = arg; ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; t->t_ctx = ctx; } @@ -1120,9 +1122,12 @@ savectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); (ctx->save_op)(ctx->arg); + } + } } void @@ -1131,9 +1136,12 @@ restorectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); (ctx->restore_op)(ctx->arg); + } + } } void diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index 5013661588..8ef84d1322 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -116,7 +116,7 @@ int dtrace_destructive_disallow = 0; dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); size_t dtrace_difo_maxsize = (256 * 1024); -dtrace_optval_t dtrace_dof_maxsize = (256 * 1024); +dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024); size_t dtrace_global_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; @@ -171,6 +171,7 @@ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ +static int dtrace_getf; /* number of unpriv getf()s */ static void *dtrace_softstate; /* softstate pointer */ static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ @@ -267,17 +268,22 @@ dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ /* * DTrace Helper Tracing Variables + * + * These variables should be set dynamically to enable helper tracing. The + * only variables that should be set are dtrace_helptrace_enable (which should + * be set to a non-zero value to allocate helper tracing buffers on the next + * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a + * non-zero value to deallocate helper tracing buffers on the next close of + * /dev/dtrace). When (and only when) helper tracing is disabled, the + * buffer size may also be set via dtrace_helptrace_bufsize. */ -uint32_t dtrace_helptrace_next = 0; -uint32_t dtrace_helptrace_nlocals; -char *dtrace_helptrace_buffer; -int dtrace_helptrace_bufsize = 512 * 1024; - -#ifdef DEBUG -int dtrace_helptrace_enabled = 1; -#else -int dtrace_helptrace_enabled = 0; -#endif +int dtrace_helptrace_enable = 0; +int dtrace_helptrace_disable = 0; +int dtrace_helptrace_bufsize = 16 * 1024 * 1024; +uint32_t dtrace_helptrace_nlocals; +static dtrace_helptrace_t *dtrace_helptrace_buffer; +static uint32_t dtrace_helptrace_next = 0; +static int dtrace_helptrace_wrapped = 0; /* * DTrace Error Hashing @@ -373,8 +379,8 @@ static kmutex_t dtrace_errlock; * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ - ((testaddr) - (baseaddr) < (basesz) && \ - (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ + ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ + (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) /* @@ -475,6 +481,8 @@ static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, dtrace_optval_t); static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); +static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *); +static void dtrace_getf_barrier(void); /* * DTrace Probe Context Functions @@ -619,7 +627,7 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * up both thread-local variables and any global dynamically-allocated * variables. 
*/ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base, + if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, vstate->dtvs_dynvars.dtds_size)) { dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; uintptr_t base = (uintptr_t)dstate->dtds_base + @@ -686,6 +694,7 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; + file_t *fp; /* * If we hold the privilege to read from kernel memory, then @@ -703,10 +712,99 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, /* * We're allowed to read from our own string table. */ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, + if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, mstate->dtms_difo->dtdo_strlen)) return (1); + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state, mstate)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) + return (1); + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + return (1); + } + + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + return (1); + } + + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + return (1); + } + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vnode_t + * cannot themselves be stale (that is, it is impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory). 
+ */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) + return (1); + + if ((vp = fp->f_vnode) != NULL) { + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) + return (1); + + if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz, + vp->v_path, strlen(vp->v_path) + 1)) { + return (1); + } + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) + return (1); + + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + strlen(op->vnop_name) + 1)) { + return (1); + } + } + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; return (0); @@ -746,7 +844,7 @@ static int dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { - size_t sz; + size_t sz, strsize; ASSERT(type->dtdt_flags & DIF_TF_BYREF); /* @@ -756,11 +854,24 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) return (1); - if (type->dtdt_kind == DIF_TYPE_STRING) - sz = dtrace_strlen(src, - vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; - else + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + strsize = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. + */ + strsize = dtrace_strsize_default; + } + + sz = dtrace_strlen(src, strsize) + 1; + } else { sz = type->dtdt_size; + } return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); } @@ -1085,8 +1196,7 @@ dtrace_priv_proc_common_zone(dtrace_state_t *state) */ ASSERT(s_cr != NULL); - if ((cr = CRED()) != NULL && - s_cr->cr_zone == cr->cr_zone) + if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) return (1); return (0); @@ -1209,19 +1319,17 @@ dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, mode = pops->dtps_mode(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); - ASSERT((mode & DTRACE_MODE_USER) || - (mode & DTRACE_MODE_KERNEL)); - ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || - (mode & DTRACE_MODE_NOPRIV_DROP)); + ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL)); + ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT | + DTRACE_MODE_NOPRIV_DROP)); } /* * If the dte_cond bits indicate that this consumer is only allowed to - * see user-mode firings of this probe, call the provider's dtps_mode() - * entry point to check that the probe was fired while in a user - * context. If that's not the case, use the policy specified by the - * provider to determine if we drop the probe or merely restrict - * operation. + * see user-mode firings of this probe, check that the probe was fired + * while in a user context. If that's not the case, use the policy + * specified by the provider to determine if we drop the probe or + * merely restrict operation. */ if (ecb->dte_cond & DTRACE_COND_USERMODE) { ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); @@ -1288,6 +1396,15 @@ dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, } } + /* + * By merits of being in this code path at all, we have limited + * privileges. If the provider has indicated that limited privileges + * are to denote restricted operation, strip off the ability to access + * arguments. 
+ */ + if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT) + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + return (1); } @@ -2924,7 +3041,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, } case DIF_VAR_CURTHREAD: - if (!dtrace_priv_kernel(state)) + if (!dtrace_priv_proc(state, mstate)) return (0); return ((uint64_t)(uintptr_t)curthread); @@ -4452,11 +4569,35 @@ case DIF_SUBR_GETMAJOR: break; } + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + uf_info_t *finfo = &curthread->t_procp->p_user.u_finfo; + file_t *fp; + + if (!dtrace_priv_proc(state, mstate)) { + regs[rd] = NULL; + break; + } + + /* + * This is safe because fi_nfiles only increases, and the + * fi_list array is not freed when the array size doubles. + * (See the comment in flist_grow() for details on the + * management of the u_finfo structure.) + */ + fp = fd < finfo->fi_nfiles ? finfo->fi_list[fd].uf_file : NULL; + + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + break; + } + case DIF_SUBR_CLEANPATH: { char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; int i = 0, j = 0; + zone_t *z; if (!dtrace_strcanload(src, size, mstate, vstate)) { regs[rd] = NULL; @@ -4555,6 +4696,23 @@ next: } while (c != '\0'); dest[j] = '\0'; + + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. 
+ */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; @@ -4939,71 +5097,50 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: - if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 8, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); @@ -5940,6 +6077,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC; + mstate.dtms_getf = NULL; + *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { @@ -6736,7 +6875,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -7232,7 +7371,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ -7843,6 +7982,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -7857,8 +7997,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - 
&priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) + */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); @@ -8443,6 +8597,20 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, subr == DIF_SUBR_COPYOUTSTR) { dp->dtdo_destructive = 1; } + + if (subr == DIF_SUBR_GETF) { + /* + * If we have a getf() we need to record that + * in our state. Note that our state can be + * NULL if this is a helper -- but in that + * case, the call to getf() is itself illegal, + * and will be caught (slightly later) when + * the helper is validated. + */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + break; case DIF_OP_PUSHTR: if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) @@ -13085,6 +13253,22 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_WARMUP; + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } + /* * Now it's time to actually fire the BEGIN probe. We need to disable * interrupts here both to record the CPU on which we fired the BEGIN @@ -13201,6 +13385,24 @@ dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_STOPPED; dtrace_sync(); + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. + */ + ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0); + ASSERT(dtrace_closef == dtrace_getf_barrier); + ASSERT(dtrace_getf > 0); + + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--; + + if (--dtrace_getf == 0) + dtrace_closef = NULL; + } + return (0); } @@ -13507,10 +13709,10 @@ dtrace_helper_trace(dtrace_helper_action_t *helper, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where) { uint32_t size, next, nnext, i; - dtrace_helptrace_t *ent; + dtrace_helptrace_t *ent, *buffer; uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - if (!dtrace_helptrace_enabled) + if ((buffer = dtrace_helptrace_buffer) == NULL) return; ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals); @@ -13538,10 +13740,12 @@ dtrace_helper_trace(dtrace_helper_action_t *helper, /* * We have our slot; fill it in. 
*/ - if (nnext == size) + if (nnext == size) { + dtrace_helptrace_wrapped++; next = 0; + } - ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next]; + ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next); ent->dtht_helper = helper; ent->dtht_where = where; ent->dtht_nlocals = vstate->dtvs_nlocals; @@ -13575,7 +13779,7 @@ dtrace_helper(int which, dtrace_mstate_t *mstate, dtrace_helper_action_t *helper; dtrace_vstate_t *vstate; dtrace_difo_t *pred; - int i, trace = dtrace_helptrace_enabled; + int i, trace = dtrace_helptrace_buffer != NULL; ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS); @@ -14761,6 +14965,23 @@ dtrace_toxrange_add(uintptr_t base, uintptr_t limit) dtrace_toxranges++; } +static void +dtrace_getf_barrier() +{ + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. + */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); +} + /* * DTrace Driver Cookbook Functions */ @@ -14875,17 +15096,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) mutex_exit(&cpu_lock); /* - * If DTrace helper tracing is enabled, we need to allocate the - * trace buffer and initialize the values. - */ - if (dtrace_helptrace_enabled) { - ASSERT(dtrace_helptrace_buffer == NULL); - dtrace_helptrace_buffer = - kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); - dtrace_helptrace_next = 0; - } - - /* * If there are already providers, we must ask them to provide their * probes, and then match any anonymous enabling against them. Note * that there should be no other retained enablings at this time: @@ -14981,6 +15191,18 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) return (EBUSY); } + if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) { + /* + * If DTrace helper tracing is enabled, we need to allocate the + * trace buffer and initialize the values. + */ + dtrace_helptrace_buffer = + kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); + dtrace_helptrace_next = 0; + dtrace_helptrace_wrapped = 0; + dtrace_helptrace_enable = 0; + } + state = dtrace_state_create(devp, cred_p); mutex_exit(&cpu_lock); @@ -15002,6 +15224,7 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) { minor_t minor = getminor(dev); dtrace_state_t *state; + dtrace_helptrace_t *buf = NULL; if (minor == DTRACEMNRN_HELPER) return (0); @@ -15019,6 +15242,18 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state->dts_anon); } + if (dtrace_helptrace_disable) { + /* + * If we have been told to disable helper tracing, set the + * buffer to NULL before calling into dtrace_state_destroy(); + * we take advantage of its dtrace_sync() to know that no + * CPU is in probe context with enabled helper tracing + * after it returns. 
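+	 *
+	 * The buffer itself is freed further below, only after
+	 * dtrace_state_destroy() has returned and that guarantee is in
+	 * place.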
+ */ + buf = dtrace_helptrace_buffer; + dtrace_helptrace_buffer = NULL; + } + dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); @@ -15029,6 +15264,11 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + if (buf != NULL) { + kmem_free(buf, dtrace_helptrace_bufsize); + dtrace_helptrace_disable = 0; + } + mutex_exit(&dtrace_lock); mutex_exit(&cpu_lock); @@ -15917,12 +16157,10 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) dtrace_modload = NULL; dtrace_modunload = NULL; - mutex_exit(&cpu_lock); + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); - if (dtrace_helptrace_enabled) { - kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize); - dtrace_helptrace_buffer = NULL; - } + mutex_exit(&cpu_lock); kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *)); dtrace_probes = NULL; diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 242185071b..157acc25fc 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/sdt_impl.h> @@ -97,26 +98,26 @@ static dtrace_pattr_t iscsi_attr = { }; sdt_provider_t sdt_providers[] = { - { "vtrace", "__vtrace_", &vtrace_attr, 0 }, - { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 }, - { "vminfo", "__cpu_vminfo_", &info_attr, 0 }, - { "fpuinfo", "__fpuinfo_", &fpu_attr, 0 }, - { "sched", "__sched_", &stab_attr, 0 }, - { "proc", "__proc_", &stab_attr, 0 }, - { "io", "__io_", &stab_attr, 0 }, - { "ip", "__ip_", &stab_attr, 0 }, - { "tcp", "__tcp_", &stab_attr, 0 }, - { "udp", "__udp_", &stab_attr, 0 }, - { "mib", "__mib_", &stab_attr, 0 }, - { "fsinfo", "__fsinfo_", &fsinfo_attr, 0 }, - { "iscsi", "__iscsi_", &iscsi_attr, 0 }, - { "nfsv3", "__nfsv3_", &stab_attr, 0 }, - { "nfsv4", "__nfsv4_", &stab_attr, 0 }, - { "xpv", "__xpv_", &xpv_attr, 0 }, - { "fc", "__fc_", &fc_attr, 0 }, - { "srp", "__srp_", &fc_attr, 0 }, - { "sysevent", "__sysevent_", &stab_attr, 0 }, - { "sdt", NULL, &sdt_attr, 0 }, + { "vtrace", "__vtrace_", &vtrace_attr }, + { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, + { "vminfo", "__cpu_vminfo_", &info_attr, DTRACE_PRIV_USER }, + { "fpuinfo", "__fpuinfo_", &fpu_attr }, + { "sched", "__sched_", &stab_attr, DTRACE_PRIV_USER }, + { "proc", "__proc_", &stab_attr, DTRACE_PRIV_USER }, + { "io", "__io_", &stab_attr }, + { "ip", "__ip_", &stab_attr }, + { "tcp", "__tcp_", &stab_attr }, + { "udp", "__udp_", &stab_attr }, + { "mib", "__mib_", &stab_attr }, + { "fsinfo", "__fsinfo_", &fsinfo_attr }, + { "iscsi", "__iscsi_", &iscsi_attr }, + { "nfsv3", "__nfsv3_", &stab_attr }, + { "nfsv4", "__nfsv4_", &stab_attr }, + { "xpv", "__xpv_", &xpv_attr }, + { "fc", "__fc_", &fc_attr }, + { "srp", "__srp_", &fc_attr }, + { "sysevent", "__sysevent_", &stab_attr }, + { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1155,6 +1156,20 @@ sdt_argdesc_t sdt_args[] = { }; /*ARGSUSED*/ +int +sdt_mode(void *arg, dtrace_id_t id, void *parg) +{ + /* + * We tell DTrace that we're in kernel mode, that the firing needs to + * be dropped for anything that doesn't have necessary privileges, and + * that it needs to be restricted for anything that has restricted + * (i.e., not all-zone) privileges. 
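+ *
+ * The practical effect: a consumer with restricted (not all-zone)
+ * privileges can still enable these probes, with access such as
+ * args[] curtailed via DTRACE_MODE_LIMITEDPRIV_RESTRICT, whereas a
+ * consumer lacking the necessary privilege has the firing dropped
+ * outright.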
+ */ + return (DTRACE_MODE_KERNEL | DTRACE_MODE_NOPRIV_DROP | + DTRACE_MODE_LIMITEDPRIV_RESTRICT); +} + +/*ARGSUSED*/ void sdt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) { diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index b814175e8a..f29918e351 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -537,7 +538,7 @@ static struct sdev_vop_table vtab[] = SDEV_DYNAMIC | SDEV_VTOR }, { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, - devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, + devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index fb1d93d06b..89c5decbf0 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -1142,9 +1142,21 @@ sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp, struct sdev_node *parent = VTOSDEV(dvp); int error; - /* execute access is required to search the directory */ - if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) - return (error); + /* + * We must check that we have execute access to search the directory -- + * but because our sdev_contents lock is already held as a reader (the + * caller must have done a VOP_RWLOCK()), we call directly into the + * underlying access routine if sdev_attr is non-NULL. + */ + if (parent->sdev_attr != NULL) { + VERIFY(RW_READ_HELD(&parent->sdev_contents)); + + if (sdev_unlocked_access(parent, VEXEC, cred) != 0) + return (EACCES); + } else { + if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (error); + } ASSERT(parent); if (!SDEV_IS_GLOBAL(parent)) diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 89ce67dd68..09e5559701 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ /* vnode ops for the /dev/zvol directory */ @@ -47,6 +48,7 @@ static ldi_ident_t devzvol_li; static ldi_handle_t devzvol_lh; static kmutex_t devzvol_mtx; static boolean_t devzvol_isopen; +static major_t devzvol_major; /* * we need to use ddi_mod* since fs/dev gets loaded early on in @@ -61,12 +63,16 @@ int (*szn2m)(char *, minor_t *); int sdev_zvol_create_minor(char *dsname) { + if (szcm == NULL) + return (-1); return ((*szcm)(dsname)); } int sdev_zvol_name2minor(char *dsname, minor_t *minor) { + if (szn2m == NULL) + return (-1); return ((*szn2m)(dsname, minor)); } @@ -74,6 +80,7 @@ int devzvol_open_zfs() { int rc; + dev_t dv; devzvol_li = ldi_ident_from_anon(); if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred, @@ -94,6 +101,9 @@ devzvol_open_zfs() cmn_err(CE_WARN, "couldn't resolve zvol_name2minor"); return (rc); } + if (ldi_get_dev(devzvol_lh, &dv)) + return (-1); + devzvol_major = getmajor(dv); return (0); } @@ -270,6 +280,8 @@ devzvol_validate(struct sdev_node *dv) sdcmn_err13((" v_type %d do_type %d", SDEVTOV(dv)->v_type, do_type)); if ((SDEVTOV(dv)->v_type == VLNK && do_type != DMU_OST_ZVOL) || + ((SDEVTOV(dv)->v_type == VBLK || SDEVTOV(dv)->v_type == VCHR) && + do_type != DMU_OST_ZVOL) || (SDEVTOV(dv)->v_type == VDIR && do_type == DMU_OST_ZVOL)) { kmem_free(dsname, strlen(dsname) + 1); return (SDEV_VTOR_STALE); @@ -486,6 +498,82 @@ devzvol_prunedir(struct sdev_node *ddv) rw_downgrade(&ddv->sdev_contents); } +/* + * This function is used to create a dir or dev inside a zone's /dev when the + * zone has a zvol that is dynamically created within the zone (i.e. inside + * of a delegated dataset. Since there is no /devices tree within a zone, + * we create the chr/blk devices directly inside the zone's /dev instead of + * making symlinks. 
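+ *
+ * For example (dataset names here are hypothetical): if a zone with
+ * the delegated dataset "tank/delegated" runs
+ * "zfs create -V 1g tank/delegated/vol0", the nodes
+ * /dev/zvol/dsk/tank/delegated/vol0 and
+ * /dev/zvol/rdsk/tank/delegated/vol0 are created in the zone's /dev
+ * as real block and character devices respectively.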
+ */ +static int +devzvol_mk_ngz_node(struct sdev_node *parent, char *nm) +{ + struct vattr vattr; + timestruc_t now; + enum vtype expected_type = VDIR; + dmu_objset_type_t do_type; + struct sdev_node *dv = NULL; + int res; + char *dsname; + + bzero(&vattr, sizeof (vattr)); + gethrestime(&now); + vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID; + vattr.va_uid = SDEV_UID_DEFAULT; + vattr.va_gid = SDEV_GID_DEFAULT; + vattr.va_type = VNON; + vattr.va_atime = now; + vattr.va_mtime = now; + vattr.va_ctime = now; + + if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL) + return (ENOENT); + + if (devzvol_objset_check(dsname, &do_type) != 0) { + kmem_free(dsname, strlen(dsname) + 1); + return (ENOENT); + } + if (do_type == DMU_OST_ZVOL) + expected_type = VBLK; + + if (expected_type == VDIR) { + vattr.va_type = VDIR; + vattr.va_mode = SDEV_DIRMODE_DEFAULT; + } else { + minor_t minor; + dev_t devnum; + int rc; + + rc = sdev_zvol_create_minor(dsname); + if ((rc != 0 && rc != EEXIST && rc != EBUSY) || + sdev_zvol_name2minor(dsname, &minor)) { + kmem_free(dsname, strlen(dsname) + 1); + return (ENOENT); + } + + devnum = makedevice(devzvol_major, minor); + vattr.va_rdev = devnum; + + if (strstr(parent->sdev_path, "/rdsk/") != NULL) + vattr.va_type = VCHR; + else + vattr.va_type = VBLK; + vattr.va_mode = SDEV_DEVMODE_DEFAULT; + } + kmem_free(dsname, strlen(dsname) + 1); + + rw_enter(&parent->sdev_contents, RW_WRITER); + + res = sdev_mknode(parent, nm, &dv, &vattr, + NULL, NULL, kcred, SDEV_READY); + rw_exit(&parent->sdev_contents); + if (res != 0) + return (ENOENT); + + SDEV_RELE(dv); + return (0); +} + /*ARGSUSED*/ static int devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, @@ -505,9 +593,39 @@ devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, return (error); rw_enter(&parent->sdev_contents, RW_READER); - if (!SDEV_IS_GLOBAL(parent)) { + if (SDEV_IS_GLOBAL(parent)) { + /* + * During iter_datasets, don't create GZ dev when running in + * NGZ. We can't return ENOENT here since that could + * incorrectly trigger the creation of the dev from the + * recursive call through prof_filldir during iter_datasets. + */ + if (getzoneid() != GLOBAL_ZONEID) { + rw_exit(&parent->sdev_contents); + return (EPERM); + } + } else { + int res; + rw_exit(&parent->sdev_contents); - return (prof_lookup(dvp, nm, vpp, cred)); + res = prof_lookup(dvp, nm, vpp, cred); + + /* + * We won't find a zvol that was dynamically created inside + * a NGZ, within a delegated dataset, in the zone's dev profile + * but prof_lookup will also find it via sdev_cache_lookup. + */ + if (res == ENOENT) { + /* + * We have to create the sdev node for the dymamically + * created zvol. + */ + if (devzvol_mk_ngz_node(parent, nm) != 0) + return (ENOENT); + res = prof_lookup(dvp, nm, vpp, cred); + } + + return (res); } dsname = devzvol_make_dsname(parent->sdev_path, nm); @@ -613,8 +731,10 @@ sdev_iter_datasets(struct vnode *dvp, int arg, char *name) } else if (rc == ENOENT) { goto skip; } else { - /* EBUSY == problem with zvols's dmu holds? */ - ASSERT(0); + /* + * EBUSY == problem with zvols's dmu holds? + * EPERM when in a NGZ and traversing up and out. 
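+		 * The EPERM case is the one devzvol_lookup() returns when
+		 * a non-global zone reaches a global-zone directory through
+		 * the recursive prof_filldir call; both cases are non-fatal
+		 * and are simply skipped.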
+ */ goto skip; } if (arg == ZFS_IOC_DATASET_LIST_NEXT && diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..16068e35ee --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. 
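+ * The bucket and mutex are recomputed from the hld_hash value saved
+ * at insertion time, so removal does not need to rehash the name.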
*/ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. 
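+	 * The third argument asks hyprlofs_hash_lookup() to take a hold
+	 * on the existing hlnode so that the EEXIST path below can hand
+	 * it back to the caller (or release it if the caller did not ask
+	 * for it).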
*/ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + hyprlofs_memfree(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' 
entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = hyprlofs_memalloc(sizeof (hldirent_t) + 2, HL_MUSTHAVE); + dotdot = hyprlofs_memalloc(sizeof (hldirent_t) + 3, HL_MUSTHAVE); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + hyprlofs_memfree(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = hyprlofs_memalloc(alloc_size, 0); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. 
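+	 * That is, hld_name points just past the hldirent_t within the
+	 * same hyprlofs_memalloc() block, so a single hyprlofs_memfree()
+	 * of (sizeof (hldirent_t) + namelen) later releases both the
+	 * entry and its name.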
*/ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. + */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. 
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..bf71b2bfcb --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
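+	 * The (hln_nodeid, hln_gen) pair is what hyprlofs_vget() later
+	 * compares against the hlfid_t presented by a client, so a file
+	 * handle naming a since-recycled hlnode is rejected rather than
+	 * resolved to the wrong file.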
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} + +/* + * Allocate zeroed memory if hyprlofs_maxkmem has not been exceeded or the + * 'musthave' flag is set. 'musthave' allocations should always be subordinate + * to normal allocations so that hyprlofs_maxkmem can't be exceeded by more + * than a few KB. E.g. when creating a new dir, the hlnode is a normal + * allocation; if that succeeds, the dirents for "." and ".." are 'musthave' + * allocations. + */ +void * +hyprlofs_memalloc(size_t size, int musthave) +{ + if (atomic_add_long_nv(&hyprlofs_kmemspace, size) < hyprlofs_maxkmem || + musthave) + return (kmem_zalloc(size, KM_SLEEP)); + + atomic_add_long(&hyprlofs_kmemspace, -size); + cmn_err(CE_WARN, "hyprlofs over memory limit"); + return (NULL); +} + +void +hyprlofs_memfree(void *cp, size_t size) +{ + kmem_free(cp, size); + atomic_add_long(&hyprlofs_kmemspace, -size); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..e8af803529 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,626 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. 
Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. + */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
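+ * (It is an ordinary patchable global, so it can also be set from
+ * /etc/system, e.g. "set hyprlofs:hyprlofs_minfree = 0x2000" for a
+ * hypothetical 8192-page floor.)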
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + /* + * The maximum amount of space hyprlofs can allocate is + * HYPRLOFSMAXPROCKMEM percent of kernel memory + */ + if (hyprlofs_maxkmem == 0) + hyprlofs_maxkmem = + MAX(PAGESIZE, kmem_maxavail() / HYPRLOFSMAXFRACKMEM); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = hyprlofs_memalloc(sizeof (hlfsmount_t), 0)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. + */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = hyprlofs_memalloc(dpn.pn_pathlen + 1, HL_MUSTHAVE); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = hyprlofs_memalloc(sizeof (hlnode_t), HL_MUSTHAVE); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. 
+ */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. 
+ */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + hyprlofs_memfree(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + hyprlofs_memfree(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
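+	 * It simply divides the remaining hyprlofs_maxkmem budget by the
+	 * fixed cost of one hlnode plus one hldirent, ignoring the
+	 * variable-length names co-allocated with each hldirent.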
+ */ + if (hyprlofs_maxkmem > hyprlofs_kmemspace) + sbp->f_ffree = (hyprlofs_maxkmem - hyprlofs_kmemspace) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + else + sbp->f_ffree = 0; + + sbp->f_files = hyprlofs_maxkmem / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..b382210334 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1412 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +/*ARGSUSED2*/ +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + 
mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || (pnm[0] == '.' && pnm[1] == '.')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == NULL) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + hyprlofs_memfree(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, 
uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + ASSERT(VTOHLN(vp)->hln_looped == 1); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + 
NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..2cd4813e43 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,516 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? 
*/ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + 
(pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. + */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..c1d6a85d99 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3079 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * The aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) 
Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. + */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
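+ * + * The lxpr_sigmap[] table below encodes this mapping: it is indexed by the + * native signal number and each entry holds the corresponding LX_SIG* + * constant, or -1 for native signals (such as SIGWAITING) that have no + * Linux representation. For example, native SIGBUS (10) is presented to + * Linux readers as signal 7, and native SIGUSR1 (16) as signal 10.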
+ */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + LX_SIGRTMIN, /* 41: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX, +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file, only allow regular files; + * reject the open for anything else. Just do it if we are opening + * the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = ldi_open_by_name("/dev/log", FREAD, cr, + &lxpnp->lxpr_cons_ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages.
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(lxpnp->lxpr_cons_ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + int err; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + if (type == LXPR_KMSG) { + if ((err = ldi_close(lxpr->lxpr_cons_ldih, 0, cr)) != 0) + return (err); + } + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_kmsg, /* /proc/kmsg */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. 
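+ * + * Each read is dispatched through the lxpr_read_function[] table above, + * indexed by the node type of the vnode being read; the handler renders + * its output into an lxpr_uiobuf_t, which is then flushed back to the + * caller.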
+ */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type >= 0 && type < LXPR_NFILES); + + lxpr_read_function[type](lxpnp, uiobuf); + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig < LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
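+ * + * If such data is ever added, each interface would presumably get one + * row matching the header printed above: the interface name followed by + * the eight receive counters and the eight transmit counters.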
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf) +{ + ldi_handle_t lh = lxpnp->lxpr_cons_ldih; + mblk_t *mp; + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
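+ * + * The line we emit mimics Linux /proc/loadavg: the three load averages, + * a running/total task count, and a trailing 0 in place of what Linux + * reports as the most recently allocated pid.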
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + uint_t cpu_nrunnable_cum = 0; + uint_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %ld %ld %ld %ld %ld %ld %ld\n", + user_cum, 0, sys_cum, idle_cum, 0, irq_cum, 0); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %ld %ld %ld %ld %ld %ld %ld\n", + cp->cpu_id, user_ticks, 0, sys_ticks, idle_ticks, + 0, irq_ticks, 0); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + /* + * got the fd data so now done with this proc + */ + lxpr_unlock(p); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are doing pid lookups.
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + ASSERT(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size 
of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
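Both lxpr_readdir_common() and lxpr_readdir_procdir() above rely on the convention that every directory entry, whatever its actual record length, advances uio_offset by one fixed LXPR_SDSIZE (16-byte) slot: slots 0 and 1 name "." and "..", the next PROCDIRFILES slots are the fixed /proc files, and the remaining slots index the process table. A small illustrative helper (not part of the source) showing the slot arithmetic:

	#define	LXPR_SDSIZE	16	/* mirrors the header definition */

	/*
	 * Map a /proc directory offset to a process-table index, or return -1
	 * if the offset names ".", ".." or one of the fixed entries.
	 */
	static int
	procdir_offset_to_slot(off_t uoffset, int procdirfiles)
	{
		int slot;

		if (uoffset % LXPR_SDSIZE != 0)
			return (-1);		/* not a valid entry offset */
		slot = (uoffset / LXPR_SDSIZE) - 2;
		if (slot < procdirfiles)
			return (-1);		/* ".", ".." or a fixed file */
		return (slot - procdirfiles);	/* index into the proc table */
	}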
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + fddirsize = 0; + } else { + fddirsize = fip->fi_nfiles; + } + + mutex_enter(&fip->fi_lock); + lxpr_unlock(p); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
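lxpr_readlink() above resolves /proc/self by formatting the caller's pid (mapped to 1 for the zone's init process) and resolves entries backed by a realvp through vnodetopath(). Assuming an lxproc instance is mounted at /proc inside the zone, the behaviour can be exercised from userland with an ordinary readlink(2); this small program is purely a usage sketch:

	#include <stdio.h>
	#include <unistd.h>
	#include <limits.h>

	int
	main(void)
	{
		char buf[PATH_MAX];
		ssize_t n;

		/* the link target is the caller's Linux-visible pid */
		n = readlink("/proc/self", buf, sizeof (buf) - 1);
		if (n < 0) {
			perror("readlink");
			return (1);
		}
		buf[n] = '\0';
		(void) printf("self -> %s\n", buf);
		return (0);
	}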
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..a06bef1570 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG_WORDS 2 +#define LX_NBPW 32 +#define LX_NSIG ((LX_NBPW * LX_NSIG_WORDS) + 1) + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG - 1 + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
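The conversion macros defined above (VTOLXPM, VTOLXP, LXPTOV, LXPTOZ) tie an lxproc vnode to its private lxpr_node_t, its mount and the zone that mount serves. A hedged sketch of how they compose inside a vnode operation; the helper name is illustrative only:

	/*
	 * Given an lxproc vnode, return the id of the zone this file system
	 * instance was mounted for.
	 */
	static zoneid_t
	lxpr_vp_zoneid(vnode_t *vp)
	{
		lxpr_node_t *lxpnp = VTOLXP(vp);	/* from v_data */
		lxpr_mnt_t *lxprm = VTOLXPM(vp);	/* from vfs_data */

		ASSERT(LXPTOV(lxpnp) == vp);		/* macros invert */
		ASSERT(LXPTOZ(lxpnp) == lxprm->lxprm_zone);
		return (lxprm->lxprm_zone->zone_id);
	}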
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ + ldi_handle_t lxpr_cons_ldih; /* ldi handle for console device */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef 
struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index d31b53d2e9..4e5882ad7c 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -28,6 +28,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -2298,6 +2302,12 @@ top: vattr.va_mask = AT_SIZE; error = nfs3setattr(vp, &vattr, 0, cr); + + /* + * Existing file was truncated; + * emit a create event. + */ + vnevent_create(vp, ct); } } } @@ -2306,12 +2316,9 @@ top: if (error) { VN_RELE(vp); } else { - /* - * existing file got truncated, notify. - */ - vnevent_create(vp, ct); *vpp = vp; } + return (error); } diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 5ae2c28d53..f05a0717d9 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -31,6 +31,10 @@ * All Rights Reserved */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -6653,16 +6657,20 @@ top: } else { vnode_t *tvp; rnode4_t *trp; - /* - * existing file got truncated, notify. - */ tvp = vp; if (vp->v_type == VREG) { trp = VTOR4(vp); if (IS_SHADOW(vp, trp)) tvp = RTOV4(trp); } - vnevent_create(tvp, ct); + + if (must_trunc) { + /* + * existing file got truncated, notify. + */ + vnevent_create(tvp, ct); + } + *vpp = vp; } return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index a0abad0700..22d1ad4d68 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright (c) 2012 Joyent, Inc. All rights reserved. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ @@ -2521,6 +2522,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index a3f43a4e95..fa31e3693f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -25,6 +25,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
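The lxpr_uiobuf_*() helpers declared in lxproc.h above wrap a uio_t so that per-file read handlers can format output with printf-style calls and record errors without touching the uio directly. A hedged sketch of how a handler would typically consume that interface; the handler and its output are illustrative and not part of this change:

	static void
	lxpr_read_example(lxpr_uiobuf_t *uiobuf)
	{
		/* format directly into the uio-backed buffer */
		lxpr_uiobuf_printf(uiobuf, "%d.%02d %d.%02d %d.%02d\n",
		    0, 15, 0, 10, 0, 5);

		/* a handler that hits a failure would instead record it: */
		/* lxpr_uiobuf_seterr(uiobuf, EIO); */
	}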
+ */ + #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> @@ -2030,6 +2034,14 @@ nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, vp->v_type == VREG) { vattr.va_mask = AT_SIZE; error = nfssetattr(vp, &vattr, 0, cr); + + if (!error) { + /* + * Existing file was truncated; + * emit a create event. + */ + vnevent_create(vp, ct); + } } } } @@ -2037,10 +2049,6 @@ nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, if (error) { VN_RELE(vp); } else { - /* - * existing file got truncated, notify. - */ - vnevent_create(vp, ct); *vpp = vp; } return (error); diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index 2852a98f52..48792394a5 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -23,6 +23,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ /* * File Events Notification @@ -1965,7 +1968,9 @@ port_fop(vnode_t *vp, int op, int retval) if (op & FOP_ATTRIB_MASK) { event |= FILE_ATTRIB; } - + if (op & FOP_TRUNC_MASK) { + event |= FILE_TRUNC; + } if (event) { port_fop_sendevent(vp, event, NULL, NULL); } @@ -2147,6 +2152,9 @@ port_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, int events = 0; retval = vnext_setattr(vf, vap, flags, cr, ct); + if (vap->va_mask & AT_SIZE) { + events |= FOP_FILE_TRUNC; + } if (vap->va_mask & (AT_SIZE|AT_MTIME)) { events |= FOP_FILE_SETATTR_MTIME; } @@ -2322,8 +2330,8 @@ port_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, port_fop_sendevent(vp, FILE_DELETE, dvp, name); break; case VE_CREATE: - port_fop_sendevent(vp, FILE_MODIFIED|FILE_ATTRIB, - NULL, NULL); + port_fop_sendevent(vp, + FILE_MODIFIED|FILE_ATTRIB|FILE_TRUNC, NULL, NULL); break; case VE_LINK: port_fop_sendevent(vp, FILE_ATTRIB, NULL, NULL); diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 55a48bb2cc..53709139cc 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/uio.h> #include <sys/param.h> @@ -935,7 +939,7 @@ pr_control32(int32_t cmd, arg32_t *argp, prnode_t *pnp, cred_t *cr) case PCREAD: /* read from the address space */ case PCWRITE: /* write to the address space */ - if (PROCESS_NOT_32BIT(p)) + if (PROCESS_NOT_32BIT(p) || (pnp->pr_flags & PR_OFFMAX)) error = EOVERFLOW; else { enum uio_rw rw = (cmd == PCREAD)? UIO_READ : UIO_WRITE; diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 1294421f9f..ce925778f2 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -183,6 +187,7 @@ typedef struct prnode { #define PR_INVAL 0x01 /* vnode is invalidated */ #define PR_ISSELF 0x02 /* vnode is a self-open */ #define PR_AOUT 0x04 /* vnode is for an a.out path */ +#define PR_OFFMAX 0x08 /* vnode is a large file open */ /* * Conversion macros. 
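The port_fop changes above introduce a FILE_TRUNC file event, raised when a setattr changes AT_SIZE and as part of the VE_CREATE notification. Assuming the new flag is exported to applications through <port.h>, a minimal event-port consumer watching for truncation could look like the following; this is a usage sketch, not code from this change:

	#include <port.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(int argc, char **argv)
	{
		struct file_obj fobj;
		port_event_t pe;
		int port;

		if (argc != 2) {
			(void) fprintf(stderr, "usage: %s file\n", argv[0]);
			return (1);
		}

		if ((port = port_create()) < 0) {
			perror("port_create");
			return (1);
		}

		(void) memset(&fobj, 0, sizeof (fobj));
		fobj.fo_name = argv[1];

		if (port_associate(port, PORT_SOURCE_FILE, (uintptr_t)&fobj,
		    FILE_MODIFIED | FILE_ATTRIB | FILE_TRUNC, NULL) != 0) {
			perror("port_associate");
			return (1);
		}

		if (port_get(port, &pe, NULL) == 0 &&
		    (pe.portev_events & FILE_TRUNC) != 0)
			(void) printf("%s was truncated\n", argv[1]);

		return (0);
	}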
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index a3e95a60fc..7831c1f9ea 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -23,6 +23,10 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -337,6 +341,15 @@ propen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) } /* + * If this is a large file open, indicate that in our flags -- some + * procfs structures are not off_t-neutral (e.g., priovec_t), and + * the open will need to be differentiated where 32-bit processes + * pass these structures across the user/kernel boundary. + */ + if (flag & FOFFMAX) + pnp->pr_flags |= PR_OFFMAX; + + /* * Do file-specific things. */ switch (type) { diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. */ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..f22cc3ecf0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -76,7 +77,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -249,7 +250,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -286,6 +287,21 @@ tmp_mount( (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. + */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { pn_free(&dpn); error = ENOMEM; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 61d72a4015..461016aa52 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/param.h> #include <sys/t_lock.h> @@ -978,6 +982,8 @@ again: } if (error == 0) { /* name found */ + boolean_t trunc = B_FALSE; + ASSERT(oldtp); rw_enter(&oldtp->tn_rwlock, RW_WRITER); @@ -1005,6 +1011,7 @@ again: rw_enter(&oldtp->tn_contents, RW_WRITER); (void) tmpnode_trunc(tm, oldtp, 0); rw_exit(&oldtp->tn_contents); + trunc = B_TRUE; } rw_exit(&oldtp->tn_rwlock); if (IS_DEVVP(*vpp)) { @@ -1019,9 +1026,9 @@ again: *vpp = newvp; } - if (error == 0) { + if (trunc) vnevent_create(*vpp, ct); - } + return (0); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 83c53d859d..8d5c741428 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -1129,6 +1130,7 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, struct pathname pn, rpn; vsk_anchor_t *vskap; char fstname[FSTYPSZ]; + zone_t *zone; /* * The v_flag value for the mount point vp is permanently set @@ -1590,9 +1592,24 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, } /* - * Serialize with zone creations. + * Serialize with zone state transitions. + * See vfs_list_add; zone mounted into is: + * zone_find_by_path(refstr_value(vfsp->vfs_mntpt)) + * not the zone doing the mount (curproc->p_zone), but if we're already + * inside a NGZ, then we know what zone we are. */ - mount_in_progress(); + if (INGLOBALZONE(curproc)) { + zone = zone_find_by_path(mountpt); + ASSERT(zone != NULL); + } else { + zone = curproc->p_zone; + /* + * zone_find_by_path does a hold, so do one here too so that + * we can do a zone_rele after mount_completed. + */ + zone_hold(zone); + } + mount_in_progress(zone); /* * Instantiate (or reinstantiate) the file system. If appropriate, * splice it into the file system name space. @@ -1761,7 +1778,8 @@ domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp, vfs_unlock(vfsp); } - mount_completed(); + mount_completed(zone); + zone_rele(zone); if (splice) vn_vfsunlock(vp); @@ -3881,6 +3899,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 382369c7fc..67f21866ec 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -65,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -199,6 +201,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
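The VOP_LATENCY_* constants just defined feed the per-zone VFS kstats updated in the fop_read() and fop_write() hunks below, which maintain a cumulative latency histogram: an operation that took 2.5 seconds increments the 10ms, 100ms and 1s counters but not the 10s one. A stand-alone sketch of the same bucketing rule, with illustrative names rather than the kernel's:

	#include <stdint.h>

	#define	LAT_10MS	10000000LL		/* ns */
	#define	LAT_100MS	100000000LL
	#define	LAT_1S		1000000000LL
	#define	LAT_10S		10000000000LL

	typedef struct lat_buckets {
		uint64_t	over_10ms;
		uint64_t	over_100ms;
		uint64_t	over_1s;
		uint64_t	over_10s;
	} lat_buckets_t;

	/* Record one operation; every threshold the latency met is bumped. */
	static void
	lat_record(lat_buckets_t *b, int64_t lat_ns)
	{
		if (lat_ns >= LAT_10MS)
			b->over_10ms++;
		if (lat_ns >= LAT_100MS)
			b->over_100ms++;
		if (lat_ns >= LAT_1S)
			b->over_1s++;
		if (lat_ns >= LAT_10S)
			b->over_10s++;
	}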
@@ -3220,14 +3227,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3239,14 +3289,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. 
+ */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5b0e464ac5..d8e9f26bdb 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -125,6 +126,7 @@ #include <sys/refcount.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -2146,6 +2148,16 @@ arc_reclaim_needed(void) if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + if (availrmem <= pages_pp_maximum) + return (1); + #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -3059,6 +3071,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_WAIT) return (zio_wait(rzio)); @@ -3637,9 +3657,6 @@ arc_init(void) if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; - if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) - arc_c_min = arc_meta_limit / 2; - if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ac9e3b28f6..e8bf55c321 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2721,7 +2721,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, dbuf_write_nofill_done, db, diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index fa49735c87..e76074bf8d 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ @@ -964,6 +965,7 @@ xuio_stat_wbuf_nocopy() } #ifdef _KERNEL + int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { @@ -1576,7 +1578,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (wp & WP_NOFILL) { ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; + checksum = ZIO_CHECKSUM_NOPARITY; compress = ZIO_COMPRESS_OFF; dedup = B_FALSE; } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index e44786f163..a9308b0c08 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -38,11 +38,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -222,6 +222,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index df3f02b1df..e7e11dc296 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -37,6 +37,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); @@ -833,7 +834,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); + txg_delay(dd->dd_pool, tx->tx_txg, + zfs_zone_txg_delay()); err = ERESTART; } dsl_pool_memory_pressure(dd->dd_pool); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 8ab6655b6f..02ce0d15c3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -40,6 +40,7 @@ 
#include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> @@ -610,11 +611,11 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) /* * If this transaction group is over 7/8ths capacity, delay - * the caller 1 clock tick. This will slow down the "fill" - * rate until the sync process can catch up with us. + * the caller some number of clock ticks. This will slow down the + * "fill" rate until the sync process can catch up with us. */ if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); + txg_delay(dp, tx->tx_txg, zfs_zone_txg_delay()); return (0); } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h index b748571ea0..ffca0a7dcb 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VDEV_DISK_H #define _SYS_VDEV_DISK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/vdev.h> #ifdef _KERNEL #include <sys/buf.h> @@ -40,14 +39,22 @@ extern "C" { #endif +#ifdef _KERNEL typedef struct vdev_disk { ddi_devid_t vd_devid; char *vd_minor; ldi_handle_t vd_lh; } vdev_disk_t; +#endif +extern int vdev_disk_physio(vdev_t *, caddr_t, size_t, uint64_t, int); + +/* + * Since vdev_disk.c is not compiled into libzpool, this function should only be + * defined in the zfs kernel module. + */ #ifdef _KERNEL -extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); #endif #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 95b8f9bdaf..e4c02bde1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -104,6 +104,7 @@ struct vdev_queue { avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; + zoneid_t vq_last_zone_id; uint64_t vq_io_complete_ts; uint64_t vq_io_delta_ts; kmutex_t vq_lock; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..496b718bd6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/vdev.h> +#include <sys/semaphore.h> +#include <sys/buf.h> +#ifdef _KERNEL +#include <sys/ddi.h> +#include <sys/sunldi.h> +#include <sys/sunddi.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +extern int vdev_raidz_physio(vdev_t *, + caddr_t, size_t, uint64_t, uint64_t, boolean_t); +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..069ec004f3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern int zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 27ebe5e659..9c718f691a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011 Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -78,6 +79,8 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_SHA256_MAC, + ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_FUNCTIONS }; @@ -430,6 +433,9 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ + hrtime_t io_start; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 91a639a648..17beaea3ad 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * Pool-wide transaction groups. @@ -412,6 +413,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 655728ccde..dfadeca9d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -21,9 +21,11 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -362,8 +364,25 @@ vdev_disk_close(vdev_t *vd) } int -vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, - uint64_t offset, int flags) +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +} + +int +vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, + size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; @@ -516,6 +535,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); @@ -527,6 +548,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an @@ -611,7 +634,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, l, 0); - if (vdev_disk_physio(vd_lh, (caddr_t)label, + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 2b06040c51..8dec283fee 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* @@ -31,6 +32,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/avl.h> +#include <sys/zfs_zone.h> /* * These tunables are for performance analysis. @@ -124,6 +126,8 @@ vdev_queue_init(vdev_t *vd) avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + + vq->vq_last_zone_id = 0; } void @@ -143,6 +147,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { avl_add(&vq->vq_deadline_tree, zio); + zfs_zone_zio_enqueue(zio); avl_add(zio->io_vdev_tree, zio); } @@ -150,6 +155,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { avl_remove(&vq->vq_deadline_tree, zio); + zfs_zone_zio_dequeue(zio); avl_remove(zio->io_vdev_tree, zio); } @@ -192,7 +198,11 @@ again: avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); +#ifdef _KERNEL + fio = lio = zfs_zone_schedule(vq); +#else fio = lio = avl_first(&vq->vq_deadline_tree); +#endif t = fio->io_vdev_tree; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index efae534257..49e8610542 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -22,11 +22,15 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_file.h> +#include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> @@ -153,6 +157,8 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. 
*/ @@ -432,12 +438,12 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { }; static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + uint64_t b = offset >> unit_shift; + uint64_t s = size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; @@ -507,7 +513,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + @@ -536,7 +542,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -548,8 +554,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -959,12 +963,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | - * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | - * | 0 1 0 0 0 0 0 0 | - * (V|I)' = | 0 0 1 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 | + * (V|I)' = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | @@ -1495,6 +1496,152 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +/* + * Handle a read or write I/O to a RAID-Z dump device. + * + * The dump device is in a unique situation compared to other ZFS datasets: + * writing to this device should be as simple and fast as possible. In + * addition, durability matters much less since the dump will be extracted + * once the machine reboots. For that reason, this function eschews parity for + * performance and simplicity. The dump device uses the checksum setting + * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this + * dataset. + * + * Blocks of size 128 KB have been preallocated for this volume. I/Os less than + * 128 KB will not fill an entire block; in addition, they may not be properly + * aligned. In that case, this function uses the preallocated 128 KB block and + * omits reading or writing any "empty" portions of that block, as opposed to + * allocating a fresh appropriately-sized block. + * + * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: + * + * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) + * + * If this were a standard RAID-Z dataset, a block of at least 40 KB would be + * allocated which spans all five child vdevs. 8 KB of data would be written to + * each of four vdevs, with the fifth containing the parity bits. 
+ * + * parity data data data data + * | PP | XX | XX | XX | XX | + * ^ ^ ^ ^ ^ + * | | | | | + * 8 KB parity ------8 KB data blocks------ + * + * However, when writing to the dump device, the layout is different: + * + * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) + * + * Unlike the normal RAID-Z case in which the block is allocated based on the + * I/O size, reads and writes here always use a 128 KB logical I/O size. If the I/O size is + * less than 128 KB, only the actual portions of data are written. In this + * example the data is written to the third data vdev since that vdev contains + * the offset [64 KB, 96 KB). + * + * parity data data data data + * | | | | XX | | + * ^ + * | + * 32 KB data block + * + * As a result, an individual I/O may not span all child vdevs; moreover, a + * small I/O may only operate on a single child vdev. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above + * would look like: + * + * parity parity parity data data data data + * | | | | | | XX | | + * ^ + * | + * 32 KB data block + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? B_READ : B_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); + + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, that is, the offset of the extent which + * contains the requisite offset of the data being read or written. + * + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128 + * KB size. + */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z map, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data spans this column, issue the + * appropriate operation to the vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed.
+ */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1530,9 +1677,13 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1663,6 +1814,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 723d516552..11120c7c4b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -23,8 +23,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -606,9 +606,11 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: + case ZFS_PROP_COMPRESSION: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
*/ if (!INGLOBALZONE(curproc)) return (EPERM); @@ -1936,7 +1938,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -1954,7 +1957,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -1981,13 +1985,25 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os = NULL; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) return (error); - error = zfs_ioc_objset_stats_impl(zc, os); + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); return (error); @@ -2201,8 +2217,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + top: if (zc->zc_cookie == 0) (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, @@ -2251,8 +2280,10 @@ top: objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -2954,6 +2985,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -2997,8 +3029,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3007,13 +3040,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git 
a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 3278a77041..c7bfbbaec4 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1937,6 +1938,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index c5d8ad7f45..bbbd91f46d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -26,6 +26,10 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -4146,6 +4150,8 @@ top: &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4656,27 +4662,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, return (0); } -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. - */ /* ARGSUSED */ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, @@ -4688,10 +4673,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..08f4f38e04 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +int +zfs_zone_txg_delay() +{ + return (1); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* amount to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* longest possible delay */ + +hrtime_t zfs_zone_last_checked = 0; + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the threshold + * at which the throttle will start delaying zones. The calculation should + * correspond closely with the %b column from iostat. + */ +uint_t zfs_zone_util_threshold = 80; + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. 
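The cycle bookkeeping described above (implemented by compute_historical_zone_cnt() further down in this file) can be exercised in isolation. The following is a rough user-land sketch, not part of the patch: the struct, names, and the hand-fed microsecond clock are illustrative; only the 2-second cycle, the halve-per-idle-cycle decay, and the 5-cycle reset are taken from the code.

#include <stdint.h>
#include <stdio.h>

#define CYCLE_USEC	2000000ULL	/* matches zfs_zone_cycle_time: 2 s */

typedef struct {
	uint64_t cycle_start;	/* start of the current cycle, usecs */
	uint64_t cycle_cnt;	/* ops observed in the current cycle */
	uint64_t avg_cnt;	/* decayed historical count */
} zio_cntr_t;

/*
 * Roll the counter forward to "now". Once a full cycle has elapsed, fold
 * the finished cycle's count into the historical value, halving it once
 * per additional idle cycle and resetting entirely after five idle cycles.
 */
static void
cntr_roll(zio_cntr_t *cp, uint64_t now)
{
	uint64_t elapsed = now - cp->cycle_start;
	int gens, i;

	if (elapsed < CYCLE_USEC)
		return;			/* still inside the current cycle */

	gens = (int)(elapsed / CYCLE_USEC);
	if (gens > 5) {
		cp->avg_cnt = 0;
	} else {
		if (cp->cycle_cnt > 1)
			cp->avg_cnt = cp->cycle_cnt;
		else
			cp->avg_cnt = cp->cycle_cnt + cp->avg_cnt / 2;
		for (i = 1; i < gens; i++)
			cp->avg_cnt /= 2;
	}
	cp->cycle_start = now;
	cp->cycle_cnt = 0;
}

int
main(void)
{
	zio_cntr_t c = { 0, 100, 0 };	/* 100 ops in the first cycle */
	uint64_t t;

	/* Three idle cycles follow; the history decays 100 -> 50 -> 25. */
	for (t = CYCLE_USEC; t <= 4 * CYCLE_USEC; t += CYCLE_USEC) {
		cntr_roll(&c, t);
		printf("t=%llus avg=%llu\n",
		    (unsigned long long)(t / 1000000),
		    (unsigned long long)c.avg_cnt);
	}
	return (0);
}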
+ */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. + */ +kmutex_t zfs_disk_lock; +uint_t zfs_disk_rcnt; +hrtime_t zfs_disk_rtime = 0; +hrtime_t zfs_disk_rlastupdate = 0; + +hrtime_t zfs_disk_last_rtime = 0; + +/* + * Data used to keep track of how often txg flush is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_flush_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on 1/2 of the zfs_vdev_max_pending value for the + * number of I/Os that can be pending on a device. If there are more than a + * few ops already queued up, beyond those already issued to the vdev, then + * use scheduling to get the next zio. + */ +int zfs_zone_schedule_thresh = 5; + +/* + * Tunables for delay throttling when TxG flush is occurring. + */ +int zfs_zone_txg_throttle_scale = 2; +int zfs_zone_txg_delay_ticks = 2; + +typedef struct { + int zq_qdepth; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new historical count. + * If we're still within an active cycle there is nothing to do, return false. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. 
*/ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. + * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen + * + * at fixed intervals, we use a timestamp to see when the last update was made. + * If it was more than one cycle ago, then we need to decay the average by the + * proper number of additional cycles in which no IO was performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static int +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (0); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. 
*/ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (1); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. + */ + extern void __dtrace_probe_zfs__zone__calc__wt__avg(uintptr_t, + uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__calc__wt__avg( + (uintptr_t)cp->sys_avg_lat, + (uintptr_t)cp->cycle_lat, + (uintptr_t)cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + extern void __dtrace_probe_zfs__zone__io__cnt(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__io__cnt((uintptr_t)zonep->zone_id, + (uintptr_t)(*rops), (uintptr_t)*wops, (uintptr_t)*lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. 
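The in-cycle weighting used by calc_avg_lat() above is easy to sanity-check outside the kernel. A minimal sketch with illustrative numbers (the helper name and the sample values are invented; the 8x weighting is the formula from the code):

#include <stdint.h>
#include <stdio.h>

/*
 * While a cycle is still open, recent samples are weighted 8x against the
 * decayed historical average so the estimate tracks bursts quickly.
 * Latencies are in microseconds.
 */
static uint64_t
weighted_avg(uint64_t hist_avg, uint64_t cycle_lat_sum, uint64_t cycle_cnt)
{
	return ((hist_avg + cycle_lat_sum * 8) / (1 + cycle_cnt * 8));
}

int
main(void)
{
	uint64_t hist = 500;		/* prior average: 500 us */

	/* A burst of 10 slow ops (5 ms each) pulls the estimate up fast. */
	printf("burst: %llu us\n",
	    (unsigned long long)weighted_avg(hist, 10 * 5000, 10));

	/* A quiet cycle (no samples yet) just reports the history. */
	printf("quiet: %llu us\n",
	    (unsigned long long)weighted_avg(hist, 0, 0));
	return (0);
}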
+ */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + extern void __dtrace_probe_zfs__zone__sys__avg__lat(uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sys__avg__lat((uintptr_t)(*rlat), + (uintptr_t)*wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + extern void __dtrace_probe_zfs__zone__utilization( + uint_t, uint_t, uint_t, uint_t, uint_t, uint_t); + + __dtrace_probe_zfs__zone__utilization((uint_t)(zonep->zone_id), + (uint_t)rops, (uint_t)wops, (uint_t)lwops, + (uint_t)zonep->zone_io_util, (uint_t)zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. + */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (zonep->zone_io_util > fairutil && + sp->zi_diskutil > zfs_zone_util_threshold) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + extern void __dtrace_probe_zfs__zone__throttle( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__throttle( + (uintptr_t)zonep->zone_id, (uintptr_t)delay, + (uintptr_t)zonep->zone_io_delay, (uintptr_t)fairutil, + (uintptr_t)zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. 
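Putting the fair-share calculation and the delay step/ceiling together, here is a rough user-land model of one adjustment pass over two zones, in the spirit of zfs_zone_wait_adjust_delay_cb() above. The struct and driver are illustrative; the formula and the tunable defaults mirror the code.

#include <stdint.h>
#include <stdio.h>

#define DELAY_STEP	5	/* mirrors zfs_zone_delay_step */
#define DELAY_CEILING	100	/* mirrors zfs_zone_delay_ceiling */
#define UTIL_THRESHOLD	80	/* mirrors zfs_zone_util_threshold */

typedef struct {
	uint64_t io_util;	/* rops*rlat + (wops+lwops)*wlat */
	uint64_t io_pri;	/* configured zfs-io-priority */
	unsigned io_delay;	/* current per-op delay, usecs */
} zone_model_t;

/*
 * One adjustment pass: a zone whose utilization exceeds its
 * priority-weighted fair share is slowed down (only while the disk itself
 * is busy and other zones are active); otherwise its delay decays.
 */
static void
adjust(zone_model_t *z, int nzones, unsigned diskutil)
{
	uint64_t totutil = 0, totpri = 0, fair;
	int i, active = 0;

	for (i = 0; i < nzones; i++) {
		totutil += z[i].io_util;
		if (z[i].io_util > 0) {
			active++;
			totpri += z[i].io_pri;
		}
	}

	for (i = 0; i < nzones; i++) {
		fair = (totpri > 0) ? totutil * z[i].io_pri / totpri :
		    (active > 0 ? totutil / active : 0);

		if (z[i].io_util > fair && diskutil > UTIL_THRESHOLD) {
			if (active > 1 && z[i].io_delay < DELAY_CEILING)
				z[i].io_delay += DELAY_STEP;
		} else if (z[i].io_util < fair || active <= 1) {
			if (z[i].io_delay > 0)
				z[i].io_delay -= DELAY_STEP;
		}
	}
}

int
main(void)
{
	zone_model_t z[2] = {
		{ 90000, 100, 0 },	/* busy zone */
		{ 10000, 100, 0 },	/* light zone */
	};
	int pass;

	for (pass = 0; pass < 3; pass++) {
		adjust(z, 2, 95);	/* disk 95% busy */
		printf("pass %d: delays %u / %u\n", pass,
		    z[0].io_delay, z[1].io_delay);
	}
	return (0);
}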
+ */ +static void +zfs_zone_wait_adjust(hrtime_t unow) +{ + zoneio_stats_t stats; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - zfs_zone_last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - zfs_zone_last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + extern void __dtrace_probe_zfs__zone__stats( + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__stats((uintptr_t)(stats.zi_avgrlat), + (uintptr_t)(stats.zi_avgwlat), + (uintptr_t)(stats.zi_active), + (uintptr_t)(stats.zi_totutil), + (uintptr_t)(stats.zi_totpri), + (uintptr_t)(stats.zi_diskutil)); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + zone_q_bump_t *qbp = arg; + + extern void __dtrace_probe_zfs__zone__enqueued(uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__enqueued((uintptr_t)(zonep->zone_id), + (uintptr_t)(zonep->zone_zfs_queued)); + + if (zonep->zone_zfs_queued == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < 20) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / zonep->zone_zfs_queued) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * schedule it next. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. 
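The priority formula in get_sched_pri_cb() above can be modeled the same way. In the sketch below (illustrative names and a standalone driver, not kernel code) a zone with a single queued op out-scores a zone with a deep backlog, which is exactly the interactive-workload case the surrounding comments describe.

#include <stdio.h>

typedef struct {
	int	zoneid;
	int	queued;		/* ops this zone has in the vdev queue */
	int	io_pri;		/* configured zfs-io-priority */
	int	weight;		/* anti-starvation weight, capped at 20 */
} zq_model_t;

/*
 * Pick the zone whose next zio should be bumped to the head of the queue:
 * (scaled queue depth / ops queued by the zone) * priority * weight.
 * Zones with little in the queue win unless priority or the starvation
 * weight pulls a busier zone ahead.
 */
static int
pick_zone(zq_model_t *zs, int nzones, int qdepth)
{
	int i, pri, best_pri = 0, best_zone = -1;

	for (i = 0; i < nzones; i++) {
		if (zs[i].queued == 0)
			continue;
		if (zs[i].weight < 20)
			zs[i].weight++;
		pri = (qdepth * 10 / zs[i].queued) *
		    zs[i].io_pri * zs[i].weight;
		if (pri > best_pri) {
			best_pri = pri;
			best_zone = zs[i].zoneid;
		}
	}
	return (best_zone);
}

int
main(void)
{
	zq_model_t zs[] = {
		{ 1, 40, 100, 1 },	/* parallel workload, deep backlog */
		{ 2, 1, 100, 1 },	/* interactive, single op queued */
	};

	printf("bump zone %d\n", pick_zone(zs, 2, 41));
	return (0);
}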
+ * + * For single-threaded synchronous workloads a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple workloads + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel workloads (and ops in the queue) vs. other zones which + * are doing simple single-threaded workloads, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. + * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_t *vq, int qdepth) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(&vq->vq_deadline_tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(&vq->vq_deadline_tree); zp != NULL; + zp = avl_walk(&vq->vq_deadline_tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + extern void __dtrace_probe_zfs__zone__sched__bump(uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + __dtrace_probe_zfs__zone__sched__bump( + (uintptr_t)(zp->io_zoneid), (uintptr_t)(cnt), + (uintptr_t)(qbump.zq_priority), (uintptr_t)(qbump.zq_wt)); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out RX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track IO operations per zone. Called from dmu_tx_count_write for write ops + * and dmu_read_uio for read ops. For each operation, increment that zone's + * counter based on the type of operation. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. 
Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice because this is called + * at a high level by a zone thread, but we also will count the phys. writes + * that are performed at a low level via zfs_zone_zio_start. + * + * Without this, it can look like a non-global zone never writes (case 1). + * Depending on when the TXG is flushed, the counts may be in the same sample + * bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through dmu_read_uio. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counters for logical operations here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) { + zfs_zone_wait_adjust(unow); + zfs_zone_last_checked = unow; + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TxG + * flushing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_flush_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + extern void __dtrace_probe_zfs__zone__wait( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__wait((uintptr_t)(zonep->zone_id), + (uintptr_t)type, (uintptr_t)wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TxG flush rate is running above the expected rate. + * If so, this implies that we are filling TxG's at a high rate due to a heavy + * write workload. 
We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the flush rate is going to be 1. When there + * is a heavy write load, TxG's fill up fast and the sync thread will write + * the TxG more frequently (perhaps once a second). In this case the rate + * will be > 1. The flush rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_flush_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_flush_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +int +zfs_zone_txg_delay() +{ + zone_t *zonep = curzone; + int delay = 1; + + if (zonep->zone_io_util_above_avg) + delay = zfs_zone_txg_delay_ticks; + + extern void __dtrace_probe_zfs__zone__txg__delay(uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__txg__delay((uintptr_t)(zonep->zone_id), + (uintptr_t)delay); + + return (delay); +} + +/* + * Called from zio_vdev_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. 
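The zfs_disk_rcnt/zfs_disk_rtime bookkeeping in zfs_zone_zio_start() and zfs_zone_zio_done() above amounts to a busy-time integral that is later reported as a percentage (zi_diskutil), much like iostat's %b. A user-land model follows; timestamps are hand-fed nanoseconds and the names are illustrative.

#include <stdint.h>
#include <stdio.h>

typedef struct {
	int	  rcnt;		/* I/Os currently outstanding */
	uint64_t  rtime;	/* accumulated busy time, ns */
	uint64_t  rlastupdate;	/* last time rtime was advanced */
} disk_model_t;

/* Advance the busy clock only while at least one I/O was outstanding. */
static void
io_start(disk_model_t *d, uint64_t now)
{
	if (d->rcnt++ != 0)
		d->rtime += now - d->rlastupdate;
	d->rlastupdate = now;
}

static void
io_done(disk_model_t *d, uint64_t now)
{
	d->rcnt--;
	d->rtime += now - d->rlastupdate;
	d->rlastupdate = now;
}

int
main(void)
{
	disk_model_t d = { 0, 0, 0 };
	uint64_t t0 = 0, t1 = 2000000000ULL;	/* a 2 s observation window */

	io_start(&d, 100000000ULL);		/* busy from 0.1 s .. 1.6 s */
	io_done(&d, 1600000000ULL);

	/* (busy-time delta * 100) / window length ~= %busy; prints 75%. */
	printf("disk util: %llu%%\n",
	    (unsigned long long)(d.rtime * 100 / (t1 - t0)));
	return (0);
}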
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_start; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + extern void __dtrace_probe_zfs__zone__latency( + uintptr_t, uintptr_t, uintptr_t); + + __dtrace_probe_zfs__zone__latency((uintptr_t)(zp->io_zoneid), + (uintptr_t)(zp->io_type), (uintptr_t)(udelta)); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued > 0); + if (zonep->zone_zfs_queued == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zone_t *zonep; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. This function is where zio's are found + * at the head of the queue (by avl_first), then pulled off (by + * vdev_queue_io_remove) and issued. We do our scheduling here to find the + * next zio to issue. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq) +{ + int cnt; + zoneid_t last_zone; + zio_t *zp; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + cnt = avl_numnodes(&vq->vq_deadline_tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few ops in the queue then just issue the head. + * If there are more than a few ops already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zp = avl_first(&vq->vq_deadline_tree); + else + zp = get_next_zio(vq, cnt); + + vq->vq_last_zone_id = zp->io_zoneid; + + /* + * Probe with 3 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, and the zone that was associated + * with the next IO that is scheduled. 
+ */ + extern void __dtrace_probe_zfs__zone__sched(uintptr_t, uintptr_t, + uintptr_t); + + __dtrace_probe_zfs__zone__sched((uintptr_t)(cnt), + (uintptr_t)(last_zone), (uintptr_t)(zp->io_zoneid)); + + return (zp); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index d1bed63f30..00964aa83f 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -36,6 +36,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/ddt.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -511,6 +512,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); + zio->io_start = gethrtime(); + mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -562,11 +565,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -904,6 +910,8 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + zio->io_start = gethrtime(); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { @@ -2289,6 +2297,9 @@ zio_vdev_io_start(zio_t *zio) ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + if (zio->io_type == ZIO_TYPE_WRITE) + zio->io_start = gethrtime(); + if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index c8fe20f2eb..7af4644cbf 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -66,6 +67,13 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/* + * The sha256_mac checksum algorithm was added to try to maintain on-disk + * compatibility with ZFS on other platforms. That effort didn't work for other + * reasons. As a result, the sha256_mac algorithm is unused except in the rare + * case of an older platform interpreting noparity as sha256_mac -- which is why + * they both are no-ops. 
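The zio_checksum_table rows added below map both new names to zio_checksum_off. The shape of such a table, an enum-indexed array of function pointers where several names deliberately share a no-op, can be shown with a toy standalone example; the enum values and the byte-sum function here are invented for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef void (*cksum_fn_t)(const void *, size_t, uint64_t *);

/* "No parity": nothing is computed or maintained. */
static void
cksum_off(const void *buf, size_t len, uint64_t *out)
{
	(void) buf;
	(void) len;
	*out = 0;
}

/* A trivial stand-in for a real checksum. */
static void
cksum_sum64(const void *buf, size_t len, uint64_t *out)
{
	const uint8_t *p = buf;
	uint64_t sum = 0;
	size_t i;

	for (i = 0; i < len; i++)
		sum += p[i];
	*out = sum;
}

enum { CK_OFF, CK_SUM64, CK_NOPARITY, CK_NFUNCS };

static const struct {
	cksum_fn_t	fn;
	const char	*name;
} cksum_table[CK_NFUNCS] = {
	[CK_OFF]	= { cksum_off,	 "off" },
	[CK_SUM64]	= { cksum_sum64, "sum64" },
	[CK_NOPARITY]	= { cksum_off,	 "noparity" },	/* same no-op */
};

int
main(void)
{
	const char *data = "dump device payload";
	uint64_t c;
	int i;

	for (i = 0; i < CK_NFUNCS; i++) {
		cksum_table[i].fn(data, strlen(data), &c);
		printf("%-8s -> %llu\n", cksum_table[i].name,
		    (unsigned long long)c);
	}
	return (0);
}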
+ */ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, 0, 0, 0, "inherit"}, {{NULL, NULL}, 0, 0, 0, "on"}, @@ -77,6 +85,8 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "sha256_mac"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "noparity"}, }; enum zio_checksum diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index fa26629c6e..ef96b1c401 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -77,9 +77,11 @@ #include <sys/zfs_rlock.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include "zfs_namecheck.h" @@ -1070,27 +1072,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, } static int -zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, - boolean_t doread, boolean_t isdump) +zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset, + uint64_t size, boolean_t doread, boolean_t isdump) { vdev_disk_t *dvd; int c; int numerrors = 0; - for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - int err = zvol_dumpio_vdev(vd->vdev_child[c], - addr, offset, size, doread, isdump); - if (err != 0) { - numerrors++; - } else if (doread) { - break; + if (vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) { + for (c = 0; c < vd->vdev_children; c++) { + int err = zvol_dumpio_vdev(vd->vdev_child[c], + addr, offset, origoffset, size, doread, isdump); + if (err != 0) { + numerrors++; + } else if (doread) { + break; + } } } - if (!vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops) return (numerrors < vd->vdev_children ? 0 : EIO); if (doread && !vdev_readable(vd)) @@ -1098,19 +1101,27 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, else if (!doread && !vdev_writeable(vd)) return (EIO); - dvd = vd->vdev_tsd; - ASSERT3P(dvd, !=, NULL); + if (vd->vdev_ops == &vdev_raidz_ops) { + return (vdev_raidz_physio(vd, + addr, size, offset, origoffset, doread)); + } + offset += VDEV_LABEL_START_SIZE; if (ddi_in_panic() || isdump) { ASSERT(!doread); if (doread) return (EIO); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset), lbtodb(size))); } else { - return (vdev_disk_physio(dvd->vd_lh, addr, size, offset, - doread ? B_READ : B_WRITE)); + dvd = vd->vdev_tsd; + ASSERT3P(dvd, !=, NULL); + + return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size, + offset, doread ? 
B_READ : B_WRITE)); } } @@ -1142,7 +1153,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva)); offset += DVA_GET_OFFSET(&ze->ze_dva); - error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump); + error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva), + size, doread, isdump); if (!ddi_in_panic()) spa_config_exit(spa, SCL_STATE, FTAG); @@ -1333,6 +1345,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1351,6 +1365,10 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1380,6 +1398,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1410,6 +1430,10 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + return (error); } @@ -1863,7 +1887,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) ZIO_COMPRESS_OFF) == 0); VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), - ZIO_CHECKSUM_OFF) == 0); + ZIO_CHECKSUM_NOPARITY) == 0); if (version >= SPA_VERSION_DEDUP) { VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_DEDUP), diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 42adb4c451..bd50364310 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 1990 Mentat Inc. */ @@ -2195,6 +2196,8 @@ struct ip_xmit_attr_s { */ ixa_notify_t ixa_notify; /* Registered upcall notify function */ void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ + + uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */ }; /* @@ -2266,6 +2269,14 @@ struct ip_xmit_attr_s { #define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ /* + * Trivial state machine used to synchronize IXA cleanup for TCP connections. + * See conn_ixa_cleanup(). + */ +#define IXATC_IDLE 0x00000000 +#define IXATC_INPROGRESS 0x00000001 +#define IXATC_COMPLETE 0x00000002 + +/* * Simplistic way to set the ixa_xmit_hint for locally generated traffic * and forwarded traffic. 
The shift amount are based on the size of the * structs to discard the low order bits which don't have much if any variation @@ -3030,6 +3041,7 @@ extern vmem_t *ip_minor_arena_la; #define ips_ip_strict_src_multihoming ips_propinfo_tbl[80].prop_cur_uval #define ips_ipv6_strict_src_multihoming ips_propinfo_tbl[81].prop_cur_uval #define ips_ipv6_drop_inbound_icmpv6 ips_propinfo_tbl[82].prop_cur_bval +#define ips_ip_dce_reclaim_threshold ips_propinfo_tbl[83].prop_cur_uval extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 3197858f8e..e040af14ba 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* @@ -1176,6 +1181,59 @@ ixa_cleanup_stale(ip_xmit_attr_t *ixa) } } +static mblk_t * +tcp_ixa_cleanup_getmblk(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + int need_retry; + mblk_t *mp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + /* + * It's possible that someone else came in and started cleaning up + * another connection between the time we verified this one is not being + * cleaned up and the time we actually get the shared mblk. If that's + * the case, we've dropped the lock, and some other thread may have + * cleaned up this connection again, and is still waiting for + * notification of that cleanup's completion. Therefore we need to + * recheck. + */ + do { + need_retry = 0; + while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + need_retry = 1; + cv_wait(&tcps->tcps_ixa_cleanup_ready_cv, + &tcps->tcps_ixa_cleanup_lock); + } + } while (need_retry); + + /* + * We now have the lock and the mblk; now make sure that no one else can + * try to clean up this connection or enqueue it for cleanup, clear the + * mblk pointer for this stack, drop the lock, and return the mblk. + */ + ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock)); + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE); + ASSERT(tcps->tcps_ixa_cleanup_mp == mp); + ASSERT(mp != NULL); + + connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS; + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + return (mp); +} + /* * Used to run ixa_cleanup_stale inside the tcp squeue. * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp @@ -1195,11 +1253,39 @@ tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2, mutex_enter(&tcps->tcps_ixa_cleanup_lock); ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); + connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE; tcps->tcps_ixa_cleanup_mp = mp; - cv_signal(&tcps->tcps_ixa_cleanup_cv); + cv_signal(&tcps->tcps_ixa_cleanup_ready_cv); + /* + * It is possible for any number of threads to be waiting for cleanup of + * different connections. Absent a per-connection (or per-IXA) CV, we + * need to wake them all up even though only one can be waiting on this + * particular cleanup. 
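The IXATC handshake above amounts to a small monitor built from one mutex and two condition variables: "ready" gates the single shared work token, and "done" wakes every waiter when any cleanup finishes. A user-land pthread sketch of the same IDLE -> INPROGRESS -> COMPLETE cycle (single waiter, illustrative names, not the kernel code):

#include <pthread.h>
#include <stdio.h>

enum { IDLE, INPROGRESS, COMPLETE };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ready_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int token_available = 1;	/* stands in for tcps_ixa_cleanup_mp */
static int conn_state = IDLE;	/* stands in for ixa_tcpcleanup */

/* The cleanup itself runs elsewhere (the squeue, in the patch). */
static void *
cleanup_worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	conn_state = COMPLETE;		/* cleanup has run */
	token_available = 1;		/* hand the token back */
	pthread_cond_signal(&ready_cv);
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	/* Acquire the token; mirrors tcp_ixa_cleanup_getmblk(). */
	pthread_mutex_lock(&lock);
	while (conn_state != IDLE)
		pthread_cond_wait(&done_cv, &lock);
	while (!token_available)
		pthread_cond_wait(&ready_cv, &lock);
	token_available = 0;
	conn_state = INPROGRESS;
	pthread_mutex_unlock(&lock);

	pthread_create(&tid, NULL, cleanup_worker, NULL);

	/* Wait and finish; mirrors tcp_ixa_cleanup_wait_and_finish(). */
	pthread_mutex_lock(&lock);
	while (conn_state != COMPLETE)
		pthread_cond_wait(&done_cv, &lock);
	conn_state = IDLE;
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("cleanup handshake complete\n");
	return (0);
}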
+ */ + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); mutex_exit(&tcps->tcps_ixa_cleanup_lock); } +static void +tcp_ixa_cleanup_wait_and_finish(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE); + + while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE); + connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE; + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); + + mutex_exit(&tcps->tcps_ixa_cleanup_lock); +} /* * ipcl_walk() function to help release any IRE, NCE, or DCEs that @@ -1214,21 +1300,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) if (IPCL_IS_TCP(connp)) { mblk_t *mp; - tcp_stack_t *tcps; - - tcps = connp->conn_netstack->netstack_tcp; - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { - /* - * Multiple concurrent cleanups; need to have the last - * one run since it could be an unplumb. - */ - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - tcps->tcps_ixa_cleanup_mp = NULL; - mutex_exit(&tcps->tcps_ixa_cleanup_lock); + mp = tcp_ixa_cleanup_getmblk(connp); if (connp->conn_sqp->sq_run == curthread) { /* Already on squeue */ @@ -1237,15 +1310,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); - - /* Wait until tcp_ixa_cleanup has run */ - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while (tcps->tcps_ixa_cleanup_mp == NULL) { - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - mutex_exit(&tcps->tcps_ixa_cleanup_lock); } + tcp_ixa_cleanup_wait_and_finish(connp); } else if (IPCL_IS_SCTP(connp)) { sctp_t *sctp; sctp_faddr_t *fp; diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c index 215bc4675f..502ee8a735 100644 --- a/usr/src/uts/common/inet/ip/ip_dce.c +++ b/usr/src/uts/common/inet/ip/ip_dce.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/atomic.h> +#include <sys/callb.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> @@ -102,7 +104,19 @@ static void dce_delete_locked(dcb_t *, dce_t *); static void dce_make_condemned(dce_t *); static kmem_cache_t *dce_cache; +static kthread_t *dce_reclaim_thread; +static kmutex_t dce_reclaim_lock; +static kcondvar_t dce_reclaim_cv; +static int dce_reclaim_shutdown; +/* Global so it can be tuned in /etc/system. This must be a power of two. */ +uint_t ip_dce_hash_size = 1024; + +/* The time in seconds between executions of the IP DCE reclaim worker. 
*/ +uint_t ip_dce_reclaim_interval = 60; + +/* The factor of the DCE threshold at which to start hard reclaims */ +uint_t ip_dce_reclaim_threshold_hard = 2; /* Operates on a uint64_t */ #define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) @@ -117,6 +131,11 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) uint_t fraction_pmtu = fraction*4; uint_t hash; dce_t *dce, *nextdce; + hrtime_t seed = gethrtime(); + uint_t retained = 0; + uint_t max = ipst->ips_ip_dce_reclaim_threshold; + + max *= ip_dce_reclaim_threshold_hard; rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { @@ -132,13 +151,21 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) } else { mutex_exit(&dce->dce_lock); } - hash = RANDOM_HASH((uint64_t)(uintptr_t)dce); - if (dce->dce_flags & DCEF_PMTU) { - if (hash % fraction_pmtu != 0) - continue; - } else { - if (hash % fraction != 0) - continue; + + if (max == 0 || retained < max) { + hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed)); + + if (dce->dce_flags & DCEF_PMTU) { + if (hash % fraction_pmtu != 0) { + retained++; + continue; + } + } else { + if (hash % fraction != 0) { + retained++; + continue; + } + } } IP_STAT(ipst, ip_dce_reclaim_deleted); @@ -175,17 +202,19 @@ ip_dce_reclaim_stack(ip_stack_t *ipst) } /* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. + * Called by dce_reclaim_worker() below, and no one else. Typically this will + * mean that the number of entries in the hash buckets has exceeded a tunable + * threshold. */ -/* ARGSUSED */ -void -ip_dce_reclaim(void *args) +static void +ip_dce_reclaim(void) { netstack_handle_t nh; netstack_t *ns; ip_stack_t *ipst; + ASSERT(curthread == dce_reclaim_thread); + netstack_next_init(&nh); while ((ns = netstack_next(&nh)) != NULL) { /* @@ -196,26 +225,75 @@ ip_dce_reclaim(void *args) netstack_rele(ns); continue; } - ip_dce_reclaim_stack(ipst); + if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0) + ip_dce_reclaim_stack(ipst); netstack_rele(ns); } netstack_next_fini(&nh); } +/* ARGSUSED */ +static void +dce_reclaim_worker(void *arg) +{ + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr, + "dce_reclaim_worker"); + + mutex_enter(&dce_reclaim_lock); + while (!dce_reclaim_shutdown) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock, + ddi_get_lbolt() + ip_dce_reclaim_interval * hz); + CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock); + + if (dce_reclaim_shutdown) + break; + + mutex_exit(&dce_reclaim_lock); + ip_dce_reclaim(); + mutex_enter(&dce_reclaim_lock); + } + + ASSERT(MUTEX_HELD(&dce_reclaim_lock)); + dce_reclaim_thread = NULL; + dce_reclaim_shutdown = 0; + cv_broadcast(&dce_reclaim_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops the lock */ + + thread_exit(); +} + void dce_g_init(void) { dce_cache = kmem_cache_create("dce_cache", - sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0); + sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL); + + dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker, + NULL, 0, &p0, TS_RUN, minclsyspri); } void dce_g_destroy(void) { + mutex_enter(&dce_reclaim_lock); + dce_reclaim_shutdown = 1; + cv_signal(&dce_reclaim_cv); + while (dce_reclaim_thread != NULL) + cv_wait(&dce_reclaim_cv, &dce_reclaim_lock); + mutex_exit(&dce_reclaim_lock); + + 
cv_destroy(&dce_reclaim_cv); + mutex_destroy(&dce_reclaim_lock); + kmem_cache_destroy(dce_cache); } - /* * Allocate a default DCE and a hash table for per-IP address DCEs */ @@ -234,7 +312,7 @@ dce_stack_init(ip_stack_t *ipst) ipst->ips_dce_default->dce_ipst = ipst; /* This must be a power of two since we are using IRE_ADDR_HASH macro */ - ipst->ips_dce_hashsize = 256; + ipst->ips_dce_hashsize = ip_dce_hash_size; ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP); ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize * @@ -414,6 +492,12 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v4[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (dce->dce_v4addr == dst) { @@ -447,6 +531,7 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) dce->dce_ptpn = &dcb->dcb_dce; dcb->dcb_dce = dce; dce->dce_bucket = dcb; + atomic_add_32(&dcb->dcb_cnt, 1); dce_refhold(dce); /* For the caller */ rw_exit(&dcb->dcb_lock); @@ -476,6 +561,12 @@ dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst) hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v6[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c index 516d6c1a21..1e249b493e 100644 --- a/usr/src/uts/common/inet/ip/ip_tunables.c +++ b/usr/src/uts/common/inet/ip/ip_tunables.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -908,6 +909,11 @@ mod_prop_info_t ip_propinfo_tbl[] = { #else { "", 0, NULL, NULL, {0}, {0} }, #endif + + { "_dce_reclaim_threshold", MOD_PROTO_IP, + mod_set_uint32, mod_get_uint32, + {1, 100000, 32}, {32} }, + { "mtu", MOD_PROTO_IPV4, NULL, ip_get_mtu, {0}, {0} }, { "mtu", MOD_PROTO_IPV6, NULL, ip_get_mtu, {0}, {0} }, diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index 47972a8c1a..96a0457678 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index a564376cfb..706752b236 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -269,6 +269,7 @@ struct ip_stack { uint_t ips_dce_hashsize; struct dcb_s *ips_dce_hash_v4; struct dcb_s *ips_dce_hash_v6; + uint_t ips_dce_reclaim_needed; /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c new file mode 100644 index 0000000000..5670e5afaa --- /dev/null +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -0,0 +1,1192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * ipd: Internet packet disturber + * + * The purpose of ipd is to simulate congested and lossy networks when they + * don't actually exist. The features of these congested and lossy networks are + * events that end up leading to retransmits and thus kicking us out of the + * TCP/IP fastpath. Since normally this would require us to have an actually + * congested network, which can be problematic, we instead simulate this + * behavior. + * + * 1. ipd's operations and restrictions + * + * ipd currently has facilities to cause IP traffic to be: + * + * - Corrupted with some probability. + * - Delayed for a set number of microseconds. + * - Dropped with some probability. + * + * Each of these features are enabled on a per-zone basic. The current + * implementation restricts this specifically to exclusive stack zones. + * Enabling ipd on a given zone causes pfhooks to be installed for that zone's + * netstack. 
Because of the nature of ipd, it currently only supports exclusive + * stack zones and as a further restriction, it only allows the global zone + * administrative access. ipd can be enabled for the global zone, but doing so + * will cause all shared-stack zones to also be affected. + * + * 2. General architecture and Locking + * + * ipd consists of a few components. There is a per netstack data structure that + * is created and destroyed with the creation and destruction of each exclusive + * stack zone. Each of these netstacks is stored in a global list which is + * accessed for control of ipd via ioctls. The following diagram touches on the + * data structures that are used throughout ipd. + * + * ADMINISTRATIVE DATA PATH + * + * +--------+ +------+ +------+ + * | ipdadm | | ip | | nics | + * +--------+ +------+ +------+ + * | ^ | | + * | | ioctl(2) | | + * V | V V + * +----------+ +-------------------------+ + * | /dev/ipd | | pfhooks packet callback | == ipd_hook() + * +----------+ +-------------------------+ + * | | + * | | + * V | + * +----------------+ | + * | list_t ipd_nsl |------+ | + * +----------------+ | | + * | | + * V per netstack V + * +----------------------------+ + * | ipd_nestack_t | + * +----------------------------+ + * + * ipd has two different entry points, one is administrative, the other is the + * data path. The administrative path is accessed by a userland component called + * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd. + * If the administrative path enables a specific zone, then the data path will + * become active for that zone. Any packet that leaves that zone's IP stack or + * is going to enter it, comes through the callback specified in the hook_t(9S) + * structure. This will cause each packet to go through ipd_hook(). + * + * While the locking inside of ipd should be straightforward, unfortunately, the + * pfhooks subsystem necessarily complicates this a little bit. There are + * currently three different sets of locks in ipd. + * + * - Global lock N on the netstack list. + * - Global lock A on the active count. + * - Per-netstack data structure lock Z. + * + * # Locking rules + * + * L.1a N must always be acquired first and released last + * + * If you need to acquire the netstack list lock, either for reading or writing, + * then N must be acquired first and before any other locks. It may not be + * dropped before any other lock. + * + * L.1b N must only be acquired from the administrative path and zone creation, + * shutdown, and destruct callbacks. + * + * The data path, e.g. receiving the per-packet callbacks, should never be + * grabbing the list lock. If it is, then the architecture here needs to be + * reconsidered. + * + * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks + * are active. + * + * The way the pfhooks subsystem is designed is that a reference count is + * present on the hook_t while it is active. As long as that reference count is + * non-zero, a call to net_hook_unregister will block until it is lowered. + * Because the callbacks want the same lock for the netstack that is held by the + * administrative path calling into net_hook_unregister, we deadlock. 
+ * + * ioctl from ipdadm remove hook_t cb (from nic) hook_t cb (from IP) + * ----------------------- -------------------- ------------------- + * | | | + * | bump hook_t refcount | + * mutex_enter(ipd_nsl_lock); enter ipd_hook() bump hook_t refcount + * mutex acquired mutex_enter(ins->ipdn_lock); | + * | mutex acquired enter ipd_hook() + * mutex_enter(ins->ipdn_lock); | mutex_enter(ins->ipdn_lock); + * | | | + * | | | + * | mutex_exit(ins->ipdn_lock); | + * | | | + * mutex acquired leave ipd_hook() | + * | decrement hook_t refcount | + * | | | + * ipd_teardown_hooks() | | + * net_hook_unregister() | | + * cv_wait() if recount | | + * | | | + * --------------------------------------------------------------------------- + * + * At this point, we can see that the second hook callback still doesn't have + * the mutex, but it has bumped the hook_t refcount. However, it will never + * acquire the mutex that it needs to finish its operation and decrement the + * refcount. + * + * Obviously, deadlocking is not acceptable, thus the following corollary to the + * second locking rule: + * + * L.2 Corollary: If Z is being released across a call to the pfhooks subsystem, + * N must be held. + * + * There is currently only one path where we have to worry about this. That is + * when we are removing a hook, but the zone is not being shutdown, then hooks + * are currently active. The only place that this currently happens is in + * ipd_check_hooks(). + * + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/kstat.h> +#include <sys/neti.h> +#include <sys/list.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/model.h> +#include <sys/strsun.h> + +#include <sys/netstack.h> +#include <sys/hook.h> +#include <sys/hook_event.h> + +#include <sys/ipd.h> + +#define IPDN_STATUS_DISABLED 0x1 +#define IPDN_STATUS_ENABLED 0x2 +#define IPDN_STATUS_CONDEMNED 0x4 + +/* + * These flags are used to determine whether or not the hooks are registered. + */ +#define IPDN_HOOK_NONE 0x0 +#define IPDN_HOOK_V4IN 0x1 +#define IPDN_HOOK_V4OUT 0x2 +#define IPDN_HOOK_V6IN 0x4 +#define IPDN_HOOK_V6OUT 0x8 +#define IPDN_HOOK_ALL 0xf + +/* + * Per-netstack kstats. + */ +typedef struct ipd_nskstat { + kstat_named_t ink_ndrops; + kstat_named_t ink_ncorrupts; + kstat_named_t ink_ndelays; +} ipd_nskstat_t; + +/* + * Different parts of this structure have different locking semantics. The list + * node is not normally referenced, if it is, one has to hold the ipd_nsl_lock. + * The following members are read only: ipdn_netid and ipdn_zoneid. The members + * of the kstat structure are always accessible in the data path, but the + * counters must be bumped with atomic operations. The ipdn_lock protects every + * other aspect of this structure. Please see the big theory statement on the + * requirements for lock ordering. 
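Because the counters in ipd_nskstat_t are exported as named kstats (ipd_nin_create() below registers them under module "ipd", name "ipd", class "net", with entries "corrupts", "drops" and "delays"), they can be read from userland with libkstat. A minimal sketch, assuming those module/name strings and linking with -lkstat:

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
    kstat_ctl_t *kc;
    kstat_t *ksp;
    kstat_named_t *kn;

    if ((kc = kstat_open()) == NULL) {
        perror("kstat_open");
        return (1);
    }

    /* -1 matches any instance; module and name are both "ipd". */
    if ((ksp = kstat_lookup(kc, "ipd", -1, "ipd")) == NULL ||
        kstat_read(kc, ksp, NULL) == -1) {
        fprintf(stderr, "ipd kstat not found\n");
        (void) kstat_close(kc);
        return (1);
    }

    if ((kn = kstat_data_lookup(ksp, "drops")) != NULL)
        printf("drops:    %llu\n", (unsigned long long)kn->value.ui64);
    if ((kn = kstat_data_lookup(ksp, "corrupts")) != NULL)
        printf("corrupts: %llu\n", (unsigned long long)kn->value.ui64);
    if ((kn = kstat_data_lookup(ksp, "delays")) != NULL)
        printf("delays:   %llu\n", (unsigned long long)kn->value.ui64);

    (void) kstat_close(kc);
    return (0);
}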
+ */ +typedef struct ipd_netstack { + list_node_t ipdn_link; /* link on ipd_nsl */ + netid_t ipdn_netid; /* netstack id */ + zoneid_t ipdn_zoneid; /* zone id */ + kstat_t *ipdn_kstat; /* kstat_t ptr */ + ipd_nskstat_t ipdn_ksdata; /* kstat data */ + kmutex_t ipdn_lock; /* protects following members */ + int ipdn_status; /* status flags */ + net_handle_t ipdn_v4hdl; /* IPv4 net handle */ + net_handle_t ipdn_v6hdl; /* IPv6 net handle */ + int ipdn_hooked; /* are hooks registered */ + hook_t *ipdn_v4in; /* IPv4 traffic in hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ + hook_t *ipdn_v6in; /* IPv6 traffic in hook */ + hook_t *ipdn_v6out; /* IPv6 traffic out hook */ + int ipdn_enabled; /* which perturbs are on */ + int ipdn_corrupt; /* corrupt percentage */ + int ipdn_drop; /* drop percentage */ + uint_t ipdn_delay; /* delay us */ + long ipdn_rand; /* random seed */ +} ipd_netstack_t; + +/* + * ipd internal variables + */ +static dev_info_t *ipd_devi; /* device info */ +static net_instance_t *ipd_neti; /* net_instance for hooks */ +static unsigned int ipd_max_delay = IPD_MAX_DELAY; /* max delay in us */ +static kmutex_t ipd_nsl_lock; /* lock for the netstack list */ +static list_t ipd_nsl; /* list of netstacks */ +static kmutex_t ipd_nactive_lock; /* lock for nactive */ +static unsigned int ipd_nactive; /* number of active netstacks */ +static int ipd_nactive_fudge = 4; /* amount to fudge by in list */ + +/* + * Note that this random number implementation is based upon the old BSD 4.1 + * rand. It's good enough for us! + */ +static int +ipd_nextrand(ipd_netstack_t *ins) +{ + ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345; + return (ins->ipdn_rand & 0x7fffffff); +} + +static void +ipd_ksbump(kstat_named_t *nkp) +{ + atomic_inc_64(&nkp->value.ui64); +} + +/* + * This is where all the magic actually happens. The way that this works is we + * grab the ins lock to basically get a copy of all the data that we need to do + * our job and then let it go to minimize contention. In terms of actual work on + * the packet, we apply the perturbations in the following order: + * + * - drop + * - delay + * - corrupt + */ +/*ARGSUSED*/ +static int +ipd_hook(hook_event_token_t event, hook_data_t data, void *arg) +{ + unsigned char *crp; + int dwait, corrupt, drop, rand, off, status; + mblk_t *mbp; + ipd_netstack_t *ins = arg; + hook_pkt_event_t *pkt = (hook_pkt_event_t *)data; + + mutex_enter(&ins->ipdn_lock); + status = ins->ipdn_status; + dwait = ins->ipdn_delay; + corrupt = ins->ipdn_corrupt; + drop = ins->ipdn_drop; + rand = ipd_nextrand(ins); + mutex_exit(&ins->ipdn_lock); + + /* + * This probably cannot happen, but we'll do an extra guard just in + * case. + */ + if (status & IPDN_STATUS_CONDEMNED) + return (0); + + if (drop != 0 && rand % 100 < drop) { + freemsg(*pkt->hpe_mp); + *pkt->hpe_mp = NULL; + pkt->hpe_mb = NULL; + pkt->hpe_hdr = NULL; + ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops); + + return (1); + } + + if (dwait != 0) { + if (dwait < TICK_TO_USEC(1)) + drv_usecwait(dwait); + else + delay(drv_usectohz(dwait)); + ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays); + } + + if (corrupt != 0 && rand % 100 < corrupt) { + /* + * Since we're corrupting the mblk, just corrupt everything in + * the chain. While we could corrupt the entire packet, that's a + * little strong. Instead we're going to just change one of the + * bytes in each mblock.
+ */ + mbp = *pkt->hpe_mp; + while (mbp != NULL) { + if (mbp->b_wptr == mbp->b_rptr) + continue; + + /* + * While pfhooks probably won't send us anything else, + * let's just be extra careful. The stack probably isn't + * as resiliant to corruption of control messages. + */ + if (DB_TYPE(mbp) != M_DATA) + continue; + + off = rand % ((uintptr_t)mbp->b_wptr - + (uintptr_t)mbp->b_rptr); + crp = mbp->b_rptr + off; + off = rand % 8; + *crp = *crp ^ (1 << off); + + mbp = mbp->b_cont; + } + ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts); + } + + return (0); +} + +/* + * Sets up and registers all the proper hooks needed for the netstack to capture + * packets. Callers are assumed to already be holding the ipd_netstack_t's lock. + * If there is a failure in setting something up, it is the responsibility of + * this function to clean it up. Once this function has been called, it should + * not be called until a corresponding call to tear down the hooks has been + * done. + */ +static int +ipd_setup_hooks(ipd_netstack_t *ins) +{ + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET); + if (ins->ipdn_v4hdl == NULL) + goto cleanup; + + ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6); + if (ins->ipdn_v6hdl == NULL) + goto cleanup; + + ins->ipdn_v4in = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v4in == NULL) + goto cleanup; + + ins->ipdn_v4in->h_flags = 0; + ins->ipdn_v4in->h_hint = HH_NONE; + ins->ipdn_v4in->h_hintvalue = 0; + ins->ipdn_v4in->h_func = ipd_hook; + ins->ipdn_v4in->h_arg = ins; + ins->ipdn_v4in->h_name = "ipd IPv4 in"; + + if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V4IN; + + ins->ipdn_v4out = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v4out == NULL) + goto cleanup; + ins->ipdn_v4out->h_flags = 0; + ins->ipdn_v4out->h_hint = HH_NONE; + ins->ipdn_v4out->h_hintvalue = 0; + ins->ipdn_v4out->h_func = ipd_hook; + ins->ipdn_v4out->h_arg = ins; + ins->ipdn_v4out->h_name = "ipd IPv4 out"; + + if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT, + ins->ipdn_v4out) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V4OUT; + + ins->ipdn_v6in = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v6in == NULL) + goto cleanup; + ins->ipdn_v6in->h_flags = 0; + ins->ipdn_v6in->h_hint = HH_NONE; + ins->ipdn_v6in->h_hintvalue = 0; + ins->ipdn_v6in->h_func = ipd_hook; + ins->ipdn_v6in->h_arg = ins; + ins->ipdn_v6in->h_name = "ipd IPv6 in"; + + if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V6IN; + + ins->ipdn_v6out = hook_alloc(HOOK_VERSION); + if (ins->ipdn_v6out == NULL) + goto cleanup; + ins->ipdn_v6out->h_flags = 0; + ins->ipdn_v6out->h_hint = HH_NONE; + ins->ipdn_v6out->h_hintvalue = 0; + ins->ipdn_v6out->h_func = ipd_hook; + ins->ipdn_v6out->h_arg = ins; + ins->ipdn_v6out->h_name = "ipd IPv6 out"; + + if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out) != 0) + goto cleanup; + ins->ipdn_hooked |= IPDN_HOOK_V6OUT; + mutex_enter(&ipd_nactive_lock); + ipd_nactive++; + mutex_exit(&ipd_nactive_lock); + + return (0); + +cleanup: + if (ins->ipdn_hooked & IPDN_HOOK_V6OUT) + (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out); + + if (ins->ipdn_hooked & IPDN_HOOK_V6IN) + (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in); + + if (ins->ipdn_hooked & IPDN_HOOK_V4OUT) + (void) net_hook_unregister(ins->ipdn_v4hdl, 
NH_PHYSICAL_OUT, + ins->ipdn_v4out); + + if (ins->ipdn_hooked & IPDN_HOOK_V4IN) + (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in); + + ins->ipdn_hooked = IPDN_HOOK_NONE; + + if (ins->ipdn_v6out != NULL) + hook_free(ins->ipdn_v6out); + + if (ins->ipdn_v6in != NULL) + hook_free(ins->ipdn_v6in); + + if (ins->ipdn_v4out != NULL) + hook_free(ins->ipdn_v4out); + + if (ins->ipdn_v4in != NULL) + hook_free(ins->ipdn_v4in); + + if (ins->ipdn_v6hdl != NULL) + (void) net_protocol_release(ins->ipdn_v6hdl); + + if (ins->ipdn_v4hdl != NULL) + (void) net_protocol_release(ins->ipdn_v4hdl); + + return (1); +} + +static void +ipd_teardown_hooks(ipd_netstack_t *ins) +{ + ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL); + VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT, + ins->ipdn_v6out) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN, + ins->ipdn_v6in) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT, + ins->ipdn_v4out) == 0); + VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN, + ins->ipdn_v4in) == 0); + + ins->ipdn_hooked = IPDN_HOOK_NONE; + + hook_free(ins->ipdn_v6out); + hook_free(ins->ipdn_v6in); + hook_free(ins->ipdn_v4out); + hook_free(ins->ipdn_v4in); + + VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0); + VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0); + + mutex_enter(&ipd_nactive_lock); + ipd_nactive--; + mutex_exit(&ipd_nactive_lock); +} + +static int +ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable) +{ + int olden, rval; + olden = ins->ipdn_enabled; + + if (enable) + ins->ipdn_enabled |= type; + else + ins->ipdn_enabled &= ~type; + + /* + * If no perturbation was enabled before and one is now, the hooks need + * to be installed. + */ + if (olden == 0 && ins->ipdn_enabled != 0) { + rval = ipd_setup_hooks(ins); + if (rval != 0) { + ins->ipdn_enabled &= ~type; + ASSERT(ins->ipdn_enabled == 0); + return (rval); + } + + return (0); + } + + if (olden != 0 && ins->ipdn_enabled == 0) { + ASSERT(olden != 0); + + /* + * We have to drop the lock here, lest we cause a deadlock. + * Unfortunately, there may be hooks that are running and are + * actively in flight and we have to call the unregister + * function. Due to the hooks framework, if there is an inflight + * hook (most likely right now), and we are holding the + * netstack's lock, those hooks will never return. This is + * unfortunate. + * + * Because we only come into this path holding the list lock, we + * know that the only way that someone else can come in and get + * to this structure is via the hook callbacks which are going to + * only be doing reads. They'll also see that everything has + * been disabled and return. So while this is unfortunate, it + * should be relatively safe. + */ + mutex_exit(&ins->ipdn_lock); + ipd_teardown_hooks(ins); + mutex_enter(&ins->ipdn_lock); + return (0); + } + + /* + * Otherwise, nothing should have changed here. + */ + ASSERT((olden == 0) == (ins->ipdn_enabled == 0)); + return (0); +} + +static int +ipd_toggle_corrupt(ipd_netstack_t *ins, int percent) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (percent < 0 || percent > 100) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done.
+ */ + if (percent == ins->ipdn_corrupt) + return (0); + + ins->ipdn_corrupt = percent; + rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable corruption. + */ + if (rval != 0) + ins->ipdn_corrupt = 0; + + return (rval); +} + +static int +ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (delay > ipd_max_delay) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done. + */ + if (delay == ins->ipdn_delay) + return (0); + + ins->ipdn_delay = delay; + rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable the delay. + */ + if (rval != 0) + ins->ipdn_delay = 0; + + return (rval); +} + +static int +ipd_toggle_drop(ipd_netstack_t *ins, int percent) +{ + int rval; + + ASSERT(MUTEX_HELD(&ins->ipdn_lock)); + + if (percent < 0 || percent > 100) + return (ERANGE); + + /* + * If we've been asked to set the value to a value that we already have, + * great, then we're done. + */ + if (percent == ins->ipdn_drop) + return (0); + + ins->ipdn_drop = percent; + rval = ipd_check_hooks(ins, IPD_DROP, percent != 0); + + /* + * If ipd_check_hooks failed, that must mean that we failed to set up + * the hooks, so we are going to effectively zero out and fail the + * request to enable dropping. + */ + if (rval != 0) + ins->ipdn_drop = 0; + + return (rval); +} + +static int +ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd) +{ + zoneid_t zid; + ipd_netstack_t *ins; + int rval = 0; + + /* + * If the zone that we're coming from is not the GZ, then we ignore it + * completely and then instead just set the zoneid to be that of the + * caller. If the zoneid is that of the GZ, then we don't touch this + * value. + */ + zid = crgetzoneid(cr); + if (zid != GLOBAL_ZONEID) + ipi->ipip_zoneid = zid; + + if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID && + zid != GLOBAL_ZONEID) + return (EPERM); + + /* + * We need to hold the ipd_nsl_lock throughout the entire operation, + * otherwise someone else could come in and remove us from the list and + * free us, e.g. the netstack destroy handler. By holding the lock, we + * stop it from being able to do anything wrong. + */ + mutex_enter(&ipd_nsl_lock); + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_zoneid == ipi->ipip_zoneid) + break; + } + + if (ins == NULL) { + mutex_exit(&ipd_nsl_lock); + return (EINVAL); + } + + mutex_enter(&ins->ipdn_lock); + + if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) { + rval = ESHUTDOWN; + goto cleanup; + } + + switch (cmd) { + case IPDIOC_CORRUPT: + rval = ipd_toggle_corrupt(ins, ipi->ipip_arg); + break; + case IPDIOC_DELAY: + rval = ipd_toggle_delay(ins, ipi->ipip_arg); + break; + case IPDIOC_DROP: + rval = ipd_toggle_drop(ins, ipi->ipip_arg); + break; + } + +cleanup: + mutex_exit(&ins->ipdn_lock); + mutex_exit(&ipd_nsl_lock); + return (rval); +} + +static int +ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr) +{ + zoneid_t zid; + ipd_netstack_t *ins; + int rval = 0; + + /* + * See ipd_ioctl_perturb for the rationale here.
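For illustration, the administrative path that ends up in ipd_ioctl_perturb() above is driven from userland roughly as follows. The full ipd_ioc_perturb_t definition lives in <sys/ipd.h> and is not part of this hunk, so the sketch assumes only the two fields the handler uses (ipip_zoneid, ipip_arg); it is not the actual ipdadm(1M) source, and the open(2) is gated by secpolicy_ip_config(), so it needs suitable privilege.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/ipd.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <zone.h>

int
main(void)
{
    ipd_ioc_perturb_t ipip;
    int fd;

    /* ipd_open() insists on FREAD|FWRITE, so open read-write. */
    if ((fd = open("/dev/ipd", O_RDWR)) == -1) {
        perror("open /dev/ipd");
        return (1);
    }

    /*
     * A non-global caller has its zone id forced by the driver, so the
     * value only matters when this runs in the global zone.
     */
    ipip.ipip_zoneid = getzoneid();
    ipip.ipip_arg = 10;         /* corrupt 10% of packets */

    if (ioctl(fd, IPDIOC_CORRUPT, &ipip) == -1) {
        perror("IPDIOC_CORRUPT");
        (void) close(fd);
        return (1);
    }

    (void) close(fd);
    return (0);
}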
+ */ + zid = crgetzoneid(cr); + if (zid != GLOBAL_ZONEID) + ipi->ipip_zoneid = zid; + + if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID && + zid != GLOBAL_ZONEID) + return (EPERM); + + mutex_enter(&ipd_nsl_lock); + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_zoneid == ipi->ipip_zoneid) + break; + } + + if (ins == NULL) { + mutex_exit(&ipd_nsl_lock); + return (EINVAL); + } + + mutex_enter(&ins->ipdn_lock); + + /* + * If this is condemned, that means it's very shortly going to be torn + * down. In that case, there's no reason to actually do anything here, + * as it will all be done rather shortly in the destroy function. + * Furthermore, because condemned corresponds with it having hit + * shutdown, we know that no more packets can be received by this + * netstack. All this translates to a no-op. + */ + if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) { + rval = 0; + goto cleanup; + } + + rval = EINVAL; + /* + * Go through and disable the requested pieces. We can safely ignore the + * return value of ipd_check_hooks because the removal case should never + * fail, we verify that in the hook teardown case. + */ + if (ipi->ipip_arg & IPD_CORRUPT) { + ins->ipdn_corrupt = 0; + (void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE); + rval = 0; + } + + if (ipi->ipip_arg & IPD_DELAY) { + ins->ipdn_delay = 0; + (void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE); + rval = 0; + } + + if (ipi->ipip_arg & IPD_DROP) { + ins->ipdn_drop = 0; + (void) ipd_check_hooks(ins, IPD_DROP, B_FALSE); + rval = 0; + } + +cleanup: + mutex_exit(&ins->ipdn_lock); + mutex_exit(&ipd_nsl_lock); + return (rval); +} + +/* + * When this function is called, the value of the ipil_nzones argument controls + * how this function works. When called with a value of zero, then we treat that + * as the caller asking us what's a reasonable number of entries for me to + * allocate memory for. If the zone is the global zone, then we tell them how + * many folks are currently active and add a fudge factor. Otherwise the answer + * is always one. + * + * In the non-zero case, we give them that number of zone ids. While this isn't + * quite ideal as it might mean that someone misses something, this generally + * won't be an issue, as it involves a rather tight race condition in the + * current ipdadm implementation. 
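The zero/non-zero ipil_nzones convention described above is a standard two-call sizing protocol. A hedged consumer sketch follows; the ipd_ioc_list and ipd_ioc_info_t definitions come from <sys/ipd.h>, which this change does not show, so the typedef name ipd_ioc_list_t and the exact layout are assumptions, with only ipil_nzones, ipil_info and the ipii_* members taken from the handler below.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/ipd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
    ipd_ioc_list_t ipil;
    ipd_ioc_info_t *info;
    uint_t hint, n, i;
    int fd;

    if ((fd = open("/dev/ipd", O_RDWR)) == -1) {
        perror("open /dev/ipd");
        return (1);
    }

    /* First call: ipil_nzones == 0 asks the driver for a sizing hint. */
    ipil.ipil_nzones = 0;
    if (ioctl(fd, IPDIOC_LIST, &ipil) == -1) {
        perror("IPDIOC_LIST (sizing)");
        return (1);
    }

    if ((hint = ipil.ipil_nzones) == 0)
        return (0);

    if ((info = calloc(hint, sizeof (ipd_ioc_info_t))) == NULL)
        return (1);

    /* Second call: pass the buffer and how many entries it can hold. */
    ipil.ipil_nzones = hint;
    ipil.ipil_info = info;
    if (ioctl(fd, IPDIOC_LIST, &ipil) == -1) {
        perror("IPDIOC_LIST");
        return (1);
    }

    /* The driver reports how many zones are enabled; it copied out at most hint. */
    n = (ipil.ipil_nzones < hint) ? ipil.ipil_nzones : hint;
    for (i = 0; i < n; i++) {
        printf("zone %d: corrupt=%d%% drop=%d%% delay=%uus\n",
            (int)info[i].ipii_zoneid, info[i].ipii_corrupt,
            info[i].ipii_drop, info[i].ipii_delay);
    }

    free(info);
    (void) close(fd);
    return (0);
}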
+ */ +static int +ipd_ioctl_list(intptr_t arg, cred_t *cr) +{ + zoneid_t zid; + ipd_ioc_info_t *configs; + ipd_netstack_t *ins; + uint_t azones, rzones, nzones, cur; + int rval = 0; + STRUCT_DECL(ipd_ioc_list, h); + + STRUCT_INIT(h, get_udatamodel()); + if (ddi_copyin((void *)arg, STRUCT_BUF(h), + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + zid = crgetzoneid(cr); + + rzones = STRUCT_FGET(h, ipil_nzones); + if (rzones == 0) { + if (zid == GLOBAL_ZONEID) { + mutex_enter(&ipd_nactive_lock); + rzones = ipd_nactive + ipd_nactive_fudge; + mutex_exit(&ipd_nactive_lock); + } else { + rzones = 1; + } + STRUCT_FSET(h, ipil_nzones, rzones); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (0); + } + + mutex_enter(&ipd_nsl_lock); + if (zid == GLOBAL_ZONEID) { + azones = ipd_nactive; + } else { + azones = 1; + } + + configs = kmem_alloc(sizeof (ipd_ioc_info_t) * azones, KM_SLEEP); + cur = 0; + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_enabled == 0) + continue; + + ASSERT(cur < azones); + + if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) { + configs[cur].ipii_zoneid = ins->ipdn_zoneid; + + mutex_enter(&ins->ipdn_lock); + configs[cur].ipii_corrupt = ins->ipdn_corrupt; + configs[cur].ipii_delay = ins->ipdn_delay; + configs[cur].ipii_drop = ins->ipdn_drop; + mutex_exit(&ins->ipdn_lock); + + ++cur; + } + + if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid) + break; + } + mutex_exit(&ipd_nsl_lock); + + ASSERT(zid != GLOBAL_ZONEID || cur == azones); + + if (cur == 0) + STRUCT_FSET(h, ipil_nzones, 0); + else + STRUCT_FSET(h, ipil_nzones, cur); + + nzones = MIN(cur, rzones); + if (nzones > 0) { + if (ddi_copyout(configs, STRUCT_FGETP(h, ipil_info), + nzones * sizeof (ipd_ioc_info_t), NULL) != 0) + rval = EFAULT; + } + + kmem_free(configs, sizeof (ipd_ioc_info_t) * azones); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (rval); +} + +static void * +ipd_nin_create(const netid_t id) +{ + ipd_netstack_t *ins; + ipd_nskstat_t *ink; + + ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP); + ins->ipdn_status = IPDN_STATUS_DISABLED; + ins->ipdn_netid = id; + ins->ipdn_zoneid = netstackid_to_zoneid(id); + ins->ipdn_rand = gethrtime(); + mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL); + + ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid, + "ipd", "net", KSTAT_TYPE_NAMED, + sizeof (ipd_nskstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ins->ipdn_kstat != NULL) { + if (ins->ipdn_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID); + + ink = &ins->ipdn_ksdata; + ins->ipdn_kstat->ks_data = ink; + kstat_named_init(&ink->ink_ncorrupts, "corrupts", + KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndelays, "delays", + KSTAT_DATA_UINT64); + kstat_install(ins->ipdn_kstat); + } + + mutex_enter(&ipd_nsl_lock); + list_insert_tail(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + return (ins); +} + +static void +ipd_nin_shutdown(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + VERIFY(id == ins->ipdn_netid); + mutex_enter(&ins->ipdn_lock); + ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED || + ins->ipdn_status == IPDN_STATUS_ENABLED); + ins->ipdn_status |= IPDN_STATUS_CONDEMNED; + if (ins->ipdn_kstat != NULL) + net_kstat_delete(id, ins->ipdn_kstat); + mutex_exit(&ins->ipdn_lock); +} + +/*ARGSUSED*/ +static void 
+ipd_nin_destroy(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + /* + * At this point none of the hooks should be able to fire because the + * zone has been shutdown and we are in the process of destroying it. + * Thus it should not be possible for someone else to come in and grab + * our ipd_netstack_t for this zone. Because of that, we know that we + * are the only ones who could be running here. + */ + mutex_enter(&ipd_nsl_lock); + list_remove(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + if (ins->ipdn_hooked) + ipd_teardown_hooks(ins); + mutex_destroy(&ins->ipdn_lock); + kmem_free(ins, sizeof (ipd_netstack_t)); +} + +/*ARGSUSED*/ +static int +ipd_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + if (flag & FEXCL || flag & FNDELAY) + return (EINVAL); + + if (otype != OTYP_CHR) + return (EINVAL); + + if (!(flag & FREAD && flag & FWRITE)) + return (EINVAL); + + if (secpolicy_ip_config(credp, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/*ARGSUSED*/ +static int +ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + int rval; + ipd_ioc_perturb_t ipip; + ipd_ioc_info_t ipii; + + switch (cmd) { + case IPDIOC_CORRUPT: + case IPDIOC_DELAY: + case IPDIOC_DROP: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_perturb(&ipip, cr, cmd); + return (rval); + case IPDIOC_REMOVE: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_remove(&ipip, cr); + return (rval); + case IPDIOC_LIST: + /* + * Because the list ioctl doesn't have a fixed-size struct due + * to needing to pass around a pointer, we instead delegate the + * copyin logic to the list code. + */ + return (ipd_ioctl_list(arg, cr)); + default: + break; + } + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +ipd_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + return (0); +} + +static int +ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + minor_t instance; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ipd_devi != NULL) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + ipd_neti = net_instance_alloc(NETINFO_VERSION); + if (ipd_neti == NULL) { + ddi_remove_minor_node(dip, NULL); + return (DDI_FAILURE); + } + + /* + * Note that these global structures MUST be initialized before we call + * net_instance_register, as that will instantly cause us to drive into + * the ipd_nin_create callbacks. + */ + list_create(&ipd_nsl, sizeof (ipd_netstack_t), + offsetof(ipd_netstack_t, ipdn_link)); + mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL); + + /* Note, net_instance_alloc sets the version. 
*/ + ipd_neti->nin_name = "ipd"; + ipd_neti->nin_create = ipd_nin_create; + ipd_neti->nin_destroy = ipd_nin_destroy; + ipd_neti->nin_shutdown = ipd_nin_shutdown; + if (net_instance_register(ipd_neti) == DDI_FAILURE) { + net_instance_free(ipd_neti); + ddi_remove_minor_node(dip, NULL); + } + + ddi_report_dev(dip); + ipd_devi = dip; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = ipd_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +static int +ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ASSERT(dip == ipd_devi); + ddi_remove_minor_node(dip, NULL); + ipd_devi = NULL; + + if (ipd_neti != NULL) { + VERIFY(net_instance_unregister(ipd_neti) == 0); + net_instance_free(ipd_neti); + } + + mutex_destroy(&ipd_nsl_lock); + mutex_destroy(&ipd_nactive_lock); + list_destroy(&ipd_nsl); + + return (DDI_SUCCESS); +} + +static struct cb_ops ipd_cb_ops = { + ipd_open, /* open */ + ipd_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + ipd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP, /* Driver compatibility flag */ + CB_REV, /* rev */ + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops ipd_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ipd_getinfo, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + ipd_attach, /* attach */ + ipd_detach, /* detach */ + nodev, /* reset */ + &ipd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + "Internet packet disturber", + &ipd_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + { (void *)&modldrv, NULL } +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/inet/ipd/ipd.conf b/usr/src/uts/common/inet/ipd/ipd.conf new file mode 100644 index 0000000000..83b9b685f4 --- /dev/null +++ b/usr/src/uts/common/inet/ipd/ipd.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2012 Joyent, Inc. 
All rights reserved. +# Use is subject to license terms. +# + +name="ipd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index 98cda0b7cc..75bac21ae4 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -141,11 +141,13 @@ ipf_stack_t *ifs; #define UNDO_HOOK(_f, _b, _e, _h) \ do { \ + int tmp; \ if (ifs->_f != NULL) { \ if (ifs->_b) { \ - ifs->_b = (net_hook_unregister(ifs->_f, \ - _e, ifs->_h) != 0); \ - if (!ifs->_b) { \ + tmp = net_hook_unregister(ifs->_f, \ + _e, ifs->_h); \ + ifs->_b = (tmp != 0 && tmp != ENXIO); \ + if (!ifs->_b && ifs->_h != NULL) { \ hook_free(ifs->_h); \ ifs->_h = NULL; \ } \ diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 6d0bf70b2a..2e08dc359b 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,6 +23,10 @@ */ /* + * Copyright 2012 Joyent, Inc. All rights reserved. + */ + +/* * Squeues: General purpose serialization mechanism * ------------------------------------------------ * @@ -120,6 +124,8 @@ #include <sys/sdt.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #include <inet/ipclassifier.h> #include <inet/udp_impl.h> @@ -142,6 +148,9 @@ int squeue_workerwait_ms = 0; static int squeue_drain_ns = 0; static int squeue_workerwait_tick = 0; +uintptr_t squeue_drain_stack_needed = 10240; +uint_t squeue_drain_stack_toodeep; + #define MAX_BYTES_TO_PICKUP 150000 #define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \ @@ -546,6 +555,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT(sqp->sq_first != NULL); now = gethrtime(); + sqp->sq_run = curthread; sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns); /* @@ -711,6 +721,20 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) boolean_t sq_poll_capable; ip_recv_attr_t *ira, iras; + /* + * Before doing any work, check our stack depth; if we're not a + * worker thread for this squeue and we're beginning to get tight on + * on stack, kick the worker, bump a counter and return. 
+ */ + if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() - + (uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) { + ASSERT(mutex_owned(&sqp->sq_lock)); + sqp->sq_awaken = ddi_get_lbolt(); + cv_signal(&sqp->sq_worker_cv); + squeue_drain_stack_toodeep++; + return; + } + sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; again: ASSERT(mutex_owned(&sqp->sq_lock)); diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 1bb87e5c56..f79427e766 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -3792,7 +3792,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) ASSERT(error == 0); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); - cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3857,7 +3858,8 @@ tcp_stack_fini(netstackid_t stackid, void *arg) freeb(tcps->tcps_ixa_cleanup_mp); tcps->tcps_ixa_cleanup_mp = NULL; - cv_destroy(&tcps->tcps_ixa_cleanup_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); mutex_destroy(&tcps->tcps_ixa_cleanup_lock); /* diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index 6e729ff461..e6b13fe6c9 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,12 +21,14 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #include <sys/types.h> #include <sys/tihdr.h> #include <sys/policy.h> #include <sys/tsol/tnet.h> +#include <sys/kstat.h> #include <inet/common.h> #include <inet/ip.h> @@ -505,7 +507,7 @@ tcp_kstat_init(netstackid_t stackid) { "connTableSize6", KSTAT_DATA_INT32, 0 } }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, TCP_MOD_NAME, "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); if (ksp == NULL) @@ -518,6 +520,13 @@ tcp_kstat_init(netstackid_t stackid) ksp->ks_update = tcp_kstat_update; ksp->ks_private = (void *)(uintptr_t)stackid; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. + */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } @@ -733,7 +742,7 @@ tcp_kstat2_init(netstackid_t stackid) #endif }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, "tcpstat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 0, stackid); @@ -744,6 +753,13 @@ tcp_kstat2_init(netstackid_t stackid) ksp->ks_private = (void *)(uintptr_t)stackid; ksp->ks_update = tcp_kstat2_update; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. 
+ */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c index 6f896fa740..adde51e745 100644 --- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* @@ -111,6 +111,21 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) return (B_TRUE); } +/* Constants used for fast checking of a localhost address */ +#if defined(_BIG_ENDIAN) +#define IPv4_LOCALHOST 0x7f000000U +#define IPv4_LH_MASK 0xffffff00U +#else +#define IPv4_LOCALHOST 0x0000007fU +#define IPv4_LH_MASK 0x00ffffffU +#endif + +#define IS_LOCAL_HOST(x) ( \ + ((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \ + ((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \ + ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \ + IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6))) + /* * Add a connection to the list of detached TIME_WAIT connections * and set its time to expire. @@ -122,6 +137,7 @@ tcp_time_wait_append(tcp_t *tcp) squeue_t *sqp = tcp->tcp_connp->conn_sqp; tcp_squeue_priv_t *tcp_time_wait = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); + hrtime_t firetime = 0; tcp_timers_stop(tcp); @@ -138,13 +154,37 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_listener == NULL); tcp->tcp_time_wait_expire = ddi_get_lbolt64(); - /* - * Since tcp_time_wait_expire is lbolt64, it should not wrap around - * in practice. Hence it cannot be 0. Note that zero means that the - * tcp_t is not in the TIME_WAIT list. - */ - tcp->tcp_time_wait_expire += MSEC_TO_TICK( - tcps->tcps_time_wait_interval); + if (IS_LOCAL_HOST(tcp)) { + /* + * This is the fastpath for handling localhost connections. + * Since we don't have to worry about packets on the localhost + * showing up after a long network delay, we want to expire + * these quickly so the port range on the localhost doesn't + * get starved by short-running, local apps. + * + * Leave tcp_time_wait_expire at the current time. This + * essentially means the connection is expired now and it will + * clean up the next time tcp_time_wait_collector runs. We set + * firetime to use a short delay so that if we have to start a + * tcp_time_wait_collector thread below, it runs soon instead + * of after a delay of time_wait_interval. firetime being set + * to a non-0 value is also our indicator that we should add + * this connection to the head of the time wait list (since we + * are already expired) so that its sure to get cleaned up on + * the next run of tcp_time_wait_collector (which expects the + * entries to appear in time-order and stops when it hits the + * first non-expired entry). + */ + firetime = TCP_TIME_WAIT_DELAY; + } else { + /* + * Since tcp_time_wait_expire is lbolt64, it should not wrap + * around in practice. Hence it cannot be 0. Note that zero + * means that the tcp_t is not in the TIME_WAIT list. + */ + tcp->tcp_time_wait_expire += MSEC_TO_TICK( + tcps->tcps_time_wait_interval); + } ASSERT(TCP_IS_DETACHED(tcp)); ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); @@ -164,13 +204,17 @@ tcp_time_wait_append(tcp_t *tcp) * a timer is needed. 
*/ if (tcp_time_wait->tcp_time_wait_tid == 0) { + if (firetime == 0) + firetime = (hrtime_t) + (tcps->tcps_time_wait_interval + 1) * + MICROSEC; + tcp_time_wait->tcp_time_wait_tid = timeout_generic(CALLOUT_NORMAL, - tcp_time_wait_collector, sqp, - (hrtime_t)(tcps->tcps_time_wait_interval + 1) * - MICROSEC, CALLOUT_TCP_RESOLUTION, - CALLOUT_FLAG_ROUNDUP); + tcp_time_wait_collector, sqp, firetime, + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); } + tcp_time_wait->tcp_time_wait_tail = tcp; } else { /* * The list is not empty, so a timer must be running. If not, @@ -182,11 +226,23 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; - tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; + if (firetime == 0) { + /* add at end */ + tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = + tcp; + tcp->tcp_time_wait_prev = + tcp_time_wait->tcp_time_wait_tail; + tcp_time_wait->tcp_time_wait_tail = tcp; + } else { + /* add at head */ + tcp->tcp_time_wait_next = + tcp_time_wait->tcp_time_wait_head; + tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = + tcp; + tcp_time_wait->tcp_time_wait_head = tcp; + } } - tcp_time_wait->tcp_time_wait_tail = tcp; mutex_exit(&tcp_time_wait->tcp_time_wait_lock); } @@ -416,6 +472,10 @@ tcp_time_wait_collector(void *arg) tcp_time_wait->tcp_time_wait_tid == 0) { hrtime_t firetime; + /* shouldn't be necessary, but just in case */ + if (tcp->tcp_time_wait_expire < now) + tcp->tcp_time_wait_expire = now; + firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now); /* This ensures that we won't wake up too often. */ firetime = MAX(TCP_TIME_WAIT_DELAY, firetime); diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 2dccf6b78c..e46ebe08da 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -101,7 +101,8 @@ struct tcp_stack { /* Used to synchronize access when reclaiming memory */ mblk_t *tcps_ixa_cleanup_mp; kmutex_t tcps_ixa_cleanup_lock; - kcondvar_t tcps_ixa_cleanup_cv; + kcondvar_t tcps_ixa_cleanup_ready_cv; + kcondvar_t tcps_ixa_cleanup_done_cv; /* Variables for handling kmem reclaim call back. */ kmutex_t tcps_reclaim_lock; diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..a39110255a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. 
*/ /* @@ -528,8 +529,13 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) if (on) { mac_rx_clear(port->lp_mch); + /* We use the promisc callback because without hardware + * rings, we deliver through flows that will cause duplicate + * delivery of packets when we've flipped into this mode + * to compensate for the lack of hardware MAC matching + */ rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, + aggr_recv_promisc_cb, port, &port->lp_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); if (rc != 0) { mac_rx_set(port->lp_mch, aggr_recv_cb, port); diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..0dfe234b70 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ /* @@ -68,16 +69,27 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. */ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback, boolean_t promisc_path) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; + /* In the case where lp_promisc_on has been turned on to + * compensate for insufficient hardware MAC matching and + * hardware rings are not in use we will fall back to + * using flows for delivery which can result in duplicates + * pushed up the stack. Only respect the chosen path. + */ + if (port->lp_promisc_on != promisc_path) { + freemsgchain(mp); + return; + } + if (grp->lg_lacp_mode == AGGR_LACP_OFF) { aggr_mac_rx(grp->lg_mh, mrh, mp); } else { @@ -161,3 +173,19 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +/* ARGSUSED */ +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE); +} + +/* ARGSUSED */ +void +aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE); +} diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c index f687ce4892..a459f867f3 100644 --- a/usr/src/uts/common/io/bge/bge_chip2.c +++ b/usr/src/uts/common/io/bge/bge_chip2.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011, 2012 Nexenta Systems, Inc. All rights reserved. 
*/ #include "bge_impl.h" @@ -363,7 +363,34 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(handle, PCI_CONF_BGE_MHCR, 0); mhcr = pci_config_get32(handle, PCI_CONF_BGE_MHCR); - cidp->asic_rev = mhcr & MHCR_CHIP_REV_MASK; + cidp->asic_rev = (mhcr & MHCR_CHIP_REV_MASK) >> MHCR_CHIP_REV_SHIFT; + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_PRODID) { + uint32_t reg; + switch (cidp->device) { + case DEVICE_ID_5717: + case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: + reg = PCI_CONF_GEN2_PRODID_ASICREV; + break; + case DEVICE_ID_57781: + case DEVICE_ID_57785: + case DEVICE_ID_57761: + case DEVICE_ID_57765: + case DEVICE_ID_57791: + case DEVICE_ID_57795: + case DEVICE_ID_57762: + case DEVICE_ID_57766: + case DEVICE_ID_57782: + case DEVICE_ID_57786: + reg = PCI_CONF_GEN15_PRODID_ASICREV; + break; + default: + reg = PCI_CONF_PRODID_ASICREV; + break; + } + cidp->asic_rev = pci_config_get32(handle, reg); + } cidp->businfo = pci_config_get32(handle, PCI_CONF_BGE_PCISTATE); cidp->command = pci_config_get16(handle, PCI_CONF_COMM); @@ -386,6 +413,45 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) BGE_DEBUG(("bge_chip_cfg_init: clsize %d latency %d command 0x%x", cidp->clsize, cidp->latency, cidp->command)); + cidp->chip_type = 0; + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5717 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5720) + cidp->chip_type |= CHIP_TYPE_5717_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57765 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57766) + cidp->chip_type |= CHIP_TYPE_57765_CLASS; + + if (cidp->chip_type & CHIP_TYPE_57765_CLASS || + cidp->chip_type & CHIP_TYPE_5717_PLUS) + cidp->chip_type |= CHIP_TYPE_57765_PLUS; + + /* Intentionally exclude ASIC_REV_5906 */ + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5755 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5787 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5784 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5761 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5785 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_57780 || + cidp->chip_type & CHIP_TYPE_57765_PLUS) + cidp->chip_type |= CHIP_TYPE_5755_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5780 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5714) + cidp->chip_type |= CHIP_TYPE_5780_CLASS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5750 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5752 || + MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5906 || + cidp->chip_type & CHIP_TYPE_5755_PLUS || + cidp->chip_type & CHIP_TYPE_5780_CLASS) + cidp->chip_type |= CHIP_TYPE_5750_PLUS; + + if (MHCR_CHIP_ASIC_REV(cidp->asic_rev) == MHCR_CHIP_ASIC_REV_5705 || + cidp->chip_type & CHIP_TYPE_5750_PLUS) + cidp->chip_type |= CHIP_TYPE_5705_PLUS; + /* * Step 2 (also step 6): disable and clear interrupts. * Steps 11-13: configure PIO endianness options, and enable @@ -445,8 +511,9 @@ bge_chip_cfg_init(bge_t *bgep, chip_id_t *cidp, boolean_t enable_dma) * see whether the host is truly up to date, and regenerate * its interrupt if not. 
*/ - mhcr = MHCR_ENABLE_INDIRECT_ACCESS | + mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_ENABLE_PCI_STATE_WRITE | MHCR_MASK_INTERRUPT_MODE | MHCR_CLEAR_INTERRUPT_INTA; @@ -1896,10 +1963,16 @@ bge_nvmem_id(bge_t *bgep) case DEVICE_ID_5705_2: case DEVICE_ID_5717: case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: case DEVICE_ID_5724: + case DEVICE_ID_57760: case DEVICE_ID_57780: + case DEVICE_ID_57788: + case DEVICE_ID_57790: case DEVICE_ID_5780: case DEVICE_ID_5782: + case DEVICE_ID_5784M: case DEVICE_ID_5785: case DEVICE_ID_5787: case DEVICE_ID_5787M: @@ -1918,6 +1991,8 @@ bge_nvmem_id(bge_t *bgep) case DEVICE_ID_5723: case DEVICE_ID_5761: case DEVICE_ID_5761E: + case DEVICE_ID_5761S: + case DEVICE_ID_5761SE: case DEVICE_ID_5764: case DEVICE_ID_5714C: case DEVICE_ID_5714S: @@ -2023,14 +2098,35 @@ bge_chip_id_init(bge_t *bgep) cidp->msi_enabled = B_FALSE; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) > + MHCR_CHIP_ASIC_REV_PRODID || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5906 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5700 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5701 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5750) + /* + * Just a plain reset; the "check" code breaks these chips + */ + cidp->flags |= CHIP_FLAG_NO_CHECK_RESET; + switch (cidp->device) { case DEVICE_ID_5717: case DEVICE_ID_5718: + case DEVICE_ID_5719: + case DEVICE_ID_5720: case DEVICE_ID_5724: if (cidp->device == DEVICE_ID_5717) cidp->chip_label = 5717; else if (cidp->device == DEVICE_ID_5718) cidp->chip_label = 5718; + else if (cidp->device == DEVICE_ID_5719) + cidp->chip_label = 5719; + else if (cidp->device == DEVICE_ID_5720) + cidp->chip_label = 5720; else cidp->chip_label = 5724; cidp->msi_enabled = bge_enable_msi; @@ -2044,7 +2140,7 @@ bge_chip_id_init(bge_t *bgep) cidp->mbuf_hi_water = MBUF_HIWAT_5717; cidp->mbuf_base = bge_mbuf_pool_base_5705; cidp->mbuf_length = bge_mbuf_pool_len_5705; - cidp->recv_slots = BGE_RECV_SLOTS_5705; + cidp->recv_slots = BGE_RECV_SLOTS_5717; cidp->bge_mlcr_default = MLCR_DEFAULT_5717; cidp->rx_rings = BGE_RECV_RINGS_MAX_5705; cidp->tx_rings = BGE_SEND_RINGS_MAX_5705; @@ -2220,7 +2316,13 @@ bge_chip_id_init(bge_t *bgep) case DEVICE_ID_5723: case DEVICE_ID_5761: case DEVICE_ID_5761E: + case DEVICE_ID_5761S: + case DEVICE_ID_5761SE: + case DEVICE_ID_5784M: + case DEVICE_ID_57760: case DEVICE_ID_57780: + case DEVICE_ID_57788: + case DEVICE_ID_57790: cidp->msi_enabled = bge_enable_msi; /* * We don't use MSI for BCM5764 and BCM5785, as the @@ -2234,10 +2336,18 @@ bge_chip_id_init(bge_t *bgep) cidp->chip_label = 5723; else if (cidp->device == DEVICE_ID_5764) cidp->chip_label = 5764; + else if (cidp->device == DEVICE_ID_5784M) + cidp->chip_label = 5784; else if (cidp->device == DEVICE_ID_5785) cidp->chip_label = 5785; + else if (cidp->device == DEVICE_ID_57760) + cidp->chip_label = 57760; else if (cidp->device == DEVICE_ID_57780) cidp->chip_label = 57780; + else if (cidp->device == DEVICE_ID_57788) + cidp->chip_label = 57788; + else if (cidp->device == DEVICE_ID_57790) + cidp->chip_label = 57790; else cidp->chip_label = 5761; cidp->bge_dma_rwctrl = bge_dma_rwctrl_5721; @@ -3401,18 +3511,27 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | MHCR_MASK_INTERRUPT_MODE | - MHCR_MASK_PCI_INT_OUTPUT | MHCR_CLEAR_INTERRUPT_INTA | MHCR_ENABLE_ENDIAN_WORD_SWAP | 
MHCR_ENABLE_ENDIAN_BYTE_SWAP; + + if (bgep->intr_type == DDI_INTR_TYPE_FIXED) + mhcr |= MHCR_MASK_PCI_INT_OUTPUT; + if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, 0); +#else + mhcr = MHCR_ENABLE_INDIRECT_ACCESS | + MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_MASK_INTERRUPT_MODE | + MHCR_MASK_PCI_INT_OUTPUT | + MHCR_CLEAR_INTERRUPT_INTA; +#endif pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, mhcr); bge_reg_put32(bgep, MEMORY_ARBITER_MODE_REG, bge_reg_get32(bgep, MEMORY_ARBITER_MODE_REG) | MEMORY_ARBITER_ENABLE); -#endif if (asf_mode == ASF_MODE_INIT) { bge_asf_pre_reset_operations(bgep, BGE_INIT_RESET); } else if (asf_mode == ASF_MODE_SHUTDOWN) { @@ -3436,9 +3555,13 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) mhcr = MHCR_ENABLE_INDIRECT_ACCESS | MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_ENABLE_PCI_STATE_WRITE | MHCR_MASK_INTERRUPT_MODE | - MHCR_MASK_PCI_INT_OUTPUT | MHCR_CLEAR_INTERRUPT_INTA; + + if (bgep->intr_type == DDI_INTR_TYPE_FIXED) + mhcr |= MHCR_MASK_PCI_INT_OUTPUT; + #ifdef _BIG_ENDIAN mhcr |= MHCR_ENABLE_ENDIAN_WORD_SWAP | MHCR_ENABLE_ENDIAN_BYTE_SWAP; #endif /* _BIG_ENDIAN */ @@ -3449,6 +3572,12 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) if (bgep->asf_enabled) bgep->asf_wordswapped = B_FALSE; #endif + + if (DEVICE_IS_5755_PLUS(bgep) || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5752) + bge_reg_put32(bgep, GRC_FASTBOOT_PC, 0); + /* * NVRAM Corruption Workaround */ @@ -3508,6 +3637,11 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) #else modeflags = MODE_WORD_SWAP_FRAME | MODE_BYTE_SWAP_FRAME; #endif /* _BIG_ENDIAN */ + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) + modeflags |= + MODE_BYTE_SWAP_B2HRX_DATA | MODE_WORD_SWAP_B2HRX_DATA | + MODE_B2HRX_ENABLE | MODE_HTX2B_ENABLE; #ifdef BGE_IPMI_ASF if (bgep->asf_enabled) modeflags |= MODE_HOST_STACK_UP; @@ -3592,6 +3726,13 @@ bge_chip_reset(bge_t *bgep, boolean_t enable_dma) */ bge_reg_put32(bgep, ETHERNET_MAC_MODE_REG, 0); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + uint32_t regval = bge_reg_get32(bgep, CPMU_CLCK_ORIDE_REG); + bge_reg_put32(bgep, CPMU_CLCK_ORIDE_REG, + regval & ~CPMU_CLCK_ORIDE_MAC_ORIDE_EN); + } + /* * Step 21: restore cache-line-size, latency timer, and * subsystem ID registers to their original values (not @@ -3818,8 +3959,17 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) /* * Steps 34-36: enable buffer manager & internal h/w queues */ - if (!bge_chip_enable_engine(bgep, BUFFER_MANAGER_MODE_REG, - STATE_MACHINE_ATTN_ENABLE_BIT)) + + regval = STATE_MACHINE_ATTN_ENABLE_BIT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719) + regval |= BUFF_MGR_NO_TX_UNDERRUN; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5717 || + bgep->chipid.asic_rev == MHCR_CHIP_REV_5719_A0 || + bgep->chipid.asic_rev == MHCR_CHIP_REV_5720_A0) + regval |= BUFF_MGR_MBUF_LOW_ATTN_ENABLE; + if (!bge_chip_enable_engine(bgep, BUFFER_MANAGER_MODE_REG, regval)) retval = DDI_FAILURE; if (!bge_chip_enable_engine(bgep, FTQ_RESET_REG, 0)) retval = DDI_FAILURE; @@ -3913,7 +4063,13 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) /* * Step 50: configure the IPG et al */ - bge_reg_put32(bgep, MAC_TX_LENGTHS_REG, MAC_TX_LENGTHS_DEFAULT); + regval = MAC_TX_LENGTHS_DEFAULT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) + == MHCR_CHIP_ASIC_REV_5720) + regval |= bge_reg_get32(bgep, MAC_TX_LENGTHS_REG) & + (MAC_TX_LENGTHS_JMB_FRM_LEN_MSK | 
+ MAC_TX_LENGTHS_CNT_DWN_VAL_MSK); + bge_reg_put32(bgep, MAC_TX_LENGTHS_REG, regval); /* * Step 51: configure the default Rx Return Ring @@ -4068,22 +4224,45 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) retval = DDI_FAILURE; dma_wrprio = (bge_dma_wrprio << DMA_PRIORITY_SHIFT) | ALL_DMA_ATTN_BITS; - if ((MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5755) || - (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5723) || - (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == - MHCR_CHIP_ASIC_REV_5906)) { + if (DEVICE_IS_5755_PLUS(bgep)) dma_wrprio |= DMA_STATUS_TAG_FIX_CQ12384; - } if (!bge_chip_enable_engine(bgep, WRITE_DMA_MODE_REG, dma_wrprio)) retval = DDI_FAILURE; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5761 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5784 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5785 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_57780 || + DEVICE_IS_57765_PLUS(bgep)) { + regval = bge_reg_get32(bgep, READ_DMA_RESERVED_CONTROL_REG); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + regval &= ~(RDMA_RSRVCTRL_TXMRGN_MASK | + RDMA_RSRVCTRL_FIFO_LWM_MASK | + RDMA_RSRVCTRL_FIFO_HWM_MASK); + regval |= RDMA_RSRVCTRL_TXMRGN_320B | + RDMA_RSRVCTRL_FIFO_LWM_1_5K | + RDMA_RSRVCTRL_FIFO_HWM_1_5K; + } + bge_reg_put32(bgep, READ_DMA_RESERVED_CONTROL_REG, + regval | RDMA_RSRVCTRL_FIFO_OFLW_FIX); + } if (DEVICE_5723_SERIES_CHIPSETS(bgep) || DEVICE_5717_SERIES_CHIPSETS(bgep)) bge_dma_rdprio = 0; + regval = bge_dma_rdprio << DMA_PRIORITY_SHIFT; + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) + regval |= bge_reg_get32(bgep, READ_DMA_MODE_REG) & + DMA_H2BNC_VLAN_DET; if (!bge_chip_enable_engine(bgep, READ_DMA_MODE_REG, - (bge_dma_rdprio << DMA_PRIORITY_SHIFT) | ALL_DMA_ATTN_BITS)) + regval | ALL_DMA_ATTN_BITS)) retval = DDI_FAILURE; if (!bge_chip_enable_engine(bgep, RCV_DATA_COMPLETION_MODE_REG, STATE_MACHINE_ATTN_ENABLE_BIT)) @@ -4116,7 +4295,23 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) * Step 88: download firmware -- doesn't apply * Steps 89-90: enable Transmit & Receive MAC Engines */ - if (!bge_chip_enable_engine(bgep, TRANSMIT_MAC_MODE_REG, 0)) + if (DEVICE_IS_5755_PLUS(bgep) || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5906) { + regval = bge_reg_get32(bgep, TRANSMIT_MAC_MODE_REG); + regval |= TRANSMIT_MODE_MBUF_LOCKUP_FIX; + } else { + regval = 0; + } + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + regval &= ~(TRANSMIT_MODE_HTX2B_JMB_FRM_LEN | + TRANSMIT_MODE_HTX2B_CNT_DN_MODE); + regval |= bge_reg_get32(bgep, TRANSMIT_MAC_MODE_REG) & + (TRANSMIT_MODE_HTX2B_JMB_FRM_LEN | + TRANSMIT_MODE_HTX2B_CNT_DN_MODE); + } + if (!bge_chip_enable_engine(bgep, TRANSMIT_MAC_MODE_REG, regval)) retval = DDI_FAILURE; #ifdef BGE_IPMI_ASF if (!bgep->asf_enabled) { @@ -4219,7 +4414,6 @@ bge_chip_start(bge_t *bgep, boolean_t reset_phys) if (bgep->intr_type == DDI_INTR_TYPE_FIXED) bge_cfg_clr32(bgep, PCI_CONF_BGE_MHCR, bgep->chipid.mask_pci_int); - /* * All done! */ diff --git a/usr/src/uts/common/io/bge/bge_hw.h b/usr/src/uts/common/io/bge/bge_hw.h index f8e6c4d09a..cfcae929dd 100644 --- a/usr/src/uts/common/io/bge/bge_hw.h +++ b/usr/src/uts/common/io/bge/bge_hw.h @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. 
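/*
 * A minimal illustrative sketch (not part of the patch): on the BCM5720
 * several registers programmed during bge_chip_start() -- MAC_TX_LENGTHS_REG,
 * READ_DMA_MODE_REG and TRANSMIT_MAC_MODE_REG -- carry fields owned by the
 * firmware/hardware, so the change folds the current value of those fields
 * back into whatever the driver writes.  The helper below is a hypothetical
 * distillation of that read-modify-write pattern.
 */
static void
bge_put32_preserve_sketch(bge_t *bgep, bge_regno_t regno, uint32_t value,
    uint32_t preserve_mask)
{
	uint32_t old = bge_reg_get32(bgep, regno);

	/* keep the hardware-owned bits, overwrite everything else */
	bge_reg_put32(bgep, regno,
	    (old & preserve_mask) | (value & ~preserve_mask));
}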
All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #ifndef _BGE_HW_H #define _BGE_HW_H @@ -68,9 +72,12 @@ extern "C" { #define DEVICE_ID_5724 0x165c #define DEVICE_ID_5705M 0x165d #define DEVICE_ID_5705MA3 0x165e +#define DEVICE_ID_5719 0x1657 +#define DEVICE_ID_5720 0x165f #define DEVICE_ID_5705F 0x166e #define DEVICE_ID_5780 0x166a #define DEVICE_ID_5782 0x1696 +#define DEVICE_ID_5784M 0x1698 #define DEVICE_ID_5785 0x1699 #define DEVICE_ID_5787 0x169b #define DEVICE_ID_5787M 0x1693 @@ -92,12 +99,27 @@ extern "C" { #define DEVICE_ID_5714S 0x1669 #define DEVICE_ID_5715C 0x1678 #define DEVICE_ID_5715S 0x1679 -#define DEVICE_ID_5761E 0x1680 #define DEVICE_ID_5761 0x1681 +#define DEVICE_ID_5761E 0x1680 +#define DEVICE_ID_5761S 0x1688 +#define DEVICE_ID_5761SE 0x1689 #define DEVICE_ID_5764 0x1684 #define DEVICE_ID_5906 0x1712 #define DEVICE_ID_5906M 0x1713 +#define DEVICE_ID_57760 0x1690 #define DEVICE_ID_57780 0x1692 +#define DEVICE_ID_57788 0x1691 +#define DEVICE_ID_57790 0x1694 +#define DEVICE_ID_57781 0x16b1 +#define DEVICE_ID_57785 0x16b5 +#define DEVICE_ID_57761 0x16b0 +#define DEVICE_ID_57765 0x16b4 +#define DEVICE_ID_57791 0x16b2 +#define DEVICE_ID_57795 0x16b6 +#define DEVICE_ID_57762 0x1682 +#define DEVICE_ID_57766 0x1686 +#define DEVICE_ID_57786 0x16b3 +#define DEVICE_ID_57782 0x16b7 #define REVISION_ID_5700_B0 0x10 #define REVISION_ID_5700_B2 0x12 @@ -189,15 +211,23 @@ extern "C" { #define DEVICE_5717_SERIES_CHIPSETS(bgep) \ (bgep->chipid.device == DEVICE_ID_5717) ||\ (bgep->chipid.device == DEVICE_ID_5718) ||\ + (bgep->chipid.device == DEVICE_ID_5719) ||\ + (bgep->chipid.device == DEVICE_ID_5720) ||\ (bgep->chipid.device == DEVICE_ID_5724) #define DEVICE_5723_SERIES_CHIPSETS(bgep) \ ((bgep->chipid.device == DEVICE_ID_5723) ||\ (bgep->chipid.device == DEVICE_ID_5761) ||\ (bgep->chipid.device == DEVICE_ID_5761E) ||\ + (bgep->chipid.device == DEVICE_ID_5761S) ||\ + (bgep->chipid.device == DEVICE_ID_5761SE) ||\ (bgep->chipid.device == DEVICE_ID_5764) ||\ + (bgep->chipid.device == DEVICE_ID_5784M) ||\ (bgep->chipid.device == DEVICE_ID_5785) ||\ - (bgep->chipid.device == DEVICE_ID_57780)) + (bgep->chipid.device == DEVICE_ID_57760) ||\ + (bgep->chipid.device == DEVICE_ID_57780) ||\ + (bgep->chipid.device == DEVICE_ID_57788) ||\ + (bgep->chipid.device == DEVICE_ID_57790)) #define DEVICE_5714_SERIES_CHIPSETS(bgep) \ ((bgep->chipid.device == DEVICE_ID_5714C) ||\ @@ -209,6 +239,20 @@ extern "C" { ((bgep->chipid.device == DEVICE_ID_5906) ||\ (bgep->chipid.device == DEVICE_ID_5906M)) + +#define CHIP_TYPE_5705_PLUS (1 << 0) +#define CHIP_TYPE_5750_PLUS (1 << 1) +#define CHIP_TYPE_5780_CLASS (1 << 2) +#define CHIP_TYPE_5755_PLUS (1 << 3) +#define CHIP_TYPE_57765_CLASS (1 << 4) +#define CHIP_TYPE_57765_PLUS (1 << 5) +#define CHIP_TYPE_5717_PLUS (1 << 6) + +#define DEVICE_IS_57765_PLUS(bgep) \ + (bgep->chipid.chip_type & CHIP_TYPE_57765_PLUS) +#define DEVICE_IS_5755_PLUS(bgep) \ + (bgep->chipid.chip_type & CHIP_TYPE_5755_PLUS) + /* * Second section: * Offsets of important registers & definitions for bits therein @@ -225,6 +269,7 @@ extern "C" { */ #define PCI_CONF_BGE_MHCR 0x68 #define MHCR_CHIP_REV_MASK 0xffff0000 +#define MHCR_CHIP_REV_SHIFT 16 #define MHCR_ENABLE_TAGGED_STATUS_MODE 0x00000200 #define MHCR_MASK_INTERRUPT_MODE 0x00000100 #define MHCR_ENABLE_INDIRECT_ACCESS 0x00000080 @@ -236,95 +281,38 @@ extern "C" { #define MHCR_MASK_PCI_INT_OUTPUT 0x00000002 #define MHCR_CLEAR_INTERRUPT_INTA 0x00000001 -#define MHCR_CHIP_REV_5700_B0 0x71000000 
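/*
 * A minimal illustrative sketch (not part of the patch): the new CHIP_TYPE_*
 * bits form a cumulative family hierarchy, so one mask test covers every
 * newer generation:
 *
 *   5717_PLUS ---+
 *   57765_CLASS -+-> 57765_PLUS -> 5755_PLUS -+
 *   5780_CLASS --------------------------------+-> 5750_PLUS -> 5705_PLUS
 *
 * Per-device checks then reduce to a single macro, for example:
 */
static boolean_t
bge_needs_dma_tag_fix_sketch(bge_t *bgep)
{
	/* every 5755-or-newer device gets the status-tag DMA fix */
	return (DEVICE_IS_5755_PLUS(bgep) ? B_TRUE : B_FALSE);
}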
-#define MHCR_CHIP_REV_5700_B2 0x71020000 -#define MHCR_CHIP_REV_5700_B3 0x71030000 -#define MHCR_CHIP_REV_5700_C0 0x72000000 -#define MHCR_CHIP_REV_5700_C1 0x72010000 -#define MHCR_CHIP_REV_5700_C2 0x72020000 - -#define MHCR_CHIP_REV_5701_A0 0x00000000 -#define MHCR_CHIP_REV_5701_A2 0x00020000 -#define MHCR_CHIP_REV_5701_A3 0x00030000 -#define MHCR_CHIP_REV_5701_A5 0x01050000 - -#define MHCR_CHIP_REV_5702_A0 0x10000000 -#define MHCR_CHIP_REV_5702_A1 0x10010000 -#define MHCR_CHIP_REV_5702_A2 0x10020000 - -#define MHCR_CHIP_REV_5703_A0 0x10000000 -#define MHCR_CHIP_REV_5703_A1 0x10010000 -#define MHCR_CHIP_REV_5703_A2 0x10020000 -#define MHCR_CHIP_REV_5703_B0 0x11000000 -#define MHCR_CHIP_REV_5703_B1 0x11010000 - -#define MHCR_CHIP_REV_5704_A0 0x20000000 -#define MHCR_CHIP_REV_5704_A1 0x20010000 -#define MHCR_CHIP_REV_5704_A2 0x20020000 -#define MHCR_CHIP_REV_5704_A3 0x20030000 -#define MHCR_CHIP_REV_5704_B0 0x21000000 - -#define MHCR_CHIP_REV_5705_A0 0x30000000 -#define MHCR_CHIP_REV_5705_A1 0x30010000 -#define MHCR_CHIP_REV_5705_A2 0x30020000 -#define MHCR_CHIP_REV_5705_A3 0x30030000 -#define MHCR_CHIP_REV_5705_A5 0x30050000 - -#define MHCR_CHIP_REV_5782_A0 0x30030000 -#define MHCR_CHIP_REV_5782_A1 0x30030088 - -#define MHCR_CHIP_REV_5788_A1 0x30050000 - -#define MHCR_CHIP_REV_5751_A0 0x40000000 -#define MHCR_CHIP_REV_5751_A1 0x40010000 - -#define MHCR_CHIP_REV_5721_A0 0x41000000 -#define MHCR_CHIP_REV_5721_A1 0x41010000 - -#define MHCR_CHIP_REV_5714_A0 0x50000000 -#define MHCR_CHIP_REV_5714_A1 0x90010000 - -#define MHCR_CHIP_REV_5715_A0 0x50000000 -#define MHCR_CHIP_REV_5715_A1 0x90010000 - -#define MHCR_CHIP_REV_5715S_A0 0x50000000 -#define MHCR_CHIP_REV_5715S_A1 0x90010000 - -#define MHCR_CHIP_REV_5754_A0 0xb0000000 -#define MHCR_CHIP_REV_5754_A1 0xb0010000 - -#define MHCR_CHIP_REV_5787_A0 0xb0000000 -#define MHCR_CHIP_REV_5787_A1 0xb0010000 -#define MHCR_CHIP_REV_5787_A2 0xb0020000 - -#define MHCR_CHIP_REV_5755_A0 0xa0000000 -#define MHCR_CHIP_REV_5755_A1 0xa0010000 - -#define MHCR_CHIP_REV_5906_A0 0xc0000000 -#define MHCR_CHIP_REV_5906_A1 0xc0010000 -#define MHCR_CHIP_REV_5906_A2 0xc0020000 - -#define MHCR_CHIP_REV_5723_A0 0xf0000000 -#define MHCR_CHIP_REV_5723_A1 0xf0010000 -#define MHCR_CHIP_REV_5723_A2 0xf0020000 -#define MHCR_CHIP_REV_5723_B0 0xf1000000 - -#define MHCR_CHIP_ASIC_REV(ChipRevId) ((ChipRevId) & 0xf0000000) -#define MHCR_CHIP_ASIC_REV_5700 (0x7 << 28) -#define MHCR_CHIP_ASIC_REV_5701 (0x0 << 28) -#define MHCR_CHIP_ASIC_REV_5703 (0x1 << 28) -#define MHCR_CHIP_ASIC_REV_5704 (0x2 << 28) -#define MHCR_CHIP_ASIC_REV_5705 (0x3 << 28) -#define MHCR_CHIP_ASIC_REV_5721_5751 (0x4 << 28) -#define MHCR_CHIP_ASIC_REV_5714 (0x5 << 28) -#define MHCR_CHIP_ASIC_REV_5752 (0x6 << 28) -#define MHCR_CHIP_ASIC_REV_5754 (0xb << 28) -#define MHCR_CHIP_ASIC_REV_5787 ((uint32_t)0xb << 28) -#define MHCR_CHIP_ASIC_REV_5755 ((uint32_t)0xa << 28) -#define MHCR_CHIP_ASIC_REV_5715 ((uint32_t)0x9 << 28) -#define MHCR_CHIP_ASIC_REV_5906 ((uint32_t)0xc << 28) -#define MHCR_CHIP_ASIC_REV_5723 ((uint32_t)0xf << 28) - +#define MHCR_CHIP_REV_5703_A0 0x1000 +#define MHCR_CHIP_REV_5704_A0 0x2000 +#define MHCR_CHIP_REV_5751_A0 0x4000 +#define MHCR_CHIP_REV_5721_A0 0x4100 +#define MHCR_CHIP_REV_5755_A0 0xa000 +#define MHCR_CHIP_REV_5755_A1 0xa001 +#define MHCR_CHIP_REV_5719_A0 0x05719000 +#define MHCR_CHIP_REV_5720_A0 0x05720000 + +#define MHCR_CHIP_ASIC_REV(ChipRevId) ((ChipRevId) >> 12) +#define MHCR_CHIP_ASIC_REV_5700 0x07 +#define MHCR_CHIP_ASIC_REV_5701 0x00 +#define MHCR_CHIP_ASIC_REV_5703 0x01 +#define 
MHCR_CHIP_ASIC_REV_5704 0x02 +#define MHCR_CHIP_ASIC_REV_5705 0x03 +#define MHCR_CHIP_ASIC_REV_5750 0x04 +#define MHCR_CHIP_ASIC_REV_5752 0x06 +#define MHCR_CHIP_ASIC_REV_5780 0x08 +#define MHCR_CHIP_ASIC_REV_5714 0x09 +#define MHCR_CHIP_ASIC_REV_5755 0x0a +#define MHCR_CHIP_ASIC_REV_5787 0x0b +#define MHCR_CHIP_ASIC_REV_5906 0x0c +#define MHCR_CHIP_ASIC_REV_PRODID 0x0f +#define MHCR_CHIP_ASIC_REV_5784 0x5784 +#define MHCR_CHIP_ASIC_REV_5761 0x5761 +#define MHCR_CHIP_ASIC_REV_5785 0x5785 +#define MHCR_CHIP_ASIC_REV_5717 0x5717 +#define MHCR_CHIP_ASIC_REV_5719 0x5719 +#define MHCR_CHIP_ASIC_REV_5720 0x5720 +#define MHCR_CHIP_ASIC_REV_57780 0x57780 +#define MHCR_CHIP_ASIC_REV_57765 0x57785 +#define MHCR_CHIP_ASIC_REV_57766 0x57766 /* * PCI DMA read/write Control Register, in PCI config space @@ -466,6 +454,10 @@ extern "C" { #define PCI_CONF_DEV_STUS_5723 0xd6 #define DEVICE_ERROR_STUS 0xf +#define PCI_CONF_PRODID_ASICREV 0x000000bc +#define PCI_CONF_GEN2_PRODID_ASICREV 0x000000f4 +#define PCI_CONF_GEN15_PRODID_ASICREV 0x000000fc + #define NIC_MEM_WINDOW_OFFSET 0x00008000 /* 32k */ /* @@ -541,6 +533,7 @@ extern "C" { #define MEMORY_ARBITER_MODE_REG 0x4000 #define BUFFER_MANAGER_MODE_REG 0x4400 #define READ_DMA_MODE_REG 0x4800 +#define READ_DMA_RESERVED_CONTROL_REG 0x4900 #define WRITE_DMA_MODE_REG 0x4c00 #define DMA_COMPLETION_MODE_REG 0x6400 @@ -552,6 +545,9 @@ extern "C" { * Transmit MAC Mode Register * (TRANSMIT_MAC_MODE_REG, 0x045c) */ +#define TRANSMIT_MODE_HTX2B_CNT_DN_MODE 0x00800000 +#define TRANSMIT_MODE_HTX2B_JMB_FRM_LEN 0x00400000 +#define TRANSMIT_MODE_MBUF_LOCKUP_FIX 0x00000100 #define TRANSMIT_MODE_LONG_PAUSE 0x00000040 #define TRANSMIT_MODE_BIG_BACKOFF 0x00000020 #define TRANSMIT_MODE_FLOW_CONTROL 0x00000010 @@ -619,12 +615,14 @@ extern "C" { */ #define BUFF_MGR_TEST_MODE 0x00000008 #define BUFF_MGR_MBUF_LOW_ATTN_ENABLE 0x00000010 +#define BUFF_MGR_NO_TX_UNDERRUN 0x80000000 #define BUFF_MGR_ALL_ATTN_BITS 0x00000014 /* * Read and Write DMA Mode Registers (READ_DMA_MODE_REG, - * 0x4800 and WRITE_DMA_MODE_REG, 0x4c00) + * 0x4800, READ_DMA_RESERVED_CONTROL_REG, 0x4900, + * WRITE_DMA_MODE_REG, 0x4c00) * * These registers each contain a 2-bit priority field, which controls * the relative priority of that type of DMA (read vs. write vs. MSI), @@ -635,6 +633,15 @@ extern "C" { #define DMA_PRIORITY_SHIFT 30 #define ALL_DMA_ATTN_BITS 0x000003fc +#define RDMA_RSRVCTRL_FIFO_OFLW_FIX 0x00000004 +#define RDMA_RSRVCTRL_FIFO_LWM_1_5K 0x00000c00 +#define RDMA_RSRVCTRL_FIFO_LWM_MASK 0x00000ff0 +#define RDMA_RSRVCTRL_FIFO_HWM_1_5K 0x000c0000 +#define RDMA_RSRVCTRL_FIFO_HWM_MASK 0x000ff000 +#define RDMA_RSRVCTRL_TXMRGN_320B 0x28000000 +#define RDMA_RSRVCTRL_TXMRGN_MASK 0xffe00000 + + /* * BCM5755, 5755M, 5906, 5906M only * 1 - Enable Fix. 
Device will send out the status block before @@ -644,6 +651,10 @@ extern "C" { */ #define DMA_STATUS_TAG_FIX_CQ12384 0x20000000 +/* 5720 only */ +#define DMA_H2BNC_VLAN_DET 0x20000000 + + /* * End of state machine control register definitions */ @@ -781,6 +792,8 @@ extern "C" { #define MAC_RX_MTU_DEFAULT 0x000005f2 /* 1522 */ #define MAC_TX_LENGTHS_REG 0x0464 #define MAC_TX_LENGTHS_DEFAULT 0x00002620 +#define MAC_TX_LENGTHS_JMB_FRM_LEN_MSK 0x00ff0000 +#define MAC_TX_LENGTHS_CNT_DWN_VAL_MSK 0xff000000 /* * MII access registers @@ -1069,10 +1082,16 @@ extern "C" { #define JUMBO_RCV_BD_REPLENISH_DEFAULT 0x00000020 /* 32 */ /* - * CPMU registers (5717/5718 only) + * CPMU registers (5717/5718/5719/5720 only) */ -#define CPMU_STATUS_REG 0x362c -#define CPMU_STATUS_FUN_NUM 0x20000000 +#define CPMU_CLCK_ORIDE_REG 0x3624 +#define CPMU_CLCK_ORIDE_MAC_ORIDE_EN 0x80000000 + +#define CPMU_STATUS_REG 0x362c +#define CPMU_STATUS_FUN_NUM_5717 0x20000000 +#define CPMU_STATUS_FUN_NUM_5719 0xc0000000 +#define CPMU_STATUS_FUN_NUM_5719_SHIFT 30 + /* * Host Coalescing Engine Control Registers @@ -1191,6 +1210,8 @@ extern "C" { #define VCPU_EXT_CTL 0x6890 #define VCPU_EXT_CTL_HALF 0x00400000 +#define GRC_FASTBOOT_PC 0x6894 + #define FTQ_RESET_REG 0x5c00 #define MSI_MODE_REG 0x6000 @@ -1210,14 +1231,18 @@ extern "C" { #define MODE_INT_ON_TXRISC_ATTN 0x01000000 #define MODE_RECV_NO_PSEUDO_HDR_CSUM 0x00800000 #define MODE_SEND_NO_PSEUDO_HDR_CSUM 0x00100000 +#define MODE_HTX2B_ENABLE 0x00040000 #define MODE_HOST_SEND_BDS 0x00020000 #define MODE_HOST_STACK_UP 0x00010000 #define MODE_FORCE_32_BIT_PCI 0x00008000 +#define MODE_B2HRX_ENABLE 0x00008000 #define MODE_NO_INT_ON_RECV 0x00004000 #define MODE_NO_INT_ON_SEND 0x00002000 #define MODE_ALLOW_BAD_FRAMES 0x00000800 #define MODE_NO_CRC 0x00000400 #define MODE_NO_FRAME_CRACKING 0x00000200 +#define MODE_WORD_SWAP_B2HRX_DATA 0x00000080 +#define MODE_BYTE_SWAP_B2HRX_DATA 0x00000040 #define MODE_WORD_SWAP_FRAME 0x00000020 #define MODE_BYTE_SWAP_FRAME 0x00000010 #define MODE_WORD_SWAP_NONFRAME 0x00000004 @@ -1246,7 +1271,7 @@ extern "C" { */ #define CORE_CLOCK_MHZ 66 #define MISC_CONFIG_REG 0x6804 -#define MISC_CONFIG_GRC_RESET_DISABLE 0x20000000 +#define MISC_CONFIG_GRC_RESET_DISABLE 0x20000000 #define MISC_CONFIG_GPHY_POWERDOWN_OVERRIDE 0x04000000 #define MISC_CONFIG_POWERDOWN 0x00100000 #define MISC_CONFIG_POWER_STATE 0x00060000 @@ -1567,6 +1592,7 @@ extern "C" { #define BGE_MINI_SLOTS_MAX 1024 #define BGE_RECV_SLOTS_MAX 2048 #define BGE_RECV_SLOTS_5705 512 +#define BGE_RECV_SLOTS_5717 1024 #define BGE_RECV_SLOTS_5782 512 #define BGE_RECV_SLOTS_5721 512 diff --git a/usr/src/uts/common/io/bge/bge_impl.h b/usr/src/uts/common/io/bge/bge_impl.h index 772c989092..0c51c2bc8e 100644 --- a/usr/src/uts/common/io/bge/bge_impl.h +++ b/usr/src/uts/common/io/bge/bge_impl.h @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #ifndef _BGE_IMPL_H #define _BGE_IMPL_H @@ -605,6 +609,7 @@ typedef struct { uint8_t latency; /* latency-timer */ uint8_t flags; + uint32_t chip_type; /* see CHIP_TYPE_ in bge_hw.h */ uint16_t chip_label; /* numeric part only */ /* (e.g. 
5703/5794/etc) */ uint32_t mbuf_base; /* Mbuf pool parameters */ @@ -640,10 +645,11 @@ typedef struct { uint32_t mask_pci_int; } chip_id_t; -#define CHIP_FLAG_SUPPORTED 0x80 -#define CHIP_FLAG_SERDES 0x40 -#define CHIP_FLAG_PARTIAL_CSUM 0x20 -#define CHIP_FLAG_NO_JUMBO 0x1 +#define CHIP_FLAG_SUPPORTED 0x80 +#define CHIP_FLAG_SERDES 0x40 +#define CHIP_FLAG_PARTIAL_CSUM 0x20 +#define CHIP_FLAG_NO_CHECK_RESET 0x2 +#define CHIP_FLAG_NO_JUMBO 0x1 /* * Collection of physical-layer functions to: diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c index f191f313c0..d0f309730d 100644 --- a/usr/src/uts/common/io/bge/bge_main2.c +++ b/usr/src/uts/common/io/bge/bge_main2.c @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #include "bge_impl.h" #include <sys/sdt.h> #include <sys/mac_provider.h> @@ -3211,13 +3215,17 @@ bge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) */ if (DEVICE_5717_SERIES_CHIPSETS(bgep)) pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, 0); +#else + mhcrValue = MHCR_ENABLE_INDIRECT_ACCESS | + MHCR_ENABLE_TAGGED_STATUS_MODE | + MHCR_MASK_INTERRUPT_MODE | + MHCR_MASK_PCI_INT_OUTPUT | + MHCR_CLEAR_INTERRUPT_INTA; +#endif pci_config_put32(bgep->cfg_handle, PCI_CONF_BGE_MHCR, mhcrValue); bge_ind_put32(bgep, MEMORY_ARBITER_MODE_REG, bge_ind_get32(bgep, MEMORY_ARBITER_MODE_REG) | MEMORY_ARBITER_ENABLE); -#else - mhcrValue = pci_config_get32(bgep->cfg_handle, PCI_CONF_BGE_MHCR); -#endif if (mhcrValue & MHCR_ENABLE_ENDIAN_WORD_SWAP) { bgep->asf_wordswapped = B_TRUE; } else { diff --git a/usr/src/uts/common/io/bge/bge_mii.c b/usr/src/uts/common/io/bge/bge_mii.c index f24b6a3f16..b47c043d8c 100644 --- a/usr/src/uts/common/io/bge/bge_mii.c +++ b/usr/src/uts/common/io/bge/bge_mii.c @@ -23,6 +23,10 @@ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + #include "bge_impl.h" /* @@ -207,6 +211,7 @@ bge_phy_reset(bge_t *bgep) { uint16_t control; uint_t count; + boolean_t ret = B_FALSE; BGE_TRACE(("bge_phy_reset($%p)", (void *)bgep)); @@ -221,22 +226,26 @@ bge_phy_reset(bge_t *bgep) } /* - * Set the PHY RESET bit, then wait up to 5 ms for it to self-clear + * Set the PHY RESET bit, then wait up to 50 ms for it to self-clear */ bge_mii_put16(bgep, MII_CONTROL, MII_CONTROL_RESET); - for (count = 0; ++count < 1000; ) { - drv_usecwait(5); + for (count = 0; ++count < 5000; ) { control = bge_mii_get16(bgep, MII_CONTROL); - if (BIC(control, MII_CONTROL_RESET)) - return (B_TRUE); + if (BIC(control, MII_CONTROL_RESET)) { + drv_usecwait(40); + ret = B_TRUE; + break; + } + drv_usecwait(10); } - if (DEVICE_5906_SERIES_CHIPSETS(bgep)) + if (ret == B_TRUE && DEVICE_5906_SERIES_CHIPSETS(bgep)) (void) bge_adj_volt_5906(bgep); - BGE_DEBUG(("bge_phy_reset: FAILED, control now 0x%x", control)); + if (ret == B_FALSE) + BGE_DEBUG(("bge_phy_reset: FAILED, control now 0x%x", control)); - return (B_FALSE); + return (ret); } /* @@ -541,34 +550,14 @@ bge_restart_copper(bge_t *bgep, boolean_t powerdown) ASSERT(mutex_owned(bgep->genlock)); - switch (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev)) { - default: - /* - * Shouldn't happen; it means we don't recognise this chip. - * It's probably a new one, so we'll try our best anyway ... 
- */ - case MHCR_CHIP_ASIC_REV_5703: - case MHCR_CHIP_ASIC_REV_5704: - case MHCR_CHIP_ASIC_REV_5705: - case MHCR_CHIP_ASIC_REV_5752: - case MHCR_CHIP_ASIC_REV_5714: - case MHCR_CHIP_ASIC_REV_5715: - reset_ok = bge_phy_reset_and_check(bgep); - break; - - case MHCR_CHIP_ASIC_REV_5906: - case MHCR_CHIP_ASIC_REV_5700: - case MHCR_CHIP_ASIC_REV_5701: - case MHCR_CHIP_ASIC_REV_5723: - case MHCR_CHIP_ASIC_REV_5721_5751: - /* - * Just a plain reset; the "check" code breaks these chips - */ + if (bgep->chipid.flags & CHIP_FLAG_NO_CHECK_RESET) { reset_ok = bge_phy_reset(bgep); if (!reset_ok) bge_fm_ereport(bgep, DDI_FM_DEVICE_NO_RESPONSE); - break; + } else { + reset_ok = bge_phy_reset_and_check(bgep); } + if (!reset_ok) { BGE_REPORT((bgep, "PHY failed to reset correctly")); return (DDI_FAILURE); @@ -590,7 +579,7 @@ bge_restart_copper(bge_t *bgep, boolean_t powerdown) switch (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev)) { case MHCR_CHIP_ASIC_REV_5705: - case MHCR_CHIP_ASIC_REV_5721_5751: + case MHCR_CHIP_ASIC_REV_5750: bge_phy_bit_err_fix(bgep); break; } @@ -1507,14 +1496,22 @@ bge_phys_init(bge_t *bgep) */ bgep->phy_mii_addr = 1; if (DEVICE_5717_SERIES_CHIPSETS(bgep)) { - int regval = bge_reg_get32(bgep, CPMU_STATUS_REG); - if (regval & CPMU_STATUS_FUN_NUM) - bgep->phy_mii_addr += 1; + uint32_t regval = bge_reg_get32(bgep, CPMU_STATUS_REG); + if (MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5719 || + MHCR_CHIP_ASIC_REV(bgep->chipid.asic_rev) == + MHCR_CHIP_ASIC_REV_5720) { + bgep->phy_mii_addr += + (regval & CPMU_STATUS_FUN_NUM_5719) >> + CPMU_STATUS_FUN_NUM_5719_SHIFT; + } else { + bgep->phy_mii_addr += + (regval & CPMU_STATUS_FUN_NUM_5717) ? 1 : 0; + } regval = bge_reg_get32(bgep, SGMII_STATUS_REG); if (regval & MEDIA_SELECTION_MODE) bgep->phy_mii_addr += 7; } - if (bge_phy_probe(bgep)) { bgep->chipid.flags &= ~CHIP_FLAG_SERDES; bgep->physops = &copper_ops; diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 8af4d1d6a5..20e3a5737e 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -85,6 +85,7 @@ struct bd { kstat_io_t *d_kiop; boolean_t d_rdonly; + boolean_t d_ssd; boolean_t d_removable; boolean_t d_hotpluggable; boolean_t d_use_dma; @@ -1103,6 +1104,14 @@ bd_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, int *rvalp) } return (0); } + case DKIOCSOLIDSTATE: { + int i; + i = bd->d_ssd ? 1 : 0; + if (ddi_copyout(&i, ptr, sizeof (i), flag)) { + return (EFAULT); + } + return (0); + } case DKIOCSTATE: { enum dkio_state state; if (ddi_copyin(ptr, &state, sizeof (state), flag)) { @@ -1246,6 +1255,7 @@ bd_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie) bd_update_state(bd); ((tg_attribute_t *)arg)->media_is_writable = bd->d_rdonly ? B_FALSE : B_TRUE; + ((tg_attribute_t *)arg)->media_is_solid_state = bd->d_ssd; return (0); default: @@ -1361,6 +1371,7 @@ bd_update_state(bd_t *bd) bd->d_blkshift = ddi_ffs(media.m_blksize) - 1; bd->d_numblks = media.m_nblks; bd->d_rdonly = media.m_readonly; + bd->d_ssd = media.m_solidstate; state = DKIO_INSERTED; } diff --git a/usr/src/uts/common/io/cmlb.c b/usr/src/uts/common/io/cmlb.c index 0d174501f5..d7d6cb5ab5 100644 --- a/usr/src/uts/common/io/cmlb.c +++ b/usr/src/uts/common/io/cmlb.c @@ -20,6 +20,7 @@ */ /* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
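/*
 * A minimal illustrative sketch (not part of the patch): a userland
 * consumer can ask whether a blkdev-backed disk is solid state with the
 * new DKIOCSOLIDSTATE ioctl added above; the driver answers 1 for SSD
 * media and 0 otherwise.  This assumes the DKIOCSOLIDSTATE definition
 * lands in <sys/dkio.h> alongside the other DKIOC ioctls; the device path
 * is only an example and error handling is minimal.
 */
#include <sys/types.h>
#include <sys/dkio.h>
#include <stropts.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd, is_ssd = 0;

	if ((fd = open("/dev/rdsk/c0t0d0s0", O_RDONLY)) == -1)
		return (1);
	if (ioctl(fd, DKIOCSOLIDSTATE, &is_ssd) == 0)
		(void) printf("solid-state: %d\n", is_ssd);
	(void) close(fd);
	return (0);
}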
*/ @@ -243,6 +244,7 @@ static i_ddi_prop_dyn_t cmlb_prop_dyn[] = { {"Size", DDI_PROP_TYPE_INT64, S_IFCHR}, {"device-nblocks", DDI_PROP_TYPE_INT64}, {"device-blksize", DDI_PROP_TYPE_INT}, + {"device-solid-state", DDI_PROP_TYPE_INT}, {NULL} }; @@ -5657,11 +5659,12 @@ cmlb_prop_op(cmlb_handle_t cmlbhandle, struct cmlb_lun *cl; diskaddr_t capacity; uint32_t lbasize; - enum dp { DP_NBLOCKS, DP_BLKSIZE } dp; + enum dp { DP_NBLOCKS, DP_BLKSIZE, DP_SSD } dp; int callers_length; caddr_t buffer; uint64_t nblocks64; uint_t dblk; + tg_attribute_t tgattr; /* Always fallback to ddi_prop_op... */ cl = (struct cmlb_lun *)cmlbhandle; @@ -5685,6 +5688,8 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, dp = DP_NBLOCKS; else if (strcmp(name, "device-blksize") == 0) dp = DP_BLKSIZE; + else if (strcmp(name, "device-solid-state") == 0) + dp = DP_SSD; else goto fallback; @@ -5692,7 +5697,7 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, callers_length = *lengthp; if (dp == DP_NBLOCKS) *lengthp = sizeof (uint64_t); - else if (dp == DP_BLKSIZE) + else if ((dp == DP_BLKSIZE) || (dp == DP_SSD)) *lengthp = sizeof (uint32_t); /* service request for the length of the property */ @@ -5720,11 +5725,19 @@ fallback: return (ddi_prop_op(dev, dip, prop_op, mod_flags, } /* transfer the value into the buffer */ - if (dp == DP_NBLOCKS) + switch (dp) { + case DP_NBLOCKS: *((uint64_t *)buffer) = capacity; - else if (dp == DP_BLKSIZE) + break; + case DP_BLKSIZE: *((uint32_t *)buffer) = lbasize; - + break; + case DP_SSD: + if (DK_TG_GETATTRIBUTE(cl, &tgattr, tg_cookie) != 0) + tgattr.media_is_solid_state = B_FALSE; + *((uint32_t *)buffer) = + tgattr.media_is_solid_state ? 1 : 0; + } return (DDI_PROP_SUCCESS); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 40cbe86170..2152ce0baa 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ /* @@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index f90adbf27a..d35c1e4bbf 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2011 Joyent, Inc. All rights reserved. + */ + +/* * Data-Link Services Module */ @@ -610,6 +614,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. 
If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 049c4bd757..5fa37e0a8a 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* @@ -105,12 +106,13 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); /*ARGSUSED*/ @@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. + */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -526,6 +533,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -740,12 +748,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, - dls_devnet_stat_update, ddp, &ksp) == 0) { + if (dls_stat_create("link", 0, nm, zoneid, + dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -765,12 +784,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -781,15 +800,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. 
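/*
 * A minimal illustrative sketch (not part of the patch): when a link's
 * kstats are created in the global zone on behalf of a non-global zone,
 * dls_devnet_stat_create() now prefixes the kstat name with "z<zoneid>_"
 * so both views can coexist -- e.g. a link "net0" on loan to zone 7 shows
 * up in the global zone as "z7_net0".  The helper name and parameters are
 * hypothetical.
 */
static void
dls_kstat_name_sketch(char *buf, size_t len, const char *linkname,
    zoneid_t viewer_zid, zoneid_t owner_zid)
{
	if (viewer_zid != owner_zid)	/* global-zone view of a loaned link */
		(void) snprintf(buf, len, "z%d_%s", (int)owner_zid, linkname);
	else
		(void) strlcpy(buf, linkname, len);
}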
*/ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* @@ -878,7 +907,8 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) (void) dls_devnet_unset(macname, &linkid, B_TRUE); /* * The kstat subsystem holds its own locks (rather perimeter) @@ -887,7 +917,7 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. */ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; } @@ -924,17 +954,64 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) ASSERT(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) it's possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. + * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphaned VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, so reset ref to 1. + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } /* * Remove this dls_devnet_t from the hash table.
@@ -1261,9 +1338,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zonename parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1313,13 +1396,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * is currently accessing the link kstats, or if the link is on-loan * to a non-global zone. Then set the DD_KSTAT_CHANGING flag to * prevent any access to the kstats while we delete and recreate - * kstats below. + * kstats below. However, we skip this check if we're renaming the + * vnic as part of bringing it up for a zone. */ mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } ddp->dd_flags |= DD_KSTAT_CHANGING; @@ -1333,7 +1419,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1406,7 +1500,7 @@ done: */ rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (clear_dd_flag) { mutex_enter(&ddp->dd_mutex); @@ -1421,7 +1515,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1454,6 +1549,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1468,7 +1564,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1490,7 +1586,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1507,7 +1603,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c 
+++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
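/*
 * A minimal illustrative sketch (not part of the patch): the table-driven
 * named kstats are replaced with a fixed dls_kstat_t structure backing a
 * KSTAT_FLAG_VIRTUAL kstat, which lets the module own ks_data and attach a
 * string statistic ("zonename").  The names below (example_kstat_t and the
 * helper) are hypothetical; the kstat calls are the standard kernel API.
 */
typedef struct {
	kstat_named_t	ex_ipackets;
	kstat_named_t	ex_zonename;
} example_kstat_t;

static kstat_t *
example_kstat_create_sketch(zoneid_t zoneid, const char *zonename)
{
	kstat_t *ksp;
	example_kstat_t *ekp;

	ksp = kstat_create_zone("example", 0, "link0", "net",
	    KSTAT_TYPE_NAMED,
	    sizeof (example_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL, zoneid);
	if (ksp == NULL)
		return (NULL);

	/* the module owns the data area for a virtual kstat */
	ekp = kmem_zalloc(sizeof (example_kstat_t), KM_SLEEP);
	ksp->ks_data = ekp;
	/* account for the string payload so snapshots size correctly */
	ksp->ks_data_size += strlen(zonename) + 1;

	kstat_named_init(&ekp->ex_ipackets, "ipackets", KSTAT_DATA_UINT64);
	kstat_named_init(&ekp->ex_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&ekp->ex_zonename, zonename);

	kstat_install(ksp);
	return (ksp);
}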
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..5b1dc82938 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5506 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+ * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)instance; + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + 
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct 
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, 
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+ */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. 
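+ * In this port the waiting thread blocks on the int_cmd_cv condition
+ * variable, so the wake-up is done with cv_broadcast() rather than a
+ * Linux-style wait queue.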
+ */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
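+	/*
+	 * Error unwind: release the DMA handle allocated above, clear
+	 * CFLAG_DMAVALID and fail the allocation.
+	 */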
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? 
pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through For all non-rd/wr cmds */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
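+		/*
+		 * Constrain the bounce buffer to a single segment below
+		 * 4GB so that its cookie fits in the 32-bit SGE that is
+		 * programmed into the frame further down.
+		 */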
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
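+		/*
+		 * 32-bit model: SGE 0 describes the response buffer and
+		 * SGE 1 the request buffer, both as 32-bit SGEs.
+		 */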
+ con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, 
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new
+ * AEN request we currently have. If it is, then there is nothing
+ * to do; whichever events the current AEN request subscribes to
+ * have already been subscribed to.
+ *
+ * If the old command is _not_ inclusive, we have to abort it,
+ * form a class_locale that is a superset of both the old and the
+ * current one, and re-issue the registration to the FW.
+ */
+
+    curr_aen.word = class_locale_word;
+    aen_cmd = instance->aen_cmd;
+    if (aen_cmd) {
+        prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle,
+            &aen_cmd->frame->dcmd.mbox.w[1]);
+
+        /*
+         * A class whose enum value is smaller is inclusive of all
+         * higher values. If a PROGRESS (= -1) was previously
+         * registered, then new registration requests for higher
+         * classes need not be sent to the FW; they are automatically
+         * included.
+         *
+         * Locale numbers don't have such a hierarchy; they are
+         * bitmap values.
+         */
+        if ((prev_aen.members.class <= curr_aen.members.class) &&
+            !((prev_aen.members.locale & curr_aen.members.locale) ^
+            curr_aen.members.locale)) {
+            /*
+             * Previously issued event registration includes
+             * current request. Nothing to do.
+             */
+
+            return (0);
+        } else {
+            curr_aen.members.locale |= prev_aen.members.locale;
+
+            if (prev_aen.members.class < curr_aen.members.class)
+                curr_aen.members.class = prev_aen.members.class;
+
+            ret_val = abort_aen_cmd(instance, aen_cmd);
+
+            if (ret_val) {
+                con_log(CL_ANN, (CE_WARN, "register_mfi_aen: "
+                    "failed to abort previous AEN command"));
+
+                return (ret_val);
+            }
+        }
+    } else {
+        curr_aen.word = class_locale_word;
+    }
+
+    cmd = get_mfi_pkt(instance);
+
+    if (!cmd)
+        return (ENOMEM);
+    /* Clear the frame buffer and assign back the context id */
+    (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+        cmd->index);
+
+    dcmd = &cmd->frame->dcmd;
+
+    /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */
+    (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+    (void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+        sizeof (struct drsas_evt_detail));
+
+    /* Prepare DCMD for aen registration */
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0);
+    ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+    ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags,
+        MFI_FRAME_DIR_READ);
+    ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len,
+        sizeof (struct drsas_evt_detail));
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode,
+        DR_DCMD_CTRL_EVENT_WAIT);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1],
+        curr_aen.word);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr,
+        instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address);
+    ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length,
+        sizeof (struct drsas_evt_detail));
+
+    instance->aen_seq_num = seq_num;
+
+
+    /*
+     * Store reference to the cmd used to register for AEN.
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
+ *
+ *
+ * console messages debug levels
+ */
+#define CL_NONE 0 /* No debug information */
+#define CL_ANN 1 /* print unconditionally, announcements */
+#define CL_ANN1 2 /* No o/p */
+#define CL_DLEVEL1 3 /* debug level 1, informative */
+#define CL_DLEVEL2 4 /* debug level 2, verbose */
+#define CL_DLEVEL3 5 /* debug level 3, very verbose */
+
+#ifdef __SUNPRO_C
+#define __func__ ""
+#endif
+
+#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; }
+
+/*
+ * ### SCSA definitions ###
+ */
+#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target)
+#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun)
+#define PKT2TRAN(pkt) ((pkt)->pkt_address.a_hba_tran)
+#define ADDR2TRAN(ap) ((ap)->a_hba_tran)
+
+#define TRAN2MR(tran) ((struct drsas_instance *)(tran)->tran_hba_private)
+#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap)))
+
+#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private)
+#define CMD2PKT(sp) ((sp)->cmd_pkt)
+#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request))
+
+#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address)
+#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran)
+#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd)))
+
+#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */
+#define CFLAG_DMASEND 0x0002 /* Transfer from the device */
+#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */
+
+/*
+ * ### Data structures for ioctl interface and internal commands ###
+ */
+
+/*
+ * Data direction flags
+ */
+#define UIOC_RD 0x00001
+#define UIOC_WR 0x00002
+
+#define SCP2HOST(scp) (scp)->device->host /* to host */
+#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */
+#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */
+#define SCP2TARGET(scp) (scp)->device->id /* to target */
+#define SCP2LUN(scp) (scp)->device->lun /* to LUN */
+
+#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0])
+#define SCP2ADAPTER(scp) \
+ (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))
+
+#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \
+ (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0
+#define MRDRV_IS_LOGICAL(ap) \
+ ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ?
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All stings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * 
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t 
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
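+ * For example, mlist_add() below calls __list_add(new, head, head->next) + * to link a new entry in immediately after the list head.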
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. 
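+ * + * Because @n caches pos->next before the loop body runs, the body may + * safely remove @pos, e.g.: + * mlist_for_each_safe(pos, n, &head) { + * mlist_del_init(pos); + * } + * Plain mlist_for_each() offers no such protection.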
+ */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c index da00160b68..2efb178ff1 100644 --- a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c +++ b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c @@ -532,8 +532,7 @@ oce_drain_rq_cq(void *arg) if (dev->function_mode & FLEX10_MODE) { if (cqe->u0.s.vlan_tag_present && cqe->u0.s.qnq) { - oce_rx_insert_tag(mp, - cqe->u0.s.vlan_tag); + oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag); } } else if (cqe->u0.s.vlan_tag_present) { oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 4e1979cf54..61a5353365 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -2989,6 +2990,9 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_WL_MLME: minsize = sizeof (wl_mlme_t); break; + case MAC_PROP_VN_PROMISC_FILTERED: + minsize = sizeof (boolean_t); + break; } return (valsize >= minsize); diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index dc1132941b..dc1e40b424 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -3170,7 +3171,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, } if ((mcip->mci_state_flags & MCIS_IS_VNIC) && - type == MAC_CLIENT_PROMISC_ALL) { + type == MAC_CLIENT_PROMISC_ALL && + (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED)) { /* * The function is being invoked by the upper MAC client * of a VNIC. The VNIC should only see the traffic @@ -4032,16 +4034,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. 
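+ * (For bridged links MAC_CAPAB_NO_ZCOPY is asserted below before the + * driver's getcapab entry point is consulted.)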
*/ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -5411,3 +5412,23 @@ mac_client_set_rings(mac_client_handle_t mch, int rxrings, int txrings) mrp->mrp_ntxrings = txrings; } } + +boolean_t +mac_get_promisc_filtered(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + return (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED); +} + +void +mac_set_promisc_filtered(mac_client_handle_t mch, boolean_t enable) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + if (enable) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; + else + mcip->mci_protect_flags &= ~MPT_FLAG_PROMISC_FILTERED; +} diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index 0dc825492e..4d5201a994 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/strsun.h> @@ -2267,6 +2268,9 @@ mac_protect_init(mac_client_impl_t *mcip) sizeof (dhcpv6_cid_t), offsetof(dhcpv6_cid_t, dc_node)); avl_create(&mcip->mci_v6_dyn_ip, compare_dhcpv6_ip, sizeof (dhcpv6_addr_t), offsetof(dhcpv6_addr_t, da_node)); + + if (mcip->mci_state_flags & MCIS_IS_VNIC) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; } void diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index 38967e5d15..06a5ac8cbf 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -530,12 +530,13 @@ enum pkt_type { /* * In general we do port based hashing to spread traffic over different - * softrings. The below tunable allows to override that behavior. Setting it - * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior - * is also the applicable to ipv6 packets carrying multiple optional headers + * softrings. The tunables below allow overriding that behavior. Setting one + * to B_TRUE forces a fanout based on the src ipv6/ipv4 address. This behavior + * is also applicable to ipv6 packets carrying multiple optional headers * and other uncommon packet types. */ boolean_t mac_src_ipv6_fanout = B_FALSE; +boolean_t mac_src_ipv4_fanout = B_FALSE; /* * Pair of local and remote ports in the transport header @@ -765,13 +766,14 @@ int fanout_unalligned = 0; /* * mac_rx_srs_long_fanout * - * The fanout routine for IPv6 + * The fanout routine for IPv6 (and IPv4 when VLANs are in use). 
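+ * + * For TCP the soft ring index is derived from a hash of the source address + * and the TCP port pair; all other traffic, and traffic whose headers cannot + * be examined, is fanned out on the source address alone.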
*/ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; + struct ip *ip4h; uint8_t *whereptr; uint_t hash; uint16_t remlen; @@ -839,7 +841,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, */ if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr, NULL)) { - goto src_based_fanout; + goto ipv6_src_based_fanout; } whereptr = (uint8_t *)ip6h + hdr_len; @@ -856,7 +858,7 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, */ if (mp->b_cont != NULL && whereptr + PORTS_SIZE > mp->b_wptr) { - goto src_based_fanout; + goto ipv6_src_based_fanout; } break; default: @@ -890,7 +892,85 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, /* For all other protocol, do source based fanout */ default: - goto src_based_fanout; + goto ipv6_src_based_fanout; + } + } else if (sap == ETHERTYPE_IP) { + boolean_t modifiable = B_TRUE; + + ASSERT(MBLKL(mp) >= hdrsize); + + ip4h = (struct ip *)(mp->b_rptr + hdrsize); + + if ((unsigned char *)ip4h == mp->b_wptr) { + /* + * The first mblk_t only includes the mac header. + * Note that it is safe to change the mp pointer here, + * as the subsequent operation does not assume mp + * points to the start of the mac header. + */ + mp = mp->b_cont; + + /* + * Make sure ip4h holds the full base ip structure + * up through the destination address. It might not + * hold any of the options though. + */ + if (mp == NULL) + return (-1); + + if (MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { + modifiable = (DB_REF(mp) == 1); + + if (modifiable && + !pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) + return (-1); + } + + ip4h = (struct ip *)mp->b_rptr; + } + + if (!modifiable || !(OK_32PTR((char *)ip4h))) { + /* + * If ip4h is not aligned, fan out to the default + * ring. Note that this may cause packet reordering. + */ + *indx = 0; + *type = OTH; + fanout_unalligned++; + return (0); + } + + /* Do src based fanout if the mac_src_ipv4_fanout tunable is set. */ + if (mac_src_ipv4_fanout) + goto ipv4_src_based_fanout; + + /* If the transport is TCP, we try to do port based fanout */ + if (ip4h->ip_p == IPPROTO_TCP) { + int hdr_len; + + hdr_len = ip4h->ip_hl << 2; + /* set whereptr to point to tcphdr */ + whereptr = (uint8_t *)ip4h + hdr_len; + + /* + * If ip4h does not hold the complete ip header + * including options, or if both ports in the TCP + * header are not part of the mblk, do source-based + * fanout (the second case covers the first one, so + * we only need one test). 
+ */ + if (mp->b_cont != NULL && + whereptr + PORTS_SIZE > mp->b_wptr) + goto ipv4_src_based_fanout; + + hash = HASH_ADDR(ip4h->ip_src.s_addr, + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_tcp_ring_count); + *type = OTH; + } else { + /* For all other protocols, do source based fanout */ + goto ipv4_src_based_fanout; } } else { *indx = 0; @@ -898,11 +978,17 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, } return (0); -src_based_fanout: +ipv6_src_based_fanout: hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); *type = OTH; return (0); + +ipv4_src_based_fanout: + hash = HASH_ADDR(ip4h->ip_src.s_addr, (uint32_t)0); + *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); + *type = OTH; + return (0); } /* diff --git a/usr/src/uts/common/io/mr_sas/fusion.h b/usr/src/uts/common/io/mr_sas/fusion.h new file mode 100644 index 0000000000..36fb3cb11a --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/fusion.h @@ -0,0 +1,561 @@ +/* + * fusion.h + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + */ + + +#ifndef _FUSION_H_ +#define _FUSION_H_ + +#define U64 uint64_t +#define U32 uint32_t +#define U16 uint16_t +#define U8 uint8_t +#define S8 char +#define S16 short +#define S32 int + +/* MPI2 defines */ +#define MPI2_REPLY_POST_HOST_INDEX_OFFSET (0x6C) +#define MPI2_FUNCTION_IOC_INIT (0x02) /* IOC Init */ +#define MPI2_WHOINIT_HOST_DRIVER (0x04) +#define MPI2_VERSION_MAJOR (0x02) +#define MPI2_VERSION_MINOR (0x00) +#define MPI2_VERSION_MAJOR_MASK (0xFF00) +#define MPI2_VERSION_MAJOR_SHIFT (8) +#define MPI2_VERSION_MINOR_MASK (0x00FF) +#define MPI2_VERSION_MINOR_SHIFT (0) +#define MPI2_VERSION ((MPI2_VERSION_MAJOR << MPI2_VERSION_MAJOR_SHIFT) | \ + MPI2_VERSION_MINOR) +#define MPI2_HEADER_VERSION_UNIT (0x10) +#define MPI2_HEADER_VERSION_DEV (0x00) +#define MPI2_HEADER_VERSION_UNIT_MASK (0xFF00) +#define MPI2_HEADER_VERSION_UNIT_SHIFT (8) +#define MPI2_HEADER_VERSION_DEV_MASK (0x00FF) +#define MPI2_HEADER_VERSION_DEV_SHIFT (0) +#define MPI2_HEADER_VERSION ((MPI2_HEADER_VERSION_UNIT \ + << 8) | \ + MPI2_HEADER_VERSION_DEV) +#define MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR (0x03) +#define MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG (0x8000) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG (0x0400) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP (0x0003) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_APPTAG (0x0200) +#define MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD (0x0100) +#define MPI2_SCSIIO_EEDPFLAGS_INSERT_OP (0x0004) +#define MPI2_FUNCTION_SCSI_IO_REQUEST (0x00) /* SCSI IO */ +#define MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY (0x06) +#define MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO (0x00) +#define MPI2_SGE_FLAGS_64_BIT_ADDRESSING (0x02) +#define MPI2_SCSIIO_CONTROL_WRITE (0x01000000) +#define MPI2_SCSIIO_CONTROL_READ (0x02000000) +#define MPI2_REQ_DESCRIPT_FLAGS_TYPE_MASK (0x0E) +#define MPI2_RPY_DESCRIPT_FLAGS_UNUSED (0x0F) +#define MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS (0x00) +#define MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK (0x0F) +#define MPI2_WRSEQ_FLUSH_KEY_VALUE (0x0) +#define MPI2_WRITE_SEQUENCE_OFFSET (0x00000004) +#define MPI2_WRSEQ_1ST_KEY_VALUE (0xF) +#define MPI2_WRSEQ_2ND_KEY_VALUE (0x4) +#define MPI2_WRSEQ_3RD_KEY_VALUE (0xB) +#define MPI2_WRSEQ_4TH_KEY_VALUE (0x2) +#define MPI2_WRSEQ_5TH_KEY_VALUE (0x7) +#define 
MPI2_WRSEQ_6TH_KEY_VALUE (0xD) + +/* Invader defines */ +#define MPI2_TYPE_CUDA 0x2 +#define MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH 0x4000 +#define MR_RL_FLAGS_GRANT_DESTINATION_CPU0 0x00 +#define MR_RL_FLAGS_GRANT_DESTINATION_CPU1 0x10 +#define MR_RL_FLAGS_GRANT_DESTINATION_CUDA 0x80 +#define MR_RL_FLAGS_SEQ_NUM_ENABLE 0x8 +#define MPI2_NSEG_FLAGS_SHIFT 4 + + +#define MR_PD_INVALID 0xFFFF +#define MAX_SPAN_DEPTH 8 +#define MAX_RAIDMAP_SPAN_DEPTH (MAX_SPAN_DEPTH) +#define MAX_ROW_SIZE 32 +#define MAX_RAIDMAP_ROW_SIZE (MAX_ROW_SIZE) +#define MAX_LOGICAL_DRIVES 64 +#define MAX_RAIDMAP_LOGICAL_DRIVES (MAX_LOGICAL_DRIVES) +#define MAX_RAIDMAP_VIEWS (MAX_LOGICAL_DRIVES) +#define MAX_ARRAYS 128 +#define MAX_RAIDMAP_ARRAYS (MAX_ARRAYS) +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_RAIDMAP_PHYSICAL_DEVICES (MAX_PHYSICAL_DEVICES) + +/* get the mapping information of LD */ +#define MR_DCMD_LD_MAP_GET_INFO 0x0300e101 + +#ifndef MPI2_POINTER +#define MPI2_POINTER * +#endif + +#pragma pack(1) + +typedef struct _MPI25_IEEE_SGE_CHAIN64 +{ + U64 Address; + U32 Length; + U16 Reserved1; + U8 NextChainOffset; + U8 Flags; +} MPI25_IEEE_SGE_CHAIN64, MPI2_POINTER PTR_MPI25_IEEE_SGE_CHAIN64, + Mpi25IeeeSgeChain64_t, MPI2_POINTER pMpi25IeeeSgeChain64_t; + +typedef struct _MPI2_SGE_SIMPLE_UNION +{ + U32 FlagsLength; + union + { + U32 Address32; + U64 Address64; + } u1; +} MPI2_SGE_SIMPLE_UNION, MPI2_POINTER PTR_MPI2_SGE_SIMPLE_UNION, + Mpi2SGESimpleUnion_t, MPI2_POINTER pMpi2SGESimpleUnion_t; + +typedef struct +{ + U8 CDB[20]; /* 0x00 */ + U32 PrimaryReferenceTag; /* 0x14 */ + U16 PrimaryApplicationTag; /* 0x18 */ + U16 PrimaryApplicationTagMask; /* 0x1A */ + U32 TransferLength; /* 0x1C */ +} MPI2_SCSI_IO_CDB_EEDP32, MPI2_POINTER PTR_MPI2_SCSI_IO_CDB_EEDP32, + Mpi2ScsiIoCdbEedp32_t, MPI2_POINTER pMpi2ScsiIoCdbEedp32_t; + +typedef struct _MPI2_SGE_CHAIN_UNION +{ + U16 Length; + U8 NextChainOffset; + U8 Flags; + union + { + U32 Address32; + U64 Address64; + } u1; +} MPI2_SGE_CHAIN_UNION, MPI2_POINTER PTR_MPI2_SGE_CHAIN_UNION, + Mpi2SGEChainUnion_t, MPI2_POINTER pMpi2SGEChainUnion_t; + +typedef struct _MPI2_IEEE_SGE_SIMPLE32 +{ + U32 Address; + U32 FlagsLength; +} MPI2_IEEE_SGE_SIMPLE32, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE32, + Mpi2IeeeSgeSimple32_t, MPI2_POINTER pMpi2IeeeSgeSimple32_t; + +typedef struct _MPI2_IEEE_SGE_SIMPLE64 +{ + U64 Address; + U32 Length; + U16 Reserved1; + U8 Reserved2; + U8 Flags; +} MPI2_IEEE_SGE_SIMPLE64, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE64, + Mpi2IeeeSgeSimple64_t, MPI2_POINTER pMpi2IeeeSgeSimple64_t; + +typedef union _MPI2_IEEE_SGE_SIMPLE_UNION +{ + MPI2_IEEE_SGE_SIMPLE32 Simple32; + MPI2_IEEE_SGE_SIMPLE64 Simple64; +} MPI2_IEEE_SGE_SIMPLE_UNION, MPI2_POINTER PTR_MPI2_IEEE_SGE_SIMPLE_UNION, + Mpi2IeeeSgeSimpleUnion_t, MPI2_POINTER pMpi2IeeeSgeSimpleUnion_t; + +typedef MPI2_IEEE_SGE_SIMPLE32 MPI2_IEEE_SGE_CHAIN32; +typedef MPI2_IEEE_SGE_SIMPLE64 MPI2_IEEE_SGE_CHAIN64; + +typedef union _MPI2_IEEE_SGE_CHAIN_UNION +{ + MPI2_IEEE_SGE_CHAIN32 Chain32; + MPI2_IEEE_SGE_CHAIN64 Chain64; +} MPI2_IEEE_SGE_CHAIN_UNION, MPI2_POINTER PTR_MPI2_IEEE_SGE_CHAIN_UNION, + Mpi2IeeeSgeChainUnion_t, MPI2_POINTER pMpi2IeeeSgeChainUnion_t; + +typedef union _MPI2_SGE_IO_UNION +{ + MPI2_SGE_SIMPLE_UNION MpiSimple; + MPI2_SGE_CHAIN_UNION MpiChain; + MPI2_IEEE_SGE_SIMPLE_UNION IeeeSimple; + MPI2_IEEE_SGE_CHAIN_UNION IeeeChain; +} MPI2_SGE_IO_UNION, MPI2_POINTER PTR_MPI2_SGE_IO_UNION, + Mpi2SGEIOUnion_t, MPI2_POINTER pMpi2SGEIOUnion_t; + +typedef union +{ + U8 CDB32[32]; + MPI2_SCSI_IO_CDB_EEDP32 EEDP32; + 
MPI2_SGE_SIMPLE_UNION SGE; +} MPI2_SCSI_IO_CDB_UNION, MPI2_POINTER PTR_MPI2_SCSI_IO_CDB_UNION, + Mpi2ScsiIoCdb_t, MPI2_POINTER pMpi2ScsiIoCdb_t; + +/* Default Request Descriptor */ +typedef struct _MPI2_DEFAULT_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 DescriptorTypeDependent; /* 0x06 */ +} MPI2_DEFAULT_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_DEFAULT_REQUEST_DESCRIPTOR, + Mpi2DefaultRequestDescriptor_t, + MPI2_POINTER pMpi2DefaultRequestDescriptor_t; + +/* High Priority Request Descriptor */ +typedef struct _MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 Reserved1; /* 0x06 */ +} MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR, + Mpi2HighPriorityRequestDescriptor_t, + MPI2_POINTER pMpi2HighPriorityRequestDescriptor_t; + +/* SCSI IO Request Descriptor */ +typedef struct _MPI2_SCSI_IO_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 DevHandle; /* 0x06 */ +} MPI2_SCSI_IO_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_IO_REQUEST_DESCRIPTOR, + Mpi2SCSIIORequestDescriptor_t, + MPI2_POINTER pMpi2SCSIIORequestDescriptor_t; + +/* SCSI Target Request Descriptor */ +typedef struct _MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 IoIndex; /* 0x06 */ +} MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR, + Mpi2SCSITargetRequestDescriptor_t, + MPI2_POINTER pMpi2SCSITargetRequestDescriptor_t; + +/* RAID Accelerator Request Descriptor */ +typedef struct _MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR +{ + U8 RequestFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 LMID; /* 0x04 */ + U16 Reserved; /* 0x06 */ +} MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR, + Mpi2RAIDAcceleratorRequestDescriptor_t, + MPI2_POINTER pMpi2RAIDAcceleratorRequestDescriptor_t; + +/* Default Reply Descriptor */ +typedef struct _MPI2_DEFAULT_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 DescriptorTypeDependent1; /* 0x02 */ + U32 DescriptorTypeDependent2; /* 0x04 */ +} MPI2_DEFAULT_REPLY_DESCRIPTOR, MPI2_POINTER PTR_MPI2_DEFAULT_REPLY_DESCRIPTOR, + Mpi2DefaultReplyDescriptor_t, MPI2_POINTER pMpi2DefaultReplyDescriptor_t; + +/* Address Reply Descriptor */ +typedef struct _MPI2_ADDRESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U32 ReplyFrameAddress; /* 0x04 */ +} MPI2_ADDRESS_REPLY_DESCRIPTOR, MPI2_POINTER PTR_MPI2_ADDRESS_REPLY_DESCRIPTOR, + Mpi2AddressReplyDescriptor_t, MPI2_POINTER pMpi2AddressReplyDescriptor_t; + +/* SCSI IO Success Reply Descriptor */ +typedef struct _MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U16 TaskTag; /* 0x04 */ + U16 Reserved1; /* 0x06 */ +} MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR, + Mpi2SCSIIOSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2SCSIIOSuccessReplyDescriptor_t; + +/* TargetAssist Success Reply Descriptor */ +typedef struct _MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U8 
SequenceNumber; /* 0x04 */ + U8 Reserved1; /* 0x05 */ + U16 IoIndex; /* 0x06 */ +} MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR, + Mpi2TargetAssistSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2TargetAssistSuccessReplyDescriptor_t; + +/* Target Command Buffer Reply Descriptor */ +typedef struct _MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U8 VP_ID; /* 0x02 */ + U8 Flags; /* 0x03 */ + U16 InitiatorDevHandle; /* 0x04 */ + U16 IoIndex; /* 0x06 */ +} MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR, + Mpi2TargetCommandBufferReplyDescriptor_t, + MPI2_POINTER pMpi2TargetCommandBufferReplyDescriptor_t; + +/* RAID Accelerator Success Reply Descriptor */ +typedef struct _MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR +{ + U8 ReplyFlags; /* 0x00 */ + U8 MSIxIndex; /* 0x01 */ + U16 SMID; /* 0x02 */ + U32 Reserved; /* 0x04 */ +} MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR, + MPI2_POINTER PTR_MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR, + Mpi2RAIDAcceleratorSuccessReplyDescriptor_t, + MPI2_POINTER pMpi2RAIDAcceleratorSuccessReplyDescriptor_t; + +/* union of Reply Descriptors */ +typedef union _MPI2_REPLY_DESCRIPTORS_UNION +{ + MPI2_DEFAULT_REPLY_DESCRIPTOR Default; + MPI2_ADDRESS_REPLY_DESCRIPTOR AddressReply; + MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR SCSIIOSuccess; + MPI2_TARGETASSIST_SUCCESS_REPLY_DESCRIPTOR TargetAssistSuccess; + MPI2_TARGET_COMMAND_BUFFER_REPLY_DESCRIPTOR TargetCommandBuffer; + MPI2_RAID_ACCELERATOR_SUCCESS_REPLY_DESCRIPTOR RAIDAcceleratorSuccess; + U64 Words; +} MPI2_REPLY_DESCRIPTORS_UNION, MPI2_POINTER PTR_MPI2_REPLY_DESCRIPTORS_UNION, + Mpi2ReplyDescriptorsUnion_t, MPI2_POINTER pMpi2ReplyDescriptorsUnion_t; + +/* IOCInit Request message */ +typedef struct _MPI2_IOC_INIT_REQUEST +{ + U8 WhoInit; /* 0x00 */ + U8 Reserved1; /* 0x01 */ + U8 ChainOffset; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 Reserved2; /* 0x04 */ + U8 Reserved3; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved4; /* 0x0A */ + U16 MsgVersion; /* 0x0C */ + U16 HeaderVersion; /* 0x0E */ + U32 Reserved5; /* 0x10 */ + U16 Reserved6; /* 0x14 */ + U8 Reserved7; /* 0x16 */ + U8 HostMSIxVectors; /* 0x17 */ + U16 Reserved8; /* 0x18 */ + U16 SystemRequestFrameSize; /* 0x1A */ + U16 ReplyDescriptorPostQueueDepth; /* 0x1C */ + U16 ReplyFreeQueueDepth; /* 0x1E */ + U32 SenseBufferAddressHigh; /* 0x20 */ + U32 SystemReplyAddressHigh; /* 0x24 */ + U64 SystemRequestFrameBaseAddress; /* 0x28 */ + U64 ReplyDescriptorPostQueueAddress; /* 0x30 */ + U64 ReplyFreeQueueAddress; /* 0x38 */ + U64 TimeStamp; /* 0x40 */ +} MPI2_IOC_INIT_REQUEST, MPI2_POINTER PTR_MPI2_IOC_INIT_REQUEST, + Mpi2IOCInitRequest_t, MPI2_POINTER pMpi2IOCInitRequest_t; + + +typedef struct _MR_DEV_HANDLE_INFO { + + /* Send bitmap of LDs that are idle with respect to FP */ + U16 curDevHdl; + + /* bitmap of valid device handles. */ + U8 validHandles; + U8 reserved; + /* 0x04 dev handles for all the paths. 
*/ + U16 devHandle[2]; +} MR_DEV_HANDLE_INFO; /* 0x08, Total Size */ + +typedef struct _MR_ARRAY_INFO { + U16 pd[MAX_RAIDMAP_ROW_SIZE]; +} MR_ARRAY_INFO; /* 0x40, Total Size */ + +typedef struct _MR_QUAD_ELEMENT { + U64 logStart; /* 0x00 */ + U64 logEnd; /* 0x08 */ + U64 offsetInSpan; /* 0x10 */ + U32 diff; /* 0x18 */ + U32 reserved1; /* 0x1C */ +} MR_QUAD_ELEMENT; /* 0x20, Total size */ + +typedef struct _MR_SPAN_INFO { + U32 noElements; /* 0x00 */ + U32 reserved1; /* 0x04 */ + MR_QUAD_ELEMENT quads[MAX_RAIDMAP_SPAN_DEPTH]; /* 0x08 */ +} MR_SPAN_INFO; /* 0x108, Total size */ + +typedef struct _MR_LD_SPAN_ { /* SPAN structure */ + /* 0x00, starting block number in array */ + U64 startBlk; + + /* 0x08, number of blocks */ + U64 numBlks; + + /* 0x10, array reference */ + U16 arrayRef; + + U8 reserved[6]; /* 0x12 */ +} MR_LD_SPAN; /* 0x18, Total Size */ + +typedef struct _MR_SPAN_BLOCK_INFO { + /* number of rows/span */ + U64 num_rows; + + MR_LD_SPAN span; /* 0x08 */ + MR_SPAN_INFO block_span_info; /* 0x20 */ +} MR_SPAN_BLOCK_INFO; /* 0x128, Total Size */ + +typedef struct _MR_LD_RAID { + struct { + U32 fpCapable :1; + U32 reserved5 :3; + U32 ldPiMode :4; + U32 pdPiMode :4; + + /* FDE or controller encryption (MR_LD_ENCRYPTION_TYPE) */ + U32 encryptionType :8; + + U32 fpWriteCapable :1; + U32 fpReadCapable :1; + U32 fpWriteAcrossStripe:1; + U32 fpReadAcrossStripe:1; + U32 reserved4 :8; + } capability; /* 0x00 */ + U32 reserved6; + U64 size; /* 0x08, LD size in blocks */ + U8 spanDepth; /* 0x10, Total Number of Spans */ + U8 level; /* 0x11, RAID level */ + /* 0x12, shift-count to get stripe size (0=512, 1=1K, 7=64K, etc.) */ + U8 stripeShift; + U8 rowSize; /* 0x13, number of disks in a row */ + /* 0x14, number of data disks in a row */ + U8 rowDataSize; + U8 writeMode; /* 0x15, WRITE_THROUGH or WRITE_BACK */ + + /* 0x16, To differentiate between RAID1 and RAID1E */ + U8 PRL; + + U8 SRL; /* 0x17 */ + U16 targetId; /* 0x18, ld Target Id. */ + + /* 0x1a, state of ld, state corresponds to MR_LD_STATE */ + U8 ldState; + + /* 0x1b, Pre calculate region type requests based on MFC etc.. */ + U8 regTypeReqOnWrite; + + U8 modFactor; /* 0x1c, same as rowSize */ + /* + * 0x1d, region lock type used for read, valid only if + * regTypeOnReadIsValid=1 + */ + U8 regTypeReqOnRead; + U16 seqNum; /* 0x1e, LD sequence number */ + + struct { + /* This LD requires sync command before completing */ + U32 ldSyncRequired:1; + U32 reserved:31; + } flags; /* 0x20 */ + + U8 reserved3[0x5C]; /* 0x24 */ +} MR_LD_RAID; /* 0x80, Total Size */ + +typedef struct _MR_LD_SPAN_MAP { + MR_LD_RAID ldRaid; /* 0x00 */ + + /* 0x80, needed for GET_ARM() - R0/1/5 only. */ + U8 dataArmMap[MAX_RAIDMAP_ROW_SIZE]; + + MR_SPAN_BLOCK_INFO spanBlock[MAX_RAIDMAP_SPAN_DEPTH]; /* 0xA0 */ +} MR_LD_SPAN_MAP; /* 0x9E0 */ + +typedef struct _MR_FW_RAID_MAP { + /* total size of this structure, including this field */ + U32 totalSize; + union { + /* Simple method of version checking variables */ + struct { + U32 maxLd; + U32 maxSpanDepth; + U32 maxRowSize; + U32 maxPdCount; + U32 maxArrays; + } validationInfo; + U32 version[5]; + U32 reserved1[5]; + } u1; + + U32 ldCount; /* count of lds */ + U32 Reserved1; + + /* + * 0x20 This doesn't correspond to + * FW Ld Tgt Id to LD, but will purge. For example: if tgt Id is 4 + * and FW LD is 2, and there is only one LD, FW will populate the + * array like this. [0xFF, 0xFF, 0xFF, 0xFF, 0x0.....]. This is to + * help reduce the entire strcture size if there are few LDs or + * driver is looking info for 1 LD only. 
+ */ + U8 ldTgtIdToLd[MAX_RAIDMAP_LOGICAL_DRIVES+ \ + MAX_RAIDMAP_VIEWS]; /* 0x20 */ + /* timeout value used by driver in FP IOs */ + U8 fpPdIoTimeoutSec; + U8 reserved2[7]; + MR_ARRAY_INFO arMapInfo[MAX_RAIDMAP_ARRAYS]; /* 0x00a8 */ + MR_DEV_HANDLE_INFO devHndlInfo[MAX_RAIDMAP_PHYSICAL_DEVICES]; + + /* 0x28a8-[0 -MAX_RAIDMAP_LOGICAL_DRIVES+MAX_RAIDMAP_VIEWS+1]; */ + MR_LD_SPAN_MAP ldSpanMap[1]; +}MR_FW_RAID_MAP; /* 0x3288, Total Size */ + +typedef struct _LD_TARGET_SYNC { + U8 ldTargetId; + U8 reserved; + U16 seqNum; +} LD_TARGET_SYNC; + +#pragma pack() + +struct IO_REQUEST_INFO { + U64 ldStartBlock; + U32 numBlocks; + U16 ldTgtId; + U8 isRead; + U16 devHandle; + U64 pdBlock; + U8 fpOkForIo; + U8 ldPI; +}; + +#endif /* _FUSION_H_ */ diff --git a/usr/src/uts/common/io/mr_sas/ld_pd_map.c b/usr/src/uts/common/io/mr_sas/ld_pd_map.c new file mode 100644 index 0000000000..8fac4e7b5a --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.c @@ -0,0 +1,536 @@ +/* + * ********************************************************************** + * + * ld_pd_map.c + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + * + * + * This module contains functions for device drivers + * to get pd-ld mapping information. + * + * ********************************************************************** + */ + +#include <sys/scsi/scsi.h> +#include "mr_sas.h" +#include "ld_pd_map.h" + +/* + * This function will check if FAST IO is possible on this logical drive + * by checking the EVENT information available in the driver + */ +#define MR_LD_STATE_OPTIMAL 3 +#define ABS_DIFF(a, b) (((a) > (b)) ? 
((a) - (b)) : ((b) - (a))) + +static void mr_update_load_balance_params(MR_FW_RAID_MAP_ALL *, + PLD_LOAD_BALANCE_INFO); + +#define FALSE 0 +#define TRUE 1 + +typedef U64 REGION_KEY; +typedef U32 REGION_LEN; +extern int debug_level_g; + + +MR_LD_RAID +*MR_LdRaidGet(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].ldRaid); +} + +U16 +MR_GetLDTgtId(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].ldRaid.targetId); +} + + +static MR_SPAN_BLOCK_INFO * +MR_LdSpanInfoGet(U32 ld, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].spanBlock[0]); +} + +static U8 +MR_LdDataArmGet(U32 ld, U32 armIdx, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].dataArmMap[armIdx]); +} + +static U16 +MR_ArPdGet(U32 ar, U32 arm, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.arMapInfo[ar].pd[arm]); +} + +static U16 +MR_LdSpanArrayGet(U32 ld, U32 span, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldSpanMap[ld].spanBlock[span].span.arrayRef); +} + +static U16 +MR_PdDevHandleGet(U32 pd, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.devHndlInfo[pd].curDevHdl); +} + +U16 +MR_TargetIdToLdGet(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map) +{ + return (map->raidMap.ldTgtIdToLd[ldTgtId]); +} + +U16 +MR_CheckDIF(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map) +{ + MR_LD_RAID *raid; + U32 ld; + + ld = MR_TargetIdToLdGet(ldTgtId, map); + + if (ld >= MAX_LOGICAL_DRIVES) { + return (FALSE); + } + + raid = MR_LdRaidGet(ld, map); + + return (raid->capability.ldPiMode == 0x8); +} + +static MR_LD_SPAN * +MR_LdSpanPtrGet(U32 ld, U32 span, MR_FW_RAID_MAP_ALL *map) +{ + return (&map->raidMap.ldSpanMap[ld].spanBlock[span].span); +} + +/* + * This function will validate Map info data provided by FW + */ +U8 +MR_ValidateMapInfo(MR_FW_RAID_MAP_ALL *map, PLD_LOAD_BALANCE_INFO lbInfo) +{ + MR_FW_RAID_MAP *pFwRaidMap = &map->raidMap; + U32 fwsize = sizeof (MR_FW_RAID_MAP) - sizeof (MR_LD_SPAN_MAP) + + (sizeof (MR_LD_SPAN_MAP) * pFwRaidMap->ldCount); + + if (pFwRaidMap->totalSize != fwsize) { + + con_log(CL_ANN1, (CE_NOTE, + "map info structure size 0x%x is " + "not matching with ld count\n", fwsize)); + /* sizeof (foo) returns size_t, which is *LONG*. */ + con_log(CL_ANN1, (CE_NOTE, "span map 0x%x total size 0x%x\n",\ + (int)sizeof (MR_LD_SPAN_MAP), pFwRaidMap->totalSize)); + + return (0); + } + + mr_update_load_balance_params(map, lbInfo); + + return (1); +} + +U32 +MR_GetSpanBlock(U32 ld, U64 row, U64 *span_blk, MR_FW_RAID_MAP_ALL *map, + int *div_error) +{ + MR_SPAN_BLOCK_INFO *pSpanBlock = MR_LdSpanInfoGet(ld, map); + MR_QUAD_ELEMENT *qe; + MR_LD_RAID *raid = MR_LdRaidGet(ld, map); + U32 span, j; + + for (span = 0; span < raid->spanDepth; span++, pSpanBlock++) { + for (j = 0; j < pSpanBlock->block_span_info.noElements; j++) { + qe = &pSpanBlock->block_span_info.quads[j]; + if (qe->diff == 0) { + *div_error = 1; + return (span); + } + if (qe->logStart <= row && row <= qe->logEnd && + (((row - qe->logStart) % qe->diff)) == 0) { + if (span_blk != NULL) { + U64 blk; + blk = ((row - qe->logStart) / + (qe->diff)); + + blk = (blk + qe->offsetInSpan) << + raid->stripeShift; + *span_blk = blk; + } + return (span); + } + } + } + return (span); +} + + +/* + * ************************************************************* + * + * This routine calculates the arm, span and block for + * the specified stripe and reference in stripe. 
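+ * (A strip number here is the LD block address shifted right by stripeShift, + * i.e. divided by the strip size in blocks; the reference in stripe is the + * low-order remainder, the offset within that strip.)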
+ * + * Inputs : + * + * ld - Logical drive number + * stripRow - Stripe number + * stripRef - Reference in stripe + * + * Outputs : + * + * span - Span number + * block - Absolute Block number in the physical disk + */ +U8 +MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow, + U16 stripRef, U64 *pdBlock, U16 *pDevHandle, + MPI2_SCSI_IO_VENDOR_UNIQUE *pRAID_Context, MR_FW_RAID_MAP_ALL *map) +{ + MR_LD_RAID *raid = MR_LdRaidGet(ld, map); + U32 pd, arRef; + U8 physArm, span; + U64 row; + int error_code = 0; + U8 retval = TRUE; + U32 rowMod; + U32 armQ; + U32 arm; + + row = (stripRow / raid->rowDataSize); + + if (raid->level == 6) { + U32 logArm = (stripRow % (raid->rowDataSize)); + + if (raid->rowSize == 0) { + return (FALSE); + } + rowMod = (row % (raid->rowSize)); + armQ = raid->rowSize-1-rowMod; + arm = armQ+1+logArm; + if (arm >= raid->rowSize) + arm -= raid->rowSize; + physArm = (U8)arm; + } else { + if (raid->modFactor == 0) + return (FALSE); + physArm = MR_LdDataArmGet(ld, + (stripRow % (raid->modFactor)), map); + } + if (raid->spanDepth == 1) { + span = 0; + *pdBlock = row << raid->stripeShift; + } else + span = (U8)MR_GetSpanBlock(ld, row, pdBlock, map, &error_code); + + if (error_code == 1) + return (FALSE); + + /* Get the array on which this span is present. */ + arRef = MR_LdSpanArrayGet(ld, span, map); + /* Get the Pd. */ + pd = MR_ArPdGet(arRef, physArm, map); + /* Get dev handle from Pd. */ + if (pd != MR_PD_INVALID) { + *pDevHandle = MR_PdDevHandleGet(pd, map); + } else { + *pDevHandle = MR_PD_INVALID; /* set dev handle as invalid. */ + if ((raid->level >= 5) && + ((instance->device_id != PCI_DEVICE_ID_LSI_INVADER) || + (instance->device_id == PCI_DEVICE_ID_LSI_INVADER && + raid->regTypeReqOnRead != REGION_TYPE_UNUSED))) { + pRAID_Context->regLockFlags = REGION_TYPE_EXCLUSIVE; + } else if (raid->level == 1) { + /* Get Alternate Pd. */ + pd = MR_ArPdGet(arRef, physArm + 1, map); + /* Get dev handle from Pd. */ + if (pd != MR_PD_INVALID) + *pDevHandle = MR_PdDevHandleGet(pd, map); + } + } + + *pdBlock += stripRef + MR_LdSpanPtrGet(ld, span, map)->startBlk; + + pRAID_Context->spanArm = (span << RAID_CTX_SPANARM_SPAN_SHIFT) | + physArm; + + return (retval); +} + + + +/* + * *********************************************************************** + * + * MR_BuildRaidContext function + * + * This function will initiate command processing. The start/end row and strip + * information is calculated then the lock is acquired. + * This function will return 0 if region lock + * was acquired OR return num strips ??? 
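+ *
+ * For example, with a 128-block strip (stripeShift == 7) and
+ * rowDataSize == 2, a 300-block I/O starting at LBA 1000 covers
+ * strips 7 through 10 (num_strips == 4) and rows 3 through 5
+ * (numRows == 3).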
+ */
+
+U8
+MR_BuildRaidContext(struct mrsas_instance *instance,
+    struct IO_REQUEST_INFO *io_info, MPI2_SCSI_IO_VENDOR_UNIQUE *pRAID_Context,
+    MR_FW_RAID_MAP_ALL *map)
+{
+	MR_LD_RAID *raid;
+	U32 ld, stripSize, stripe_mask;
+	U64 endLba, endStrip, endRow;
+	U64 start_row, start_strip;
+	REGION_KEY regStart;
+	REGION_LEN regSize;
+	U8 num_strips, numRows;
+	U16 ref_in_start_stripe;
+	U16 ref_in_end_stripe;
+
+	U64 ldStartBlock;
+	U32 numBlocks, ldTgtId;
+	U8 isRead;
+	U8 retval = 0;
+
+	ldStartBlock = io_info->ldStartBlock;
+	numBlocks = io_info->numBlocks;
+	ldTgtId = io_info->ldTgtId;
+	isRead = io_info->isRead;
+
+	if (map == NULL) {
+		io_info->fpOkForIo = FALSE;
+		return (FALSE);
+	}
+
+	ld = MR_TargetIdToLdGet(ldTgtId, map);
+
+	if (ld >= MAX_LOGICAL_DRIVES) {
+		io_info->fpOkForIo = FALSE;
+		return (FALSE);
+	}
+
+	raid = MR_LdRaidGet(ld, map);
+
+	stripSize = 1 << raid->stripeShift;
+	stripe_mask = stripSize-1;
+	/*
+	 * calculate starting row and stripe, and number of strips and rows
+	 */
+	start_strip = ldStartBlock >> raid->stripeShift;
+	ref_in_start_stripe = (U16)(ldStartBlock & stripe_mask);
+	endLba = ldStartBlock + numBlocks - 1;
+	ref_in_end_stripe = (U16)(endLba & stripe_mask);
+	endStrip = endLba >> raid->stripeShift;
+	num_strips = (U8)(endStrip - start_strip + 1);
+	/* Check to make sure we are not dividing by zero */
+	if (raid->rowDataSize == 0)
+		return (FALSE);
+	start_row = (start_strip / raid->rowDataSize);
+	endRow = (endStrip / raid->rowDataSize);
+	/* get the row count */
+	numRows = (U8)(endRow - start_row + 1);
+
+	/*
+	 * calculate region info.
+	 */
+	regStart = start_row << raid->stripeShift;
+	regSize = stripSize;
+
+	/* Check if we can send this I/O via FastPath */
+	if (raid->capability.fpCapable) {
+		if (isRead)
+			io_info->fpOkForIo = (raid->capability.fpReadCapable &&
+			    ((num_strips == 1) ||
+			    raid->capability.fpReadAcrossStripe));
+		else
+			io_info->fpOkForIo =
+			    (raid->capability.fpWriteCapable &&
+			    ((num_strips == 1) ||
+			    raid->capability.fpWriteAcrossStripe));
+	} else
+		io_info->fpOkForIo = FALSE;
+
+
+	/*
+	 * Check for DIF support
+	 */
+	if (!raid->capability.ldPiMode) {
+		io_info->ldPI = FALSE;
+	} else {
+		io_info->ldPI = TRUE;
+	}
+
+	if (numRows == 1) {
+		if (num_strips == 1) {
+			regStart += ref_in_start_stripe;
+			regSize = numBlocks;
+		}
+	} else {
+		if (start_strip == (start_row + 1) * raid->rowDataSize - 1) {
+			regStart += ref_in_start_stripe;
+			regSize = stripSize - ref_in_start_stripe;
+		}
+
+		if (numRows > 2) {
+			regSize += (numRows-2) << raid->stripeShift;
+		}
+
+		if (endStrip == endRow*raid->rowDataSize) {
+			regSize += ref_in_end_stripe+1;
+		} else {
+			regSize += stripSize;
+		}
+	}
+
+	pRAID_Context->timeoutValue = map->raidMap.fpPdIoTimeoutSec;
+
+	if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) {
+		pRAID_Context->regLockFlags = (isRead) ?
+		    raid->regTypeReqOnRead : raid->regTypeReqOnWrite;
+	} else {
+		pRAID_Context->regLockFlags = (isRead) ?
+		    REGION_TYPE_SHARED_READ : raid->regTypeReqOnWrite;
+	}
+
+	pRAID_Context->ldTargetId = raid->targetId;
+	pRAID_Context->regLockRowLBA = regStart;
+	pRAID_Context->regLockLength = regSize;
+	pRAID_Context->configSeqNum = raid->seqNum;
+
+	/*
+	 * Get Phy Params only if FP capable,
+	 * or else leave it to MR firmware to do the calculation.
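+	 * If MR_GetPhyParams() below resolves the strip to an invalid
+	 * physical drive (MR_PD_INVALID), fpOkForIo is cleared again and
+	 * the I/O is left to the firmware.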
+ */ + if (io_info->fpOkForIo) { + /* if fast path possible then get the physical parameters */ + retval = MR_GetPhyParams(instance, ld, start_strip, + ref_in_start_stripe, &io_info->pdBlock, + &io_info->devHandle, pRAID_Context, map); + + /* If IO on an invalid Pd, then FP is not possible. */ + if (io_info->devHandle == MR_PD_INVALID) + io_info->fpOkForIo = FALSE; + + return (retval); + + } else if (isRead) { + uint_t stripIdx; + + for (stripIdx = 0; stripIdx < num_strips; stripIdx++) { + if (!MR_GetPhyParams(instance, ld, + start_strip + stripIdx, ref_in_start_stripe, + &io_info->pdBlock, &io_info->devHandle, + pRAID_Context, map)) { + return (TRUE); + } + } + } + return (TRUE); +} + + +void +mr_update_load_balance_params(MR_FW_RAID_MAP_ALL *map, + PLD_LOAD_BALANCE_INFO lbInfo) +{ + int ldCount; + U16 ld; + MR_LD_RAID *raid; + + for (ldCount = 0; ldCount < MAX_LOGICAL_DRIVES; ldCount++) { + ld = MR_TargetIdToLdGet(ldCount, map); + + if (ld >= MAX_LOGICAL_DRIVES) { + con_log(CL_ANN1, + (CE_NOTE, "mrsas: ld=%d Invalid ld \n", ld)); + continue; + } + + raid = MR_LdRaidGet(ld, map); + + /* Two drive Optimal RAID 1 */ + if ((raid->level == 1) && (raid->rowSize == 2) && + (raid->spanDepth == 1) && + raid->ldState == MR_LD_STATE_OPTIMAL) { + U32 pd, arRef; + + lbInfo[ldCount].loadBalanceFlag = 1; + + /* Get the array on which this span is present. */ + arRef = MR_LdSpanArrayGet(ld, 0, map); + + pd = MR_ArPdGet(arRef, 0, map); /* Get the Pd. */ + /* Get dev handle from Pd. */ + lbInfo[ldCount].raid1DevHandle[0] = + MR_PdDevHandleGet(pd, map); + + pd = MR_ArPdGet(arRef, 1, map); /* Get the Pd. */ + /* Get dev handle from Pd. */ + lbInfo[ldCount].raid1DevHandle[1] = + MR_PdDevHandleGet(pd, map); + con_log(CL_ANN1, (CE_NOTE, + "mrsas: ld=%d load balancing enabled \n", ldCount)); + } else { + lbInfo[ldCount].loadBalanceFlag = 0; + } + } +} + + +U8 +megasas_get_best_arm(PLD_LOAD_BALANCE_INFO lbInfo, U8 arm, U64 block, + U32 count) +{ + U16 pend0, pend1; + U64 diff0, diff1; + U8 bestArm; + + /* get the pending cmds for the data and mirror arms */ + pend0 = lbInfo->scsi_pending_cmds[0]; + pend1 = lbInfo->scsi_pending_cmds[1]; + + /* Determine the disk whose head is nearer to the req. block */ + diff0 = ABS_DIFF(block, lbInfo->last_accessed_block[0]); + diff1 = ABS_DIFF(block, lbInfo->last_accessed_block[1]); + bestArm = (diff0 <= diff1 ? 0 : 1); + + if ((bestArm == arm && pend0 > pend1 + 16) || + (bestArm != arm && pend1 > pend0 + 16)) { + bestArm ^= 1; + } + + /* Update the last accessed block on the correct pd */ + lbInfo->last_accessed_block[bestArm] = block + count - 1; + return (bestArm); +} + +U16 +get_updated_dev_handle(PLD_LOAD_BALANCE_INFO lbInfo, + struct IO_REQUEST_INFO *io_info) +{ + U8 arm, old_arm; + U16 devHandle; + + old_arm = lbInfo->raid1DevHandle[0] == io_info->devHandle ? 0 : 1; + + /* get best new arm */ + arm = megasas_get_best_arm(lbInfo, old_arm, io_info->ldStartBlock, + io_info->numBlocks); + + devHandle = lbInfo->raid1DevHandle[arm]; + + lbInfo->scsi_pending_cmds[arm]++; + + return (devHandle); +} diff --git a/usr/src/uts/common/io/mr_sas/ld_pd_map.h b/usr/src/uts/common/io/mr_sas/ld_pd_map.h new file mode 100644 index 0000000000..dc6f0ce957 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.h @@ -0,0 +1,249 @@ +/* + * ld_pd_map.h + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. 
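+ *
+ * This header declares the RAID map, RAID context and load-balancing
+ * structures used by the LD/PD mapping code in ld_pd_map.c.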
+ *
+ * Version:
+ * Author:
+ *	Swaminathan K S
+ *	Arun Chandrashekhar
+ *	Manju R
+ *	Rasheed
+ *	Shakeel Bukhari
+ */
+
+#ifndef	_LD_PD_MAP
+#define	_LD_PD_MAP
+#include <sys/scsi/scsi.h>
+#include "fusion.h"
+
+struct mrsas_instance;	/* This will be defined in mr_sas.h */
+
+/* raid->write_mode; raid->read_ahead; dcmd->state */
+/* Write through */
+#define	WRITE_THROUGH	0
+/* Delayed Write */
+#define	WRITE_BACK	1
+
+/* SCSI CDB definitions */
+#define	READ_6		0x08
+#define	READ_16		0x88
+#define	READ_10		0x28
+#define	READ_12		0xA8
+#define	WRITE_16	0x8A
+#define	WRITE_10	0x2A
+
+/* maximum disks per array */
+#define	MAX_ROW_SIZE	32
+/* maximum spans per logical drive */
+#define	MAX_SPAN_DEPTH	8
+#define	MEGASAS_LOAD_BALANCE_FLAG	0x1
+#define	MR_DEFAULT_IO_TIMEOUT	20
+
+
+union desc_value {
+	U64 word;
+	struct {
+		U32 low;
+		U32 high;
+	} u1;
+};
+
+typedef struct _LD_LOAD_BALANCE_INFO
+{
+	U8 loadBalanceFlag;
+	U8 reserved1;
+	U16 raid1DevHandle[2];
+	U16 scsi_pending_cmds[2];
+	U64 last_accessed_block[2];
+} LD_LOAD_BALANCE_INFO, *PLD_LOAD_BALANCE_INFO;
+
+#pragma pack(1)
+typedef struct _MR_FW_RAID_MAP_ALL {
+	MR_FW_RAID_MAP raidMap;
+	MR_LD_SPAN_MAP ldSpanMap[MAX_LOGICAL_DRIVES - 1];
+} MR_FW_RAID_MAP_ALL;
+
+/*
+ * Raid Context structure which describes MegaRAID specific IO Parameters
+ * This resides at offset 0x60 where the SGL normally starts in MPT IO Frames
+ */
+typedef struct _MPI2_SCSI_IO_VENDOR_UNIQUE {
+	U8 nsegType;	/* 0x00 nseg[7:4], Type[3:0] */
+	U8 resvd0;	/* 0x01 */
+	U16 timeoutValue;	/* 0x02 -0x03 */
+	U8 regLockFlags;	/* 0x04 */
+	U8 reservedForHw1;	/* 0x05 */
+	U16 ldTargetId;	/* 0x06 - 0x07 */
+	U64 regLockRowLBA;	/* 0x08 - 0x0F */
+	U32 regLockLength;	/* 0x10 - 0x13 */
+	U16 nextLMId;	/* 0x14 - 0x15 */
+	U8 extStatus;	/* 0x16 */
+	U8 status;	/* 0x17 status */
+	U8 RAIDFlags;	/* 0x18 resvd[7:6], ioSubType[5:4], */
+			/* resvd[3:1], preferredCpu[0] */
+	U8 numSGE;	/* 0x19 numSge; not including chain entries */
+	U16 configSeqNum;	/* 0x1A -0x1B */
+	U8 spanArm;	/* 0x1C span[7:5], arm[4:0] */
+	U8 resvd2[3];	/* 0x1D-0x1f */
+} MPI2_SCSI_IO_VENDOR_UNIQUE, MPI25_SCSI_IO_VENDOR_UNIQUE;
+
+#define	RAID_CTX_SPANARM_ARM_SHIFT	(0)
+#define	RAID_CTX_SPANARM_ARM_MASK	(0x1f)
+
+#define	RAID_CTX_SPANARM_SPAN_SHIFT	(5)
+#define	RAID_CTX_SPANARM_SPAN_MASK	(0xE0)
+
+
+/*
+ * RAID SCSI IO Request Message
+ * Total SGE count will be one less
+ * than _MPI2_SCSI_IO_REQUEST
+ */
+typedef struct _MPI2_RAID_SCSI_IO_REQUEST
+{
+	uint16_t DevHandle;	/* 0x00 */
+	uint8_t ChainOffset;	/* 0x02 */
+	uint8_t Function;	/* 0x03 */
+	uint16_t Reserved1;	/* 0x04 */
+	uint8_t Reserved2;	/* 0x06 */
+	uint8_t MsgFlags;	/* 0x07 */
+	uint8_t VP_ID;	/* 0x08 */
+	uint8_t VF_ID;	/* 0x09 */
+	uint16_t Reserved3;	/* 0x0A */
+	uint32_t SenseBufferLowAddress;	/* 0x0C */
+	uint16_t SGLFlags;	/* 0x10 */
+	uint8_t SenseBufferLength;	/* 0x12 */
+	uint8_t Reserved4;	/* 0x13 */
+	uint8_t SGLOffset0;	/* 0x14 */
+	uint8_t SGLOffset1;	/* 0x15 */
+	uint8_t SGLOffset2;	/* 0x16 */
+	uint8_t SGLOffset3;	/* 0x17 */
+	uint32_t SkipCount;	/* 0x18 */
+	uint32_t DataLength;	/* 0x1C */
+	uint32_t BidirectionalDataLength;	/* 0x20 */
+	uint16_t IoFlags;	/* 0x24 */
+	uint16_t EEDPFlags;	/* 0x26 */
+	uint32_t EEDPBlockSize;	/* 0x28 */
+	uint32_t SecondaryReferenceTag;	/* 0x2C */
+	uint16_t SecondaryApplicationTag;	/* 0x30 */
+	uint16_t ApplicationTagTranslationMask;	/* 0x32 */
+	uint8_t LUN[8];	/* 0x34 */
+	uint32_t Control;	/* 0x3C */
+	Mpi2ScsiIoCdb_t CDB;	/* 0x40 */
+	MPI2_SCSI_IO_VENDOR_UNIQUE RaidContext;	/* 0x60 */
+
Mpi2SGEIOUnion_t SGL; /* 0x80 */ +} MPI2_RAID_SCSI_IO_REQUEST, MPI2_POINTER PTR_MPI2_RAID_SCSI_IO_REQUEST, +Mpi2RaidSCSIIORequest_t, MPI2_POINTER pMpi2RaidSCSIIORequest_t; + +/* + * define region lock types + */ +typedef enum _REGION_TYPE { + REGION_TYPE_UNUSED = 0, /* lock is currently not active */ + REGION_TYPE_SHARED_READ = 1, /* shared lock (for reads) */ + REGION_TYPE_SHARED_WRITE = 2, + REGION_TYPE_EXCLUSIVE = 3 /* exclusive lock (for writes) */ +} REGION_TYPE; + + +#define DM_PATH_MAXPATH 2 +#define DM_PATH_FIRSTPATH 0 +#define DM_PATH_SECONDPATH 1 + +/* declare valid Region locking values */ +typedef enum _REGION_LOCK { + REGION_LOCK_BYPASS = 0, + /* for RAID 6 single-drive failure */ + REGION_LOCK_UNCOND_SHARED_READ = 1, + REGION_LOCK_UNCOND_SHARED_WRITE = 2, + REGION_LOCK_UNCOND_SHARED_OTHER = 3, + REGION_LOCK_UNCOND_SHARED_EXCLUSIVE = 0xFF +} REGION_LOCK; + + +struct mrsas_init_frame2 { + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ + + uint32_t context; /* 08h */ + uint32_t pad_0; /* 0Ch */ + + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t queue_info_new_phys_addr_lo; /* 18h */ + uint32_t queue_info_new_phys_addr_hi; /* 1Ch */ + uint32_t queue_info_old_phys_addr_lo; /* 20h */ + uint32_t queue_info_old_phys_addr_hi; /* 24h */ + uint64_t driverversion; /* 28h */ + uint32_t reserved_4[4]; /* 30h */ +}; + + +/* + * Request descriptor types + */ +#define MPI2_REQ_DESCRIPT_FLAGS_LD_IO 0x7 +#define MPI2_REQ_DESCRIPT_FLAGS_MFA 0x1 +#define MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK 0x2 + +#define MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT 1 + + +/* + * MPT RAID MFA IO Descriptor. + */ +typedef struct _MR_RAID_MFA_IO_DESCRIPTOR { + uint32_t RequestFlags : 8; + uint32_t MessageAddress1 : 24; /* bits 31:8 */ + uint32_t MessageAddress2; /* bits 61:32 */ +} MR_RAID_MFA_IO_REQUEST_DESCRIPTOR, +*PMR_RAID_MFA_IO_REQUEST_DESCRIPTOR; + +/* union of Request Descriptors */ +typedef union _MRSAS_REQUEST_DESCRIPTOR_UNION +{ + MPI2_DEFAULT_REQUEST_DESCRIPTOR Default; + MPI2_HIGH_PRIORITY_REQUEST_DESCRIPTOR HighPriority; + MPI2_SCSI_IO_REQUEST_DESCRIPTOR SCSIIO; + MPI2_SCSI_TARGET_REQUEST_DESCRIPTOR SCSITarget; + MPI2_RAID_ACCEL_REQUEST_DESCRIPTOR RAIDAccelerator; + MR_RAID_MFA_IO_REQUEST_DESCRIPTOR MFAIo; + U64 Words; +} MRSAS_REQUEST_DESCRIPTOR_UNION; + +#pragma pack() + +enum { + MRSAS_SCSI_VARIABLE_LENGTH_CMD = 0x7F, + MRSAS_SCSI_SERVICE_ACTION_READ32 = 0x9, + MRSAS_SCSI_SERVICE_ACTION_WRITE32 = 0xB, + MRSAS_SCSI_ADDL_CDB_LEN = 0x18, + MRSAS_RD_WR_PROTECT = 0x20, + MRSAS_EEDPBLOCKSIZE = 512 +}; + + +#define IEEE_SGE_FLAGS_ADDR_MASK (0x03) +#define IEEE_SGE_FLAGS_SYSTEM_ADDR (0x00) +#define IEEE_SGE_FLAGS_IOCDDR_ADDR (0x01) +#define IEEE_SGE_FLAGS_IOCPLB_ADDR (0x02) +#define IEEE_SGE_FLAGS_IOCPLBNTA_ADDR (0x03) +#define IEEE_SGE_FLAGS_CHAIN_ELEMENT (0x80) +#define IEEE_SGE_FLAGS_END_OF_LIST (0x40) + + +U8 MR_ValidateMapInfo(MR_FW_RAID_MAP_ALL *map, PLD_LOAD_BALANCE_INFO lbInfo); +U16 MR_CheckDIF(U32, MR_FW_RAID_MAP_ALL *); +U8 MR_BuildRaidContext(struct mrsas_instance *, struct IO_REQUEST_INFO *, + MPI2_SCSI_IO_VENDOR_UNIQUE *, MR_FW_RAID_MAP_ALL *); + +#endif /* _LD_PD_MAP */ diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.c b/usr/src/uts/common/io/mr_sas/mr_sas.c index 922fc78f8d..05fecff694 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.c +++ b/usr/src/uts/common/io/mr_sas/mr_sas.c @@ -1,16 +1,17 @@ /* * mr_sas.c: source for mr_sas driver * - 
* MegaRAID device driver for SAS2.0 controllers - * Copyright (c) 2008-2010, LSI Logic Corporation. + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. * * Version: * Author: + * Swaminathan K S * Arun Chandrashekhar * Manju R - * Rajesh Prabhakaran - * Seokmann Ju + * Rasheed + * Shakeel Bukhari * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,6 +44,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright 2012 Nexenta System, Inc. All rights reserved. */ #include <sys/types.h> @@ -83,29 +85,158 @@ */ static void *mrsas_state = NULL; static volatile boolean_t mrsas_relaxed_ordering = B_TRUE; -static volatile int debug_level_g = CL_NONE; -static volatile int msi_enable = 1; +volatile int debug_level_g = CL_NONE; +static volatile int msi_enable = 1; static volatile int ctio_enable = 1; /* Default Timeout value to issue online controller reset */ -static volatile int debug_timeout_g = 0xB4; +volatile int debug_timeout_g = 0xF0; /* 0xB4; */ /* Simulate consecutive firmware fault */ static volatile int debug_fw_faults_after_ocr_g = 0; - #ifdef OCRDEBUG /* Simulate three consecutive timeout for an IO */ static volatile int debug_consecutive_timeout_after_ocr_g = 0; #endif +#if 0 +/* Enable OCR on firmware fault */ +static volatile int debug_support_ocr_isr_g = 0; +#endif #pragma weak scsi_hba_open #pragma weak scsi_hba_close #pragma weak scsi_hba_ioctl -static ddi_dma_attr_t mrsas_generic_dma_attr = { +/* Local static prototypes. */ +static int mrsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int mrsas_attach(dev_info_t *, ddi_attach_cmd_t); +#ifdef __sparc +static int mrsas_reset(dev_info_t *, ddi_reset_cmd_t); +#else +static int mrsas_quiesce(dev_info_t *); +#endif +static int mrsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int mrsas_open(dev_t *, int, int, cred_t *); +static int mrsas_close(dev_t, int, int, cred_t *); +static int mrsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int mrsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *mrsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int mrsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int mrsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int mrsas_tran_reset(struct scsi_address *, int); +#if 0 +static int mrsas_tran_bus_reset(dev_info_t *, int); +#endif +static int mrsas_tran_getcap(struct scsi_address *, char *, int); +static int mrsas_tran_setcap(struct scsi_address *, char *, int, int); +static void mrsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void mrsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void mrsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static int mrsas_tran_quiesce(dev_info_t *dip); +static int mrsas_tran_unquiesce(dev_info_t *dip); +static uint_t mrsas_isr(); +static uint_t mrsas_softintr(); +static void mrsas_undo_resources(dev_info_t *, struct mrsas_instance *); +static struct mrsas_cmd *get_mfi_pkt(struct mrsas_instance *); +static void return_mfi_pkt(struct mrsas_instance *, + struct mrsas_cmd *); + +static void 
free_space_for_mfi(struct mrsas_instance *); +static uint32_t read_fw_status_reg_ppc(struct mrsas_instance *); +static void issue_cmd_ppc(struct mrsas_cmd *, struct mrsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct mrsas_instance *, + struct mrsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct mrsas_instance *, + struct mrsas_cmd *); +static void enable_intr_ppc(struct mrsas_instance *); +static void disable_intr_ppc(struct mrsas_instance *); +static int intr_ack_ppc(struct mrsas_instance *); +static void flush_cache(struct mrsas_instance *instance); +void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct mrsas_instance *instance); +static int handle_drv_ioctl(struct mrsas_instance *instance, + struct mrsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct mrsas_instance *instance, + struct mrsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct mrsas_instance *instance, + struct mrsas_aen *aen); +static struct mrsas_cmd *build_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +static int alloc_additional_dma_buffer(struct mrsas_instance *); +static void complete_cmd_in_sync_mode(struct mrsas_instance *, + struct mrsas_cmd *); +static int mrsas_kill_adapter(struct mrsas_instance *); +static int mrsas_issue_init_mfi(struct mrsas_instance *); +static int mrsas_reset_ppc(struct mrsas_instance *); +static uint32_t mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *); +static int wait_for_outstanding(struct mrsas_instance *instance); +static int register_mfi_aen(struct mrsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct mrsas_instance *instance, struct + mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd_to_abort); + +static void mrsas_rem_intrs(struct mrsas_instance *instance); +static int mrsas_add_intrs(struct mrsas_instance *instance, int intr_type); + +static void mrsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int mrsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int mrsas_parse_devname(char *, int *, int *); +static int mrsas_config_all_devices(struct mrsas_instance *); +static int mrsas_config_ld(struct mrsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static int mrsas_name_node(dev_info_t *, char *, int); +static void mrsas_issue_evt_taskq(struct mrsas_eventinfo *); +static void free_additional_dma_buffer(struct mrsas_instance *); +static void io_timeout_checker(void *); +static void mrsas_fm_init(struct mrsas_instance *); +static void mrsas_fm_fini(struct mrsas_instance *); + +static struct mrsas_function_template mrsas_function_template_ppc = { + .read_fw_status_reg = read_fw_status_reg_ppc, + .issue_cmd = issue_cmd_ppc, + .issue_cmd_in_sync_mode = issue_cmd_in_sync_mode_ppc, + .issue_cmd_in_poll_mode = issue_cmd_in_poll_mode_ppc, + .enable_intr = enable_intr_ppc, + .disable_intr = disable_intr_ppc, + .intr_ack = intr_ack_ppc, + .init_adapter = mrsas_init_adapter_ppc +/* 
.reset_adapter = mrsas_reset_adapter_ppc */ +}; + + +static struct mrsas_function_template mrsas_function_template_fusion = { + .read_fw_status_reg = tbolt_read_fw_status_reg, + .issue_cmd = tbolt_issue_cmd, + .issue_cmd_in_sync_mode = tbolt_issue_cmd_in_sync_mode, + .issue_cmd_in_poll_mode = tbolt_issue_cmd_in_poll_mode, + .enable_intr = tbolt_enable_intr, + .disable_intr = tbolt_disable_intr, + .intr_ack = tbolt_intr_ack, + .init_adapter = mrsas_init_adapter_tbolt +/* .reset_adapter = mrsas_reset_adapter_tbolt */ +}; + + +ddi_dma_attr_t mrsas_generic_dma_attr = { DMA_ATTR_V0, /* dma_attr_version */ 0, /* low DMA address range */ 0xFFFFFFFFU, /* high DMA address range */ - 0xFFFFFFFFU, /* DMA counter register */ + 0xFFFFFFFFU, /* DMA counter register */ 8, /* DMA address alignment */ 0x07, /* DMA burstsizes */ 1, /* min DMA size */ @@ -119,6 +250,12 @@ static ddi_dma_attr_t mrsas_generic_dma_attr = { int32_t mrsas_max_cap_maxxfer = 0x1000000; /* + * Fix for: Thunderbolt controller IO timeout when IO write size is 1MEG, + * Limit size to 256K + */ +uint32_t mrsas_tbolt_max_cap_maxxfer = (512 * 512); + +/* * cb_ops contains base level routines */ static struct cb_ops mrsas_cb_ops = { @@ -153,19 +290,20 @@ static struct dev_ops mrsas_ops = { nulldev, /* probe */ mrsas_attach, /* attach */ mrsas_detach, /* detach */ -#ifdef __sparc +#ifdef __sparc mrsas_reset, /* reset */ #else /* __sparc */ nodev, -#endif /* __sparc */ +#endif /* __sparc */ &mrsas_cb_ops, /* char/block ops */ NULL, /* bus ops */ NULL, /* power */ -#ifdef __sparc +#ifdef __sparc ddi_quiesce_not_needed #else /* __sparc */ - mrsas_quiesce /* quiesce */ + mrsas_quiesce /* quiesce */ #endif /* __sparc */ + }; static struct modldrv modldrv = { @@ -188,14 +326,28 @@ static struct ddi_device_acc_attr endian_attr = { }; +unsigned int enable_fp = 1; + + /* * ************************************************************************** * - * * - * common entry points - for loadable kernel modules * - * * + * * + * common entry points - for loadable kernel modules * + * * * ************************************************************************** * */ +/* + * _init - initialize a loadable module + * @void + * + * The driver should perform any one-time resource allocation or data + * initialization during driver loading in _init(). For example, the driver + * should initialize any mutexes global to the driver in this routine. + * The driver should not, however, use _init() to allocate or initialize + * anything that has to do with a particular instance of the device. + * Per-instance initialization must be done in attach(). + */ int _init(void) { @@ -207,12 +359,12 @@ _init(void) sizeof (struct mrsas_instance), 0); if (ret != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: could not init state")); + cmn_err(CE_WARN, "mr_sas: could not init state"); return (ret); } if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: could not init scsi hba")); + cmn_err(CE_WARN, "mr_sas: could not init scsi hba"); ddi_soft_state_fini(&mrsas_state); return (ret); } @@ -220,7 +372,7 @@ _init(void) ret = mod_install(&modlinkage); if (ret != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: mod_install failed")); + cmn_err(CE_WARN, "mr_sas: mod_install failed"); scsi_hba_fini(&modlinkage); ddi_soft_state_fini(&mrsas_state); } @@ -228,6 +380,13 @@ _init(void) return (ret); } +/* + * _info - returns information about a loadable module. + * @void + * + * _info() is called to return module information. 
This is a typical entry + * point that does predefined role. It simply calls mod_info(). + */ int _info(struct modinfo *modinfop) { @@ -236,6 +395,13 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } +/* + * _fini - prepare a loadable module for unloading + * @void + * + * In _fini(), the driver should release any resources that were allocated in + * _init(). The driver must remove itself from the system module list. + */ int _fini(void) { @@ -243,12 +409,17 @@ _fini(void) con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); - if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN1, + (CE_WARN, "_fini: mod_remove() failed, error 0x%X", ret)); return (ret); + } scsi_hba_fini(&modlinkage); + con_log(CL_DLEVEL1, (CE_NOTE, "_fini: scsi_hba_fini() done.")); ddi_soft_state_fini(&mrsas_state); + con_log(CL_DLEVEL1, (CE_NOTE, "_fini: ddi_soft_state_fini() done.")); return (ret); } @@ -256,24 +427,41 @@ _fini(void) /* * ************************************************************************** * - * * - * common entry points - for autoconfiguration * - * * + * * + * common entry points - for autoconfiguration * + * * * ************************************************************************** * */ - +/* + * attach - adds a device to the system as part of initialization + * @dip: + * @cmd: + * + * The kernel calls a driver's attach() entry point to attach an instance of + * a device (for MegaRAID, it is instance of a controller) or to resume + * operation for an instance of a device that has been suspended or has been + * shut down by the power management framework + * The attach() entry point typically includes the following types of + * processing: + * - allocate a soft-state structure for the device instance (for MegaRAID, + * controller instance) + * - initialize per-instance mutexes + * - initialize condition variables + * - register the device's interrupts (for MegaRAID, controller's interrupts) + * - map the registers and memory of the device instance (for MegaRAID, + * controller instance) + * - create minor device nodes for the device instance (for MegaRAID, + * controller instance) + * - report that the device instance (for MegaRAID, controller instance) has + * attached + */ static int mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int instance_no; int nregs; - uint8_t added_isr_f = 0; - uint8_t added_soft_isr_f = 0; - uint8_t create_devctl_node_f = 0; - uint8_t create_scsi_node_f = 0; - uint8_t create_ioc_node_f = 0; - uint8_t tran_alloc_f = 0; - uint8_t irq; + int i = 0; + uint8_t irq; uint16_t vendor_id; uint16_t device_id; uint16_t subsysvid; @@ -284,7 +472,7 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) char *data; scsi_hba_tran_t *tran; - ddi_dma_attr_t tran_dma_attr; + ddi_dma_attr_t tran_dma_attr; struct mrsas_instance *instance; con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); @@ -298,481 +486,533 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * check to see whether this device is in a DMA-capable slot. 
*/ if (ddi_slaveonly(dip) == DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, + cmn_err(CE_WARN, "mr_sas%d: Device in slave-only slot, unused", - instance_no)); + instance_no); return (DDI_FAILURE); } switch (cmd) { - case DDI_ATTACH: - con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: DDI_ATTACH")); - /* allocate the soft state for the instance */ - if (ddi_soft_state_zalloc(mrsas_state, instance_no) - != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: Failed to allocate soft state", - instance_no)); + case DDI_ATTACH: + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(mrsas_state, instance_no) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas%d: Failed to allocate soft state", + instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); - } + instance = (struct mrsas_instance *)ddi_get_soft_state + (mrsas_state, instance_no); - instance = (struct mrsas_instance *)ddi_get_soft_state - (mrsas_state, instance_no); + if (instance == NULL) { + cmn_err(CE_WARN, + "mr_sas%d: Bad soft state", instance_no); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - if (instance == NULL) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: Bad soft state", instance_no)); + bzero(instance, sizeof (struct mrsas_instance)); - ddi_soft_state_free(mrsas_state, instance_no); + instance->unroll.softs = 1; - return (DDI_FAILURE); - } + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas%d: pci config setup failed ", + instance_no); + + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } + if (instance->pci_handle == NULL) { + cmn_err(CE_WARN, + "mr_sas%d: pci config setup failed ", + instance_no); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - bzero((caddr_t)instance, - sizeof (struct mrsas_instance)); - instance->func_ptr = kmem_zalloc( - sizeof (struct mrsas_func_ptr), KM_SLEEP); - ASSERT(instance->func_ptr); - /* Setup the PCI configuration space handles */ - if (pci_config_setup(dip, &instance->pci_handle) != - DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas%d: pci config setup failed ", - instance_no)); + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas: failed to get registers."); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); - } + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); - if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to get registers.")); + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); - pci_config_teardown(&instance->pci_handle); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); - return (DDI_FAILURE); - } + con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, 
MRSAS_VERSION)); - vendor_id = pci_config_get16(instance->pci_handle, - PCI_CONF_VENID); - device_id = pci_config_get16(instance->pci_handle, - PCI_CONF_DEVID); + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); - subsysvid = pci_config_get16(instance->pci_handle, - PCI_CONF_SUBVENID); - subsysid = pci_config_get16(instance->pci_handle, - PCI_CONF_SUBSYSID); + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; - pci_config_put16(instance->pci_handle, PCI_CONF_COMM, - (pci_config_get16(instance->pci_handle, - PCI_CONF_COMM) | PCI_COMM_ME)); - irq = pci_config_get8(instance->pci_handle, - PCI_CONF_ILINE); + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + con_log(CL_ANN, (CE_CONT, "mr_sas%d: " + "enable bus-mastering", instance_no)); + } else { con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", - instance_no, vendor_id, device_id, subsysvid, - subsysid, irq, MRSAS_VERSION)); + "bus-mastering already set", instance_no)); + } - /* enable bus-mastering */ - command = pci_config_get16(instance->pci_handle, - PCI_CONF_COMM); + /* initialize function pointers */ + switch (device_id) { + case PCI_DEVICE_ID_LSI_TBOLT: + case PCI_DEVICE_ID_LSI_INVADER: + con_log(CL_ANN, (CE_NOTE, + "mr_sas: 2208 T.B. device detected")); - if (!(command & PCI_COMM_ME)) { - command |= PCI_COMM_ME; + instance->func_ptr = + &mrsas_function_template_fusion; + instance->tbolt = 1; + break; - pci_config_put16(instance->pci_handle, - PCI_CONF_COMM, command); + case PCI_DEVICE_ID_LSI_2108VDE: + case PCI_DEVICE_ID_LSI_2108V: + con_log(CL_ANN, (CE_NOTE, + "mr_sas: 2108 Liberator device detected")); - con_log(CL_ANN, (CE_CONT, "mr_sas%d: " - "enable bus-mastering", instance_no)); - } else { - con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "bus-mastering already set", instance_no)); - } + instance->func_ptr = + &mrsas_function_template_ppc; + break; - /* initialize function pointers */ - if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || - (device_id == PCI_DEVICE_ID_LSI_2108V)) { - con_log(CL_DLEVEL1, (CE_CONT, "mr_sas%d: " - "2108V/DE detected", instance_no)); - instance->func_ptr->read_fw_status_reg = - read_fw_status_reg_ppc; - instance->func_ptr->issue_cmd = issue_cmd_ppc; - instance->func_ptr->issue_cmd_in_sync_mode = - issue_cmd_in_sync_mode_ppc; - instance->func_ptr->issue_cmd_in_poll_mode = - issue_cmd_in_poll_mode_ppc; - instance->func_ptr->enable_intr = - enable_intr_ppc; - instance->func_ptr->disable_intr = - disable_intr_ppc; - instance->func_ptr->intr_ack = intr_ack_ppc; - } else { - con_log(CL_ANN, (CE_WARN, - "mr_sas: Invalid device detected")); + default: + cmn_err(CE_WARN, + "mr_sas: Invalid device detected"); - pci_config_teardown(&instance->pci_handle); - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); - ddi_soft_state_free(mrsas_state, instance_no); + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + return (DDI_FAILURE); + } - return (DDI_FAILURE); + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + 
DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + mrsas_fm_init(instance); + + /* Setup register map */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + goto fail_attach; + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "mr_sas: register length to map is 0x%lx bytes", + reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas: couldn't map control registers"); + goto fail_attach; + } + if (instance->regmap_handle == NULL) { + cmn_err(CE_WARN, + "mr_sas: couldn't map control registers"); + goto fail_attach; + } + + instance->unroll.regs = 1; + + /* + * Disable Interrupt Now. + * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + msi_enable = 0; + con_log(CL_ANN1, (CE_WARN, + "msi_enable = %d disabled", msi_enable)); } + ddi_prop_free(data); + } - instance->baseaddress = pci_config_get32( - instance->pci_handle, PCI_CONF_BASE0); - instance->baseaddress &= 0x0fffc; - - instance->dip = dip; - instance->vendor_id = vendor_id; - instance->device_id = device_id; - instance->subsysvid = subsysvid; - instance->subsysid = subsysid; - instance->instance = instance_no; - - /* Initialize FMA */ - instance->fm_capabilities = ddi_prop_get_int( - DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, - "fm-capable", DDI_FM_EREPORT_CAPABLE | - DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE - | DDI_FM_ERRCB_CAPABLE); - - mrsas_fm_init(instance); - - /* Initialize Interrupts */ - if ((ddi_dev_regsize(instance->dip, - REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || - reglength < MINIMUM_MFI_MEM_SZ) { - return (DDI_FAILURE); + con_log(CL_DLEVEL1, (CE_NOTE, "msi_enable = %d", msi_enable)); + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-fp", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + enable_fp = 0; + cmn_err(CE_NOTE, + "enable_fp = %d, Fast-Path disabled.\n", + enable_fp); + } + + ddi_prop_free(data); + } + + cmn_err(CE_NOTE, "enable_fp = %d\n", enable_fp); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "ddi_intr_get_supported_types() failed"); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_MSIX) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "MSIX interrupt query failed"); + goto fail_attach; } - if (reglength > DEFAULT_MFI_MEM_SZ) { - reglength = DEFAULT_MFI_MEM_SZ; - con_log(CL_DLEVEL1, (CE_NOTE, - "mr_sas: register length to map is " - "0x%lx bytes", reglength)); + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & DDI_INTR_TYPE_MSI)) { + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_MSI) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "MSI interrupt query failed"); + goto fail_attach; } - if (ddi_regs_map_setup(instance->dip, - REGISTER_SET_IO_2108, &instance->regmap, 0, - reglength, &endian_attr, &instance->regmap_handle) - != DDI_SUCCESS) { - con_log(CL_ANN, 
(CE_NOTE, - "mr_sas: couldn't map control registers")); + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (mrsas_add_intrs(instance, DDI_INTR_TYPE_FIXED) != + DDI_SUCCESS) { + cmn_err(CE_WARN, + "FIXED interrupt query failed"); goto fail_attach; } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + cmn_err(CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts"); + goto fail_attach; + } - /* - * Disable Interrupt Now. - * Setup Software interrupt - */ - instance->func_ptr->disable_intr(instance); + instance->unroll.intr = 1; - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, - "mrsas-enable-msi", &data) == DDI_SUCCESS) { - if (strncmp(data, "no", 3) == 0) { - msi_enable = 0; - con_log(CL_ANN1, (CE_WARN, - "msi_enable = %d disabled", - msi_enable)); - } - ddi_prop_free(data); + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "mrsas-enable-ctio", &data) == DDI_SUCCESS) { + if (strncmp(data, "no", 3) == 0) { + ctio_enable = 0; + con_log(CL_ANN1, (CE_WARN, + "ctio_enable = %d disabled", ctio_enable)); } + ddi_prop_free(data); + } - con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", - msi_enable)); + con_log(CL_DLEVEL1, (CE_WARN, "ctio_enable = %d", ctio_enable)); - /* Check for all supported interrupt types */ - if (ddi_intr_get_supported_types( - dip, &intr_types) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "ddi_intr_get_supported_types() failed")); - goto fail_attach; - } + /* setup the mfi based low level driver */ + if (mrsas_init_adapter(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, "mr_sas: " + "could not initialize the low level driver"); - con_log(CL_DLEVEL1, (CE_NOTE, - "ddi_intr_get_supported_types() ret: 0x%x", - intr_types)); + goto fail_attach; + } - /* Initialize and Setup Interrupt handler */ - if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "MSIX interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_MSIX; - } else if (msi_enable && (intr_types & - DDI_INTR_TYPE_MSI)) { - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "MSI interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_MSI; - } else if (intr_types & DDI_INTR_TYPE_FIXED) { - msi_enable = 0; - if (mrsas_add_intrs(instance, - DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "FIXED interrupt query failed")); - goto fail_attach; - } - instance->intr_type = DDI_INTR_TYPE_FIXED; - } else { - con_log(CL_ANN, (CE_WARN, "Device cannot " - "suppport either FIXED or MSI/X " - "interrupts")); - goto fail_attach; - } + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - added_isr_f = 1; + mutex_init(&instance->sync_map_mtx, + "sync_map_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, - "mrsas-enable-ctio", &data) == DDI_SUCCESS) { - if (strncmp(data, "no", 3) == 0) { - ctio_enable = 0; - con_log(CL_ANN1, (CE_WARN, - "ctio_enable = %d disabled", - ctio_enable)); - } - ddi_prop_free(data); - } + mutex_init(&instance->app_cmd_pool_mtx, + "app_cmd_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - con_log(CL_DLEVEL1, (CE_WARN, "ctio_enable = %d", - ctio_enable)); + 
mutex_init(&instance->config_dev_mtx, "config_dev_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - /* setup the mfi based low level driver */ - if (init_mfi(instance) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, "mr_sas: " - "could not initialize the low level driver")); + mutex_init(&instance->cmd_pend_mtx, "cmd_pend_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - goto fail_attach; - } + mutex_init(&instance->ocr_flags_mtx, "ocr_flags_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - /* Initialize all Mutex */ - INIT_LIST_HEAD(&instance->completed_pool_list); - mutex_init(&instance->completed_pool_mtx, - "completed_pool_mtx", MUTEX_DRIVER, - DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); - mutex_init(&instance->app_cmd_pool_mtx, - "app_cmd_pool_mtx", MUTEX_DRIVER, - DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->reg_write_mtx, "reg_write_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->cmd_pend_mtx, "cmd_pend_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + if (instance->tbolt) { + mutex_init(&instance->cmd_app_pool_mtx, + "cmd_app_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->ocr_flags_mtx, "ocr_flags_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + mutex_init(&instance->chip_mtx, + "chip_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); - mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); - cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + } - mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", - MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + instance->unroll.mutexs = 1; - instance->timeout_id = (timeout_id_t)-1; + instance->timeout_id = (timeout_id_t)-1; - /* Register our soft-isr for highlevel interrupts. */ - instance->isr_level = instance->intr_pri; + /* Register our soft-isr for highlevel interrupts. 
*/ + instance->isr_level = instance->intr_pri; + if (!(instance->tbolt)) { if (instance->isr_level == HIGH_LEVEL_INTR) { - if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + if (ddi_add_softintr(dip, + DDI_SOFTINT_HIGH, &instance->soft_intr_id, NULL, NULL, mrsas_softintr, (caddr_t)instance) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - " Software ISR did not register")); + cmn_err(CE_WARN, + "Software ISR did not register"); goto fail_attach; } - added_soft_isr_f = 1; - } - - /* Allocate a transport structure */ - tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + instance->unroll.soft_isr = 1; - if (tran == NULL) { - con_log(CL_ANN, (CE_WARN, - "scsi_hba_tran_alloc failed")); - goto fail_attach; } + } + + instance->softint_running = 0; - tran_alloc_f = 1; + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); - instance->tran = tran; + if (tran == NULL) { + cmn_err(CE_WARN, + "scsi_hba_tran_alloc failed"); + goto fail_attach; + } - tran->tran_hba_private = instance; - tran->tran_tgt_init = mrsas_tran_tgt_init; - tran->tran_tgt_probe = scsi_hba_probe; - tran->tran_tgt_free = mrsas_tran_tgt_free; + instance->tran = tran; + instance->unroll.tran = 1; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = mrsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = mrsas_tran_tgt_free; + if (instance->tbolt) { + tran->tran_init_pkt = + mrsas_tbolt_tran_init_pkt; + tran->tran_start = + mrsas_tbolt_tran_start; + } else { tran->tran_init_pkt = mrsas_tran_init_pkt; tran->tran_start = mrsas_tran_start; - tran->tran_abort = mrsas_tran_abort; - tran->tran_reset = mrsas_tran_reset; - tran->tran_getcap = mrsas_tran_getcap; - tran->tran_setcap = mrsas_tran_setcap; - tran->tran_destroy_pkt = mrsas_tran_destroy_pkt; - tran->tran_dmafree = mrsas_tran_dmafree; - tran->tran_sync_pkt = mrsas_tran_sync_pkt; - tran->tran_bus_config = mrsas_tran_bus_config; - - if (mrsas_relaxed_ordering) - mrsas_generic_dma_attr.dma_attr_flags |= - DDI_DMA_RELAXED_ORDERING; - - - tran_dma_attr = mrsas_generic_dma_attr; - tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; - - /* Attach this instance of the hba */ - if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) - != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "scsi_hba_attach failed")); + } + tran->tran_abort = mrsas_tran_abort; + tran->tran_reset = mrsas_tran_reset; + tran->tran_getcap = mrsas_tran_getcap; + tran->tran_setcap = mrsas_tran_setcap; + tran->tran_destroy_pkt = mrsas_tran_destroy_pkt; + tran->tran_dmafree = mrsas_tran_dmafree; + tran->tran_sync_pkt = mrsas_tran_sync_pkt; + tran->tran_quiesce = mrsas_tran_quiesce; + tran->tran_unquiesce = mrsas_tran_unquiesce; + tran->tran_bus_config = mrsas_tran_bus_config; + + if (mrsas_relaxed_ordering) + mrsas_generic_dma_attr.dma_attr_flags |= + DDI_DMA_RELAXED_ORDERING; + + + tran_dma_attr = mrsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + cmn_err(CE_WARN, + "scsi_hba_attach failed"); + + goto fail_attach; + } + instance->unroll.tranSetup = 1; + con_log(CL_ANN1, + (CE_CONT, "scsi_hba_attach_setup() done.")); + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create devctl node."); + + goto fail_attach; + } - goto fail_attach; - } + 
instance->unroll.devctl = 1; - /* create devctl node for cfgadm command */ - if (ddi_create_minor_node(dip, "devctl", - S_IFCHR, INST2DEVCTL(instance_no), - DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create devctl node.")); + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create scsi node."); - goto fail_attach; - } + goto fail_attach; + } - create_devctl_node_f = 1; + instance->unroll.scsictl = 1; - /* create scsi node for cfgadm command */ - if (ddi_create_minor_node(dip, "scsi", S_IFCHR, - INST2SCSI(instance_no), - DDI_NT_SCSI_ATTACHMENT_POINT, 0) == - DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create scsi node.")); + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); - goto fail_attach; - } + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), DDI_PSEUDO, 0) == + DDI_FAILURE) { + cmn_err(CE_WARN, + "mr_sas: failed to create ioctl node."); - create_scsi_node_f = 1; + goto fail_attach; + } - (void) sprintf(instance->iocnode, "%d:lsirdctl", - instance_no); + instance->unroll.ioctl = 1; - /* - * Create a node for applications - * for issuing ioctl to the driver. - */ - if (ddi_create_minor_node(dip, instance->iocnode, - S_IFCHR, INST2LSIRDCTL(instance_no), - DDI_PSEUDO, 0) == DDI_FAILURE) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create ioctl node.")); + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "mrsas_dr_taskq", 1, TASKQ_DEFAULTPRI, 0)) == NULL) { + cmn_err(CE_WARN, + "mr_sas: failed to create taskq "); + instance->taskq = NULL; + goto fail_attach; + } + instance->unroll.taskq = 1; + con_log(CL_ANN1, (CE_CONT, "ddi_taskq_create() done.")); - goto fail_attach; - } + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); - create_ioc_node_f = 1; + /* initiate AEN */ + if (start_mfi_aen(instance)) { + cmn_err(CE_WARN, + "mr_sas: failed to initiate AEN."); + goto fail_attach; + } + instance->unroll.aenPend = 1; + con_log(CL_ANN1, + (CE_CONT, "AEN started for instance %d.", instance_no)); - /* Create a taskq to handle dr events */ - if ((instance->taskq = ddi_taskq_create(dip, - "mrsas_dr_taskq", 1, - TASKQ_DEFAULTPRI, 0)) == NULL) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to create taskq ")); - instance->taskq = NULL; - goto fail_attach; - } + /* Finally! We are on the air. */ + ddi_report_dev(dip); - /* enable interrupt */ - instance->func_ptr->enable_intr(instance); + /* FMA handle checking. 
*/ + if (mrsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (mrsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } - /* initiate AEN */ - if (start_mfi_aen(instance)) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: failed to initiate AEN.")); - goto fail_initiate_aen; + instance->mr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct mrsas_ld), + KM_SLEEP); + if (instance->mr_ld_list == NULL) { + cmn_err(CE_WARN, "mr_sas attach(): " + "failed to allocate ld_list array"); + goto fail_attach; + } + instance->unroll.ldlist_buff = 1; + +#ifdef PDSUPPORT + if (instance->tbolt) { + instance->mr_tbolt_pd_max = MRSAS_TBOLT_PD_TGT_MAX; + instance->mr_tbolt_pd_list = + kmem_zalloc(MRSAS_TBOLT_GET_PD_MAX(instance) * + sizeof (struct mrsas_tbolt_pd), KM_SLEEP); + ASSERT(instance->mr_tbolt_pd_list); + for (i = 0; i < instance->mr_tbolt_pd_max; i++) { + instance->mr_tbolt_pd_list[i].lun_type = + MRSAS_TBOLT_PD_LUN; + instance->mr_tbolt_pd_list[i].dev_id = + (uint8_t)i; } - con_log(CL_DLEVEL1, (CE_NOTE, - "AEN started for instance %d.", instance_no)); - - /* Finally! We are on the air. */ - ddi_report_dev(dip); - - if (mrsas_check_acc_handle(instance->regmap_handle) != - DDI_SUCCESS) { - goto fail_attach; - } - if (mrsas_check_acc_handle(instance->pci_handle) != - DDI_SUCCESS) { - goto fail_attach; - } - instance->mr_ld_list = - kmem_zalloc(MRDRV_MAX_LD * sizeof (struct mrsas_ld), - KM_SLEEP); - break; - case DDI_PM_RESUME: - con_log(CL_ANN, (CE_NOTE, - "mr_sas: DDI_PM_RESUME")); - break; - case DDI_RESUME: - con_log(CL_ANN, (CE_NOTE, - "mr_sas: DDI_RESUME")); - break; - default: - con_log(CL_ANN, (CE_WARN, - "mr_sas: invalid attach cmd=%x", cmd)); - return (DDI_FAILURE); + instance->unroll.pdlist_buff = 1; + } +#endif + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, "mr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, "mr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, + (CE_WARN, "mr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); } + + cmn_err(CE_NOTE, "mrsas_attach() return SUCCESS instance_num %d", + instance_no); return (DDI_SUCCESS); -fail_initiate_aen: fail_attach: - if (create_devctl_node_f) { - ddi_remove_minor_node(dip, "devctl"); - } - - if (create_scsi_node_f) { - ddi_remove_minor_node(dip, "scsi"); - } - - if (create_ioc_node_f) { - ddi_remove_minor_node(dip, instance->iocnode); - } - - if (tran_alloc_f) { - scsi_hba_tran_free(tran); - } - - - if (added_soft_isr_f) { - ddi_remove_softintr(instance->soft_intr_id); - } - - if (added_isr_f) { - mrsas_rem_intrs(instance); - } - if (instance && instance->taskq) { - ddi_taskq_destroy(instance->taskq); - } + mrsas_undo_resources(dip, instance); mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -780,15 +1020,30 @@ fail_attach: mrsas_fm_fini(instance); pci_config_teardown(&instance->pci_handle); - ddi_soft_state_free(mrsas_state, instance_no); - con_log(CL_ANN, (CE_NOTE, - "mr_sas: return failure from mrsas_attach")); + con_log(CL_ANN, (CE_WARN, "mr_sas: return failure from mrsas_attach")); + + cmn_err(CE_WARN, "mrsas_attach() return FAILURE instance_num %d", + instance_no); return (DDI_FAILURE); } +/* + * getinfo - gets device information + * @dip: + * @cmd: + * @arg: + * @resultp: + * + * The system calls getinfo() to obtain configuration information that only + * the driver knows. 
The mapping of minor numbers to device instance is + * entirely under the control of the driver. The system sometimes needs to ask + * the driver which device a particular dev_t represents. + * Given the device number return the devinfo pointer from the scsi_device + * structure. + */ /*ARGSUSED*/ static int mrsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) @@ -827,6 +1082,19 @@ mrsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) return (rval); } +/* + * detach - detaches a device from the system + * @dip: pointer to the device's dev_info structure + * @cmd: type of detach + * + * A driver's detach() entry point is called to detach an instance of a device + * that is bound to the driver. The entry point is called with the instance of + * the device node to be detached and with DDI_DETACH, which is specified as + * the cmd argument to the entry point. + * This routine is called during driver unload. We free all the allocated + * resources and call the corresponding LLD so that it can also release all + * its resources. + */ static int mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { @@ -834,7 +1102,8 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) struct mrsas_instance *instance; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + /* CONSTCOND */ ASSERT(NO_COMPETING_THREADS); @@ -845,9 +1114,9 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) instance_no); if (!instance) { - con_log(CL_ANN, (CE_WARN, + cmn_err(CE_WARN, "mr_sas:%d could not get instance in detach", - instance_no)); + instance_no); return (DDI_FAILURE); } @@ -858,84 +1127,253 @@ mrsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) instance->subsysvid, instance->subsysid)); switch (cmd) { - case DDI_DETACH: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_DETACH")); + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_DETACH")); + + mutex_enter(&instance->config_dev_mtx); + if (instance->timeout_id != (timeout_id_t)-1) { + mutex_exit(&instance->config_dev_mtx); + (void) untimeout(instance->timeout_id); + instance->timeout_id = (timeout_id_t)-1; + mutex_enter(&instance->config_dev_mtx); + instance->unroll.timer = 0; + } + mutex_exit(&instance->config_dev_mtx); - if (scsi_hba_detach(dip) != DDI_SUCCESS) { - con_log(CL_ANN, (CE_WARN, - "mr_sas:%d failed to detach", - instance_no)); + if (instance->unroll.tranSetup == 1) { + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas2%d: failed to detach", + instance_no); + return (DDI_FAILURE); + } + instance->unroll.tranSetup = 0; + con_log(CL_ANN1, + (CE_CONT, "scsi_hba_dettach() done.")); + } + + flush_cache(instance); + + mrsas_undo_resources(dip, instance); + + mrsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + ddi_soft_state_free(mrsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "mrsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + + +static void +mrsas_undo_resources(dev_info_t *dip, struct mrsas_instance *instance) +{ + int instance_no; + + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + + instance_no = ddi_get_instance(dip); + + + if (instance->unroll.ioctl == 1) { + ddi_remove_minor_node(dip, 
instance->iocnode); + instance->unroll.ioctl = 0; + } + + if (instance->unroll.scsictl == 1) { + ddi_remove_minor_node(dip, "scsi"); + instance->unroll.scsictl = 0; + } + + if (instance->unroll.devctl == 1) { + ddi_remove_minor_node(dip, "devctl"); + instance->unroll.devctl = 0; + } + + if (instance->unroll.tranSetup == 1) { + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + cmn_err(CE_WARN, + "mr_sas2%d: failed to detach", instance_no); + return; /* DDI_FAILURE */ } + instance->unroll.tranSetup = 0; + con_log(CL_ANN1, (CE_CONT, "scsi_hba_dettach() done.")); + } + if (instance->unroll.tran == 1) { scsi_hba_tran_free(instance->tran); + instance->unroll.tran = 0; + con_log(CL_ANN1, (CE_CONT, "scsi_hba_tran_free() done.")); + } - flush_cache(instance); + if (instance->unroll.syncCmd == 1) { + if (instance->tbolt) { + if (abort_syncmap_cmd(instance, + instance->map_update_cmd)) { + cmn_err(CE_WARN, "mrsas_detach: " + "failed to abort previous syncmap command"); + } - if (abort_aen_cmd(instance, instance->aen_cmd)) { - con_log(CL_ANN, (CE_WARN, "mrsas_detach: " - "failed to abort prevous AEN command")); + instance->unroll.syncCmd = 0; + con_log(CL_ANN1, (CE_CONT, "sync cmd aborted, done.")); + } + } - return (DDI_FAILURE); + if (instance->unroll.aenPend == 1) { + if (abort_aen_cmd(instance, instance->aen_cmd)) + cmn_err(CE_WARN, "mrsas_detach: " + "failed to abort prevous AEN command"); + + instance->unroll.aenPend = 0; + con_log(CL_ANN1, (CE_CONT, "aen cmd aborted, done.")); + /* This means the controller is fully initialzed and running */ + /* Shutdown should be a last command to controller. */ + /* shutdown_controller(); */ + } + + + if (instance->unroll.timer == 1) { + if (instance->timeout_id != (timeout_id_t)-1) { + (void) untimeout(instance->timeout_id); + instance->timeout_id = (timeout_id_t)-1; + + instance->unroll.timer = 0; } + } - instance->func_ptr->disable_intr(instance); + instance->func_ptr->disable_intr(instance); - if (instance->isr_level == HIGH_LEVEL_INTR) { - ddi_remove_softintr(instance->soft_intr_id); + + if (instance->unroll.mutexs == 1) { + mutex_destroy(&instance->cmd_pool_mtx); + mutex_destroy(&instance->app_cmd_pool_mtx); + mutex_destroy(&instance->cmd_pend_mtx); + mutex_destroy(&instance->completed_pool_mtx); + mutex_destroy(&instance->sync_map_mtx); + mutex_destroy(&instance->int_cmd_mtx); + cv_destroy(&instance->int_cmd_cv); + mutex_destroy(&instance->config_dev_mtx); + mutex_destroy(&instance->ocr_flags_mtx); + mutex_destroy(&instance->reg_write_mtx); + + if (instance->tbolt) { + mutex_destroy(&instance->cmd_app_pool_mtx); + mutex_destroy(&instance->chip_mtx); } + instance->unroll.mutexs = 0; + con_log(CL_ANN1, (CE_CONT, "Destroy mutex & cv, done.")); + } + + + if (instance->unroll.soft_isr == 1) { + ddi_remove_softintr(instance->soft_intr_id); + instance->unroll.soft_isr = 0; + } + + if (instance->unroll.intr == 1) { mrsas_rem_intrs(instance); + instance->unroll.intr = 0; + } + + if (instance->unroll.taskq == 1) { if (instance->taskq) { ddi_taskq_destroy(instance->taskq); + instance->unroll.taskq = 0; } - kmem_free(instance->mr_ld_list, MRDRV_MAX_LD - * sizeof (struct mrsas_ld)); - free_space_for_mfi(instance); - mrsas_fm_fini(instance); + } + + /* + * free dma memory allocated for + * cmds/frames/queues/driver version etc + */ + if (instance->unroll.verBuff == 1) { + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + instance->unroll.verBuff = 0; + } - pci_config_teardown(&instance->pci_handle); + if (instance->unroll.pdlist_buff == 1) { + if 
(instance->mr_tbolt_pd_list != NULL) { + kmem_free(instance->mr_tbolt_pd_list, + MRSAS_TBOLT_GET_PD_MAX(instance) * + sizeof (struct mrsas_tbolt_pd)); + } - kmem_free(instance->func_ptr, - sizeof (struct mrsas_func_ptr)); + instance->mr_tbolt_pd_list = NULL; + instance->unroll.pdlist_buff = 0; + } - if (instance->timeout_id != (timeout_id_t)-1) { - (void) untimeout(instance->timeout_id); - instance->timeout_id = (timeout_id_t)-1; + if (instance->unroll.ldlist_buff == 1) { + if (instance->mr_ld_list != NULL) { + kmem_free(instance->mr_ld_list, MRDRV_MAX_LD + * sizeof (struct mrsas_ld)); } - ddi_soft_state_free(mrsas_state, instance_no); - break; - case DDI_PM_SUSPEND: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_PM_SUSPEND")); - break; - case DDI_SUSPEND: - con_log(CL_ANN, (CE_NOTE, - "mrsas_detach: DDI_SUSPEND")); + instance->mr_ld_list = NULL; + instance->unroll.ldlist_buff = 0; + } - break; - default: - con_log(CL_ANN, (CE_WARN, - "invalid detach command:0x%x", cmd)); - return (DDI_FAILURE); + if (instance->tbolt) { + if (instance->unroll.alloc_space_mpi2 == 1) { + free_space_for_mpi2(instance); + instance->unroll.alloc_space_mpi2 = 0; + } + } else { + if (instance->unroll.alloc_space_mfi == 1) { + free_space_for_mfi(instance); + instance->unroll.alloc_space_mfi = 0; + } } - return (DDI_SUCCESS); + if (instance->unroll.regs == 1) { + ddi_regs_map_free(&instance->regmap_handle); + instance->unroll.regs = 0; + con_log(CL_ANN1, (CE_CONT, "ddi_regs_map_free() done.")); + } } + + /* * ************************************************************************** * - * * - * common entry points - for character driver types * - * * + * * + * common entry points - for character driver types * + * * * ************************************************************************** * */ -static int +/* + * open - gets access to a device + * @dev: + * @openflags: + * @otyp: + * @credp: + * + * Access to a device by one or more application programs is controlled + * through the open() and close() entry points. The primary function of + * open() is to verify that the open request is allowed. + */ +static int mrsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) { int rval = 0; @@ -968,7 +1406,17 @@ mrsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) return (rval); } -static int +/* + * close - gives up access to a device + * @dev: + * @openflags: + * @otyp: + * @credp: + * + * close() should perform any cleanup necessary to finish using the minor + * device, and prepare the device (and driver) to be opened again. + */ +static int mrsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) { int rval = 0; @@ -984,6 +1432,23 @@ mrsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) return (rval); } +/* + * ioctl - performs a range of I/O commands for character drivers + * @dev: + * @cmd: + * @arg: + * @mode: + * @credp: + * @rvalp: + * + * ioctl() routine must make sure that user data is copied into or out of the + * kernel address space explicitly using copyin(), copyout(), ddi_copyin(), + * and ddi_copyout(), as appropriate. + * This is a wrapper routine to serialize access to the actual ioctl routine. + * ioctl() should return 0 on success, or the appropriate error number. The + * driver may also set the value returned to the calling process through rvalp. 
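To make the copyin/copyout rule just described concrete, a hedged fragment follows; struct my_ioc is an invented payload type for illustration, not a structure from this driver.

static int
example_ioctl_copy(intptr_t arg, int mode)
{
	struct my_ioc {
		uint32_t opcode;
		uint32_t status;
	} ioc;

	/* copy the user argument in; mode preserves FKIOCTL/model information */
	if (ddi_copyin((void *)arg, &ioc, sizeof (ioc), mode) != 0)
		return (EFAULT);

	ioc.status = 0;			/* ... act on ioc.opcode here ... */

	/* copy the result back out to the caller's buffer */
	if (ddi_copyout(&ioc, (void *)arg, sizeof (ioc), mode) != 0)
		return (EFAULT);

	return (0);
}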
+ */ + static int mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) @@ -1005,7 +1470,12 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, ioctl = (struct mrsas_ioctl *)kmem_zalloc(sizeof (struct mrsas_ioctl), KM_SLEEP); - ASSERT(ioctl); + if (ioctl == NULL) { + /* Failed to allocate memory for ioctl */ + con_log(CL_ANN, (CE_WARN, "mr_sas_ioctl: " + "failed to allocate memory for ioctl")); + return (ENXIO); + } switch ((uint_t)cmd) { case MRSAS_IOCTL_FIRMWARE: @@ -1032,6 +1502,9 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, break; case MRSAS_IOCTL_AEN: + con_log(CL_ANN, + (CE_NOTE, "mrsas_ioctl: IOCTL Register AEN.\n")); + if (ddi_copyin((void *) arg, &aen, sizeof (struct mrsas_aen), mode)) { con_log(CL_ANN, (CE_WARN, @@ -1064,12 +1537,19 @@ mrsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, /* * ************************************************************************** * - * * - * common entry points - for block driver types * - * * + * * + * common entry points - for block driver types * + * * * ************************************************************************** * */ -#ifdef __sparc +#ifdef __sparc +/* + * reset - TBD + * @dip: + * @cmd: + * + * TBD + */ /*ARGSUSED*/ static int mrsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) @@ -1092,7 +1572,7 @@ mrsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) instance->func_ptr->disable_intr(instance); - con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + con_log(CL_ANN1, (CE_CONT, "flushing cache for instance %d", instance_no)); flush_cache(instance); @@ -1130,14 +1610,26 @@ mrsas_quiesce(dev_info_t *dip) "failed to abort prevous AEN command QUIESCE")); } + if (instance->tbolt) { + if (abort_syncmap_cmd(instance, + instance->map_update_cmd)) { + cmn_err(CE_WARN, + "mrsas_detach: failed to abort " + "previous syncmap command"); + return (DDI_FAILURE); + } + } + instance->func_ptr->disable_intr(instance); - con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + con_log(CL_ANN1, (CE_CONT, "flushing cache for instance %d", instance_no)); flush_cache(instance); if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); return (DDI_FAILURE); } return (DDI_SUCCESS); @@ -1146,11 +1638,24 @@ mrsas_quiesce(dev_info_t *dip) /* * ************************************************************************** * - * * - * entry points (SCSI HBA) * - * * + * * + * entry points (SCSI HBA) * + * * * ************************************************************************** * */ +/* + * tran_tgt_init - initialize a target device instance + * @hba_dip: + * @tgt_dip: + * @tran: + * @sd: + * + * The tran_tgt_init() entry point enables the HBA to allocate and initialize + * any per-target resources. tran_tgt_init() also enables the HBA to qualify + * the device's address as valid and supportable for that particular HBA. + * By returning DDI_FAILURE, the instance of the target driver for that device + * is not probed or attached. 
+ */ /*ARGSUSED*/ static int mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, @@ -1159,32 +1664,61 @@ mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, struct mrsas_instance *instance; uint16_t tgt = sd->sd_address.a_target; uint8_t lun = sd->sd_address.a_lun; + dev_info_t *child = NULL; - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init target %d lun %d", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init target %d lun %d", tgt, lun)); instance = ADDR2MR(&sd->sd_address); if (ndi_dev_is_persistent_node(tgt_dip) == 0) { - (void) ndi_merge_node(tgt_dip, mrsas_name_node); - ddi_set_name_addr(tgt_dip, NULL); - - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init in " - "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", - tgt, lun)); + /* + * If no persistent node exists, we don't allow .conf node + * to be created. + */ + if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { + con_log(CL_DLEVEL2, + (CE_NOTE, "mrsas_tgt_init find child =" + " %p t = %d l = %d", (void *)child, tgt, lun)); + if (ndi_merge_node(tgt_dip, mrsas_name_node) != + DDI_SUCCESS) + /* Create this .conf node */ + return (DDI_SUCCESS); + } + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init in ndi_per " + "DDI_FAILURE t = %d l = %d", tgt, lun)); return (DDI_FAILURE); + } - con_log(CL_ANN1, (CE_NOTE, "mrsas_tgt_init dev_dip %p tgt_dip %p", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_tgt_init dev_dip %p tgt_dip %p", (void *)instance->mr_ld_list[tgt].dip, (void *)tgt_dip)); if (tgt < MRDRV_MAX_LD && lun == 0) { if (instance->mr_ld_list[tgt].dip == NULL && strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + mutex_enter(&instance->config_dev_mtx); instance->mr_ld_list[tgt].dip = tgt_dip; instance->mr_ld_list[tgt].lun_type = MRSAS_LD_LUN; + instance->mr_ld_list[tgt].flag = MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + } + } + +#ifdef PDSUPPORT + else if (instance->tbolt) { + if (instance->mr_tbolt_pd_list[tgt].dip == NULL) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].dip = tgt_dip; + instance->mr_tbolt_pd_list[tgt].flag = + MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tran_tgt_init:" + "t%xl%x", tgt, lun)); } } +#endif + return (DDI_SUCCESS); } @@ -1199,16 +1733,29 @@ mrsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, instance = ADDR2MR(&sd->sd_address); - con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + con_log(CL_DLEVEL2, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); if (tgt < MRDRV_MAX_LD && lun == 0) { if (instance->mr_ld_list[tgt].dip == tgt_dip) { + mutex_enter(&instance->config_dev_mtx); instance->mr_ld_list[tgt].dip = NULL; + mutex_exit(&instance->config_dev_mtx); } } + +#ifdef PDSUPPORT + else if (instance->tbolt) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].dip = NULL; + mutex_exit(&instance->config_dev_mtx); + con_log(CL_ANN1, (CE_NOTE, "tgt_free: Setting dip = NULL" + "for tgt:%x", tgt)); + } +#endif + } -static dev_info_t * +dev_info_t * mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) { dev_info_t *child = NULL; @@ -1219,6 +1766,11 @@ mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) for (child = ddi_get_child(instance->dip); child; child = ddi_get_next_sibling(child)) { + /* XXX KEBE ASKS - why was this added?! 
*/ + if (ndi_dev_is_persistent_node(child) == 0) { + continue; + } + if (mrsas_name_node(child, tmp, MAXNAMELEN) != DDI_SUCCESS) { continue; @@ -1228,11 +1780,17 @@ mrsas_find_child(struct mrsas_instance *instance, uint16_t tgt, uint8_t lun) break; } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_find_child: return child = %p", + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_find_child: return child = %p", (void *)child)); return (child); } +/* + * mrsas_name_node - + * @dip: + * @name: + * @len: + */ static int mrsas_name_node(dev_info_t *dip, char *name, int len) { @@ -1240,14 +1798,14 @@ mrsas_name_node(dev_info_t *dip, char *name, int len) tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "target", -1); - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_name_node: dip %p tgt %d", (void *)dip, tgt)); if (tgt == -1) { return (DDI_FAILURE); } lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "lun", -1); - con_log(CL_ANN1, + con_log(CL_DLEVEL2, (CE_NOTE, "mrsas_name_node: tgt %d lun %d", tgt, lun)); if (lun == -1) { return (DDI_FAILURE); @@ -1256,6 +1814,26 @@ mrsas_name_node(dev_info_t *dip, char *name, int len) return (DDI_SUCCESS); } +/* + * tran_init_pkt - allocate & initialize a scsi_pkt structure + * @ap: + * @pkt: + * @bp: + * @cmdlen: + * @statuslen: + * @tgtlen: + * @flags: + * @callback: + * + * The tran_init_pkt() entry point allocates and initializes a scsi_pkt + * structure and DMA resources for a target driver request. The + * tran_init_pkt() entry point is called when the target driver calls the + * SCSA function scsi_init_pkt(). Each call of the tran_init_pkt() entry point + * is a request to perform one or more of three possible services: + * - allocation and initialization of a scsi_pkt structure + * - allocation of DMA resources for data transfer + * - reallocation of DMA resources for the next portion of the data transfer + */ static struct scsi_pkt * mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, struct buf *bp, int cmdlen, int statuslen, int tgtlen, @@ -1265,7 +1843,7 @@ mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, struct mrsas_instance *instance; struct scsi_pkt *new_pkt; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); instance = ADDR2MR(ap); @@ -1327,14 +1905,31 @@ mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, return (pkt); } +/* + * tran_start - transport a SCSI command to the addressed target + * @ap: + * @pkt: + * + * The tran_start() entry point for a SCSI HBA driver is called to transport a + * SCSI command to the addressed target. The SCSI command is described + * entirely within the scsi_pkt structure, which the target driver allocated + * through the HBA driver's tran_init_pkt() entry point. If the command + * involves a data transfer, DMA resources must also have been allocated for + * the scsi_pkt structure. 
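From the target-driver side of this contract the packet is handed over with scsi_transport(), and the return values listed just below are what that call reports back. A hedged sketch of the caller's handling, assuming a fully prepared pkt:

static void
example_submit(struct scsi_pkt *pkt)
{
	switch (scsi_transport(pkt)) {
	case TRAN_ACCEPT:
		/* command queued; completion arrives via pkt->pkt_comp */
		break;
	case TRAN_BUSY:
		/* HBA queue full; a real target driver delays and resubmits */
		break;
	default:
		/* TRAN_BADPKT, TRAN_FATAL_ERROR, ...: fail the command */
		break;
	}
}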
+ * + * Return Values : + * TRAN_BUSY - request queue is full, no more free scbs + * TRAN_ACCEPT - pkt has been submitted to the instance + */ static int mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) { - uchar_t cmd_done = 0; + uchar_t cmd_done = 0; struct mrsas_instance *instance = ADDR2MR(ap); struct mrsas_cmd *cmd; + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); if (instance->deadadapter == 1) { con_log(CL_ANN1, (CE_WARN, "mrsas_tran_start: return TRAN_FATAL_ERROR " @@ -1347,12 +1942,12 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) } if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_NOTE, "mrsas_tran_start: Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (TRAN_BUSY); } - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x time:%x", + con_log(CL_ANN1, (CE_CONT, "chkpnt:%s:%d:SCSI CDB[0]=0x%x time:%x", __func__, __LINE__, pkt->pkt_cdbp[0], pkt->pkt_time)); pkt->pkt_reason = CMD_CMPLT; @@ -1394,16 +1989,16 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) /* Synchronize the Cmd frame for the controller */ (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, DDI_DMA_SYNC_FORDEV); - con_log(CL_ANN1, (CE_NOTE, "Push SCSI CDB[0]=0x%x" + con_log(CL_ANN, (CE_CONT, "issue_cmd_ppc: SCSI CDB[0]=0x%x" "cmd->index:%x\n", pkt->pkt_cdbp[0], cmd->index)); instance->func_ptr->issue_cmd(cmd, instance); } else { struct mrsas_header *hdr = &cmd->frame->hdr; - cmd->sync_cmd = MRSAS_TRUE; + /* cmd->sync_cmd = MRSAS_TRUE; */ /* KEBE asks, inherit? */ - instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd); pkt->pkt_reason = CMD_CMPLT; pkt->pkt_statistics = 0; @@ -1416,7 +2011,8 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) break; case MFI_STAT_SCSI_DONE_WITH_ERROR: - + con_log(CL_ANN, (CE_CONT, + "mrsas_tran_start: scsi done with error")); pkt->pkt_reason = CMD_CMPLT; pkt->pkt_statistics = 0; @@ -1424,6 +2020,8 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) break; case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN, (CE_CONT, + "mrsas_tran_start: device not found error")); pkt->pkt_reason = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; break; @@ -1446,6 +2044,19 @@ mrsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) return (TRAN_ACCEPT); } +/* + * tran_abort - Abort any commands that are currently in transport + * @ap: + * @pkt: + * + * The tran_abort() entry point for a SCSI HBA driver is called to abort any + * commands that are currently in transport for a particular target. This entry + * point is called when a target driver calls scsi_abort(). The tran_abort() + * entry point should attempt to abort the command denoted by the pkt + * parameter. If the pkt parameter is NULL, tran_abort() should attempt to + * abort all outstanding commands in the transport layer for the particular + * target or logical unit. + */ /*ARGSUSED*/ static int mrsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1457,18 +2068,80 @@ mrsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) return (DDI_FAILURE); } +/* + * tran_reset - reset either the SCSI bus or target + * @ap: + * @level: + * + * The tran_reset() entry point for a SCSI HBA driver is called to reset either + * the SCSI bus or a particular SCSI target device. 
This entry point is called + * when a target driver calls scsi_reset(). The tran_reset() entry point must + * reset the SCSI bus if level is RESET_ALL. If level is RESET_TARGET, just the + * particular target or logical unit must be reset. + */ /*ARGSUSED*/ static int mrsas_tran_reset(struct scsi_address *ap, int level) { + struct mrsas_instance *instance = ADDR2MR(ap); + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); - /* reset command not supported by H/W */ + if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); + return (DDI_FAILURE); + } else { + return (DDI_SUCCESS); + } +} - return (DDI_FAILURE); +#if 0 +/* + * tran_bus_reset - reset the SCSI bus + * @dip: + * @level: + * + * The tran_bus_reset() vector in the scsi_hba_tran structure should be + * initialized during the HBA driver's attach(). The vector should point to + * an HBA entry point that is to be called when a user initiates a bus reset. + * Implementation is hardware specific. If the HBA driver cannot reset the + * SCSI bus without affecting the targets, the driver should fail RESET_BUS + * or not initialize this vector. + */ +/*ARGSUSED*/ +static int +mrsas_tran_bus_reset(dev_info_t *dip, int level) +{ + int instance_no = ddi_get_instance(dip); + + struct mrsas_instance *instance = ddi_get_soft_state(mrsas_state, + instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (wait_for_outstanding(instance)) { + con_log(CL_ANN1, + (CE_CONT, "wait_for_outstanding: return FAIL.\n")); + return (DDI_FAILURE); + } else { + return (DDI_SUCCESS); + } } +#endif +/* + * tran_getcap - get one of a set of SCSA-defined capabilities + * @ap: + * @cap: + * @whom: + * + * The target driver can request the current setting of the capability for a + * particular target by setting the whom parameter to nonzero. A whom value of + * zero indicates a request for the current setting of the general capability + * for the SCSI bus or for adapter hardware. The tran_getcap() should return -1 + * for undefined capabilities or the current value of the requested capability. + */ /*ARGSUSED*/ static int mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) @@ -1477,7 +2150,7 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) struct mrsas_instance *instance = ADDR2MR(ap); - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* we do allow inquiring about capabilities for other targets */ if (cap == NULL) { @@ -1486,8 +2159,13 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) switch (scsi_hba_lookup_capstr(cap)) { case SCSI_CAP_DMA_MAX: - /* Limit to 16MB max transfer */ - rval = mrsas_max_cap_maxxfer; + if (instance->tbolt) { + /* Limit to 256k max transfer */ + rval = mrsas_tbolt_max_cap_maxxfer; + } else { + /* Limit to 16MB max transfer */ + rval = mrsas_max_cap_maxxfer; + } break; case SCSI_CAP_MSG_OUT: rval = 1; @@ -1536,13 +2214,29 @@ mrsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) return (rval); } +/* + * tran_setcap - set one of a set of SCSA-defined capabilities + * @ap: + * @cap: + * @value: + * @whom: + * + * The target driver might request that the new value be set for a particular + * target by setting the whom parameter to nonzero. A whom value of zero + * means that request is to set the new value for the SCSI bus or for adapter + * hardware in general. 
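Seen from the target driver, these capability entry points are reached through scsi_ifgetcap() and scsi_ifsetcap(). A hedged example using standard SCSA capability strings (nothing here is specific to mr_sas):

static int
example_caps(struct scsi_device *sd)
{
	/* whom = 1: ask about this particular target, not the bus in general */
	int dma_max = scsi_ifgetcap(&sd->sd_address, "dma-max", 1);

	/* try to enable tagged queueing: 1 = set, 0 = refused, -1 = undefined */
	(void) scsi_ifsetcap(&sd->sd_address, "tagged-qing", 1, 1);

	return (dma_max);	/* -1 if the HBA does not define the capability */
}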
+ * The tran_setcap() should return the following values as appropriate: + * - -1 for undefined capabilities + * - 0 if the HBA driver cannot set the capability to the requested value + * - 1 if the HBA driver is able to set the capability to the requested value + */ /*ARGSUSED*/ static int mrsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) { int rval = 1; - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* We don't allow setting capabilities for other targets */ if (cap == NULL || whom == 0) { @@ -1584,12 +2278,25 @@ mrsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) return (rval); } +/* + * tran_destroy_pkt - deallocate scsi_pkt structure + * @ap: + * @pkt: + * + * The tran_destroy_pkt() entry point is the HBA driver function that + * deallocates scsi_pkt structures. The tran_destroy_pkt() entry point is + * called when the target driver calls scsi_destroy_pkt(). The + * tran_destroy_pkt() entry point must free any DMA resources that have been + * allocated for the packet. An implicit DMA synchronization occurs if the + * DMA resources are freed and any cached data remains after the completion + * of the transfer. + */ static void mrsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) { struct scsa_cmd *acmd = PKT2CMD(pkt); - con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + con_log(CL_DLEVEL2, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); if (acmd->cmd_flags & CFLAG_DMAVALID) { acmd->cmd_flags &= ~CFLAG_DMAVALID; @@ -1605,6 +2312,18 @@ mrsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) scsi_hba_pkt_free(ap, pkt); } +/* + * tran_dmafree - deallocates DMA resources + * @ap: + * @pkt: + * + * The tran_dmafree() entry point deallocates DMAQ resources that have been + * allocated for a scsi_pkt structure. The tran_dmafree() entry point is + * called when the target driver calls scsi_dmafree(). The tran_dmafree() must + * free only DMA resources allocated for a scsi_pkt structure, not the + * scsi_pkt itself. When DMA resources are freed, a DMA synchronization is + * implicitly performed. + */ /*ARGSUSED*/ static void mrsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1624,6 +2343,19 @@ mrsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) } } +/* + * tran_sync_pkt - synchronize the DMA object allocated + * @ap: + * @pkt: + * + * The tran_sync_pkt() entry point synchronizes the DMA object allocated for + * the scsi_pkt structure before or after a DMA transfer. The tran_sync_pkt() + * entry point is called when the target driver calls scsi_sync_pkt(). If the + * data transfer direction is a DMA read from device to memory, tran_sync_pkt() + * must synchronize the CPU's view of the data. If the data transfer direction + * is a DMA write from memory to device, tran_sync_pkt() must synchronize the + * device's view of the data. 
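Taken together, tran_init_pkt(), tran_start(), tran_sync_pkt(), tran_dmafree() and tran_destroy_pkt() form the HBA half of a packet's lifecycle; the target-driver half is what drives them. A compressed, hypothetical sketch of that caller's side (error handling and CDB contents are elided):

static void
example_pkt_lifecycle(struct scsi_address *ap, struct buf *bp)
{
	struct scsi_pkt *pkt;

	/* allocate pkt, CDB, status and DMA resources (reaches tran_init_pkt) */
	pkt = scsi_init_pkt(ap, NULL, bp, CDB_GROUP1, 1, 0, 0, NULL_FUNC, NULL);
	if (pkt == NULL)
		return;

	/* a real driver fills pkt->pkt_cdbp (e.g. with scsi_setup_cdb()) here */

	if (scsi_transport(pkt) != TRAN_ACCEPT) {	/* reaches tran_start */
		scsi_destroy_pkt(pkt);			/* reaches tran_destroy_pkt */
		return;
	}

	/* after completion, before reading data DMA'd in from the device */
	scsi_sync_pkt(pkt);				/* reaches tran_sync_pkt */

	scsi_dmafree(pkt);				/* reaches tran_dmafree */
	scsi_destroy_pkt(pkt);				/* frees the pkt itself */
}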
+ */ /*ARGSUSED*/ static void mrsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) @@ -1639,6 +2371,25 @@ mrsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) } } +/*ARGSUSED*/ +static int +mrsas_tran_quiesce(dev_info_t *dip) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (1); +} + +/*ARGSUSED*/ +static int +mrsas_tran_unquiesce(dev_info_t *dip) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (1); +} + + /* * mrsas_isr(caddr_t) * @@ -1654,15 +2405,29 @@ mrsas_isr(struct mrsas_instance *instance) uint32_t producer; uint32_t consumer; uint32_t context; + int retval; struct mrsas_cmd *cmd; struct mrsas_header *hdr; struct scsi_pkt *pkt; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); ASSERT(instance); - if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && - !instance->func_ptr->intr_ack(instance)) { - return (DDI_INTR_UNCLAIMED); + if (instance->tbolt) { + mutex_enter(&instance->chip_mtx); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !(instance->func_ptr->intr_ack(instance))) { + mutex_exit(&instance->chip_mtx); + return (DDI_INTR_UNCLAIMED); + } + retval = mr_sas_tbolt_process_outstanding_cmd(instance); + mutex_exit(&instance->chip_mtx); + return (retval); + } else { + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } } (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, @@ -1681,7 +2446,7 @@ mrsas_isr(struct mrsas_instance *instance) #ifdef OCRDEBUG if (debug_consecutive_timeout_after_ocr_g == 1) { con_log(CL_ANN1, (CE_NOTE, - "simulating consecutive timeout after ocr")); + "simulating consecutive timeout after ocr")); return (DDI_INTR_CLAIMED); } #endif @@ -1694,10 +2459,10 @@ mrsas_isr(struct mrsas_instance *instance) consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, instance->consumer); - con_log(CL_ANN1, (CE_NOTE, " producer %x consumer %x ", + con_log(CL_ANN, (CE_CONT, " producer %x consumer %x ", producer, consumer)); if (producer == consumer) { - con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + con_log(CL_ANN, (CE_WARN, "producer == consumer case")); DTRACE_PROBE2(isr_pc_err, uint32_t, producer, uint32_t, consumer); mutex_exit(&instance->cmd_pend_mtx); @@ -1711,10 +2476,10 @@ mrsas_isr(struct mrsas_instance *instance) cmd = instance->cmd_list[context]; if (cmd->sync_cmd == MRSAS_TRUE) { - hdr = (struct mrsas_header *)&cmd->frame->hdr; - if (hdr) { - mlist_del_init(&cmd->list); - } + hdr = (struct mrsas_header *)&cmd->frame->hdr; + if (hdr) { + mlist_del_init(&cmd->list); + } } else { pkt = cmd->pkt; if (pkt) { @@ -1761,9 +2526,9 @@ mrsas_isr(struct mrsas_instance *instance) /* * ************************************************************************** * - * * - * libraries * - * * + * * + * libraries * + * * * ************************************************************************** * */ /* @@ -1779,7 +2544,7 @@ mrsas_isr(struct mrsas_instance *instance) static struct mrsas_cmd * get_mfi_pkt(struct mrsas_instance *instance) { - mlist_t *head = &instance->cmd_pool_list; + mlist_t *head = &instance->cmd_pool_list; struct mrsas_cmd *cmd = NULL; mutex_enter(&instance->cmd_pool_mtx); @@ -1793,6 +2558,7 @@ get_mfi_pkt(struct mrsas_instance *instance) cmd->pkt = NULL; cmd->retry_count_for_ocr = 0; cmd->drv_pkt_time = 0; + } mutex_exit(&instance->cmd_pool_mtx); @@ -1812,8 +2578,12 @@ get_mfi_app_pkt(struct mrsas_instance *instance) cmd = 
mlist_entry(head->next, struct mrsas_cmd, list); mlist_del_init(head->next); } - if (cmd != NULL) + if (cmd != NULL) { cmd->pkt = NULL; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + } + mutex_exit(&instance->app_cmd_pool_mtx); return (cmd); @@ -1842,12 +2612,12 @@ return_mfi_app_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) mutex_exit(&instance->app_cmd_pool_mtx); } -static void +void push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { struct scsi_pkt *pkt; struct mrsas_header *hdr; - con_log(CL_ANN1, (CE_NOTE, "push_pending_pkt(): Called\n")); + con_log(CL_DLEVEL2, (CE_NOTE, "push_pending_pkt(): Called\n")); mutex_enter(&instance->cmd_pend_mtx); ASSERT(mutex_owned(&instance->cmd_pend_mtx)); mlist_del_init(&cmd->list); @@ -1861,15 +2631,15 @@ push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) "time %llx", (void *)cmd, cmd->index, gethrtime())); - /* Wait for specified interval */ + /* Wait for specified interval */ cmd->drv_pkt_time = ddi_get16( cmd->frame_dma_obj.acc_handle, &hdr->timeout); if (cmd->drv_pkt_time < debug_timeout_g) cmd->drv_pkt_time = (uint16_t)debug_timeout_g; - con_log(CL_ANN1, (CE_CONT, - "push_pending_pkt(): " - "Called IO Timeout Value %x\n", - cmd->drv_pkt_time)); + con_log(CL_ANN1, (CE_CONT, + "push_pending_pkt(): " + "Called IO Timeout Value %x\n", + cmd->drv_pkt_time)); } if (hdr && instance->timeout_id == (timeout_id_t)-1) { instance->timeout_id = timeout(io_timeout_checker, @@ -1893,9 +2663,10 @@ push_pending_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) } mutex_exit(&instance->cmd_pend_mtx); + } -static int +int mrsas_print_pending_cmds(struct mrsas_instance *instance) { mlist_t *head = &instance->cmd_pend_list; @@ -1903,47 +2674,73 @@ mrsas_print_pending_cmds(struct mrsas_instance *instance) struct mrsas_cmd *cmd = NULL; struct mrsas_header *hdr; unsigned int flag = 1; - struct scsi_pkt *pkt; - con_log(CL_ANN1, (CE_NOTE, - "mrsas_print_pending_cmds(): Called")); + int saved_level; + int cmd_count = 0; + + + saved_level = debug_level_g; + debug_level_g = CL_ANN1; + + cmn_err(CE_NOTE, "mrsas_print_pending_cmds(): Called\n"); + while (flag) { mutex_enter(&instance->cmd_pend_mtx); tmp = tmp->next; if (tmp == head) { mutex_exit(&instance->cmd_pend_mtx); flag = 0; + con_log(CL_ANN1, (CE_CONT, "mrsas_print_pending_cmds():" + " NO MORE CMDS PENDING....\n")); break; } else { cmd = mlist_entry(tmp, struct mrsas_cmd, list); mutex_exit(&instance->cmd_pend_mtx); if (cmd) { if (cmd->sync_cmd == MRSAS_TRUE) { - hdr = (struct mrsas_header *)&cmd->frame->hdr; + hdr = (struct mrsas_header *) + &cmd->frame->hdr; if (hdr) { - con_log(CL_ANN1, (CE_CONT, - "print: cmd %p index %x hdr %p", - (void *)cmd, cmd->index, - (void *)hdr)); + con_log(CL_ANN1, (CE_CONT, + "print: cmd %p index 0x%x " + "drv_pkt_time 0x%x (NO-PKT)" + " hdr %p\n", (void *)cmd, + cmd->index, + cmd->drv_pkt_time, + (void *)hdr)); } } else { pkt = cmd->pkt; if (pkt) { con_log(CL_ANN1, (CE_CONT, - "print: cmd %p index %x " - "pkt %p", (void *)cmd, cmd->index, - (void *)pkt)); + "print: cmd %p index 0x%x " + "drv_pkt_time 0x%x pkt %p \n", + (void *)cmd, cmd->index, + cmd->drv_pkt_time, (void *)pkt)); } } + + if (++cmd_count == 1) { + mrsas_print_cmd_details(instance, cmd, + 0xDD); + } else { + mrsas_print_cmd_details(instance, cmd, + 1); + } + } } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_print_pending_cmds(): Done\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_print_pending_cmds(): Done\n")); + + + debug_level_g = saved_level; 
+ return (DDI_SUCCESS); } -static int +int mrsas_complete_pending_cmds(struct mrsas_instance *instance) { @@ -1968,7 +2765,7 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "fail and posting to scsa " "cmd %p index %x" " pkt %p " @@ -1980,7 +2777,7 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) } else { /* for DCMDS */ if (cmd->sync_cmd == MRSAS_TRUE) { hdr = (struct mrsas_header *)&cmd->frame->hdr; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "posting invalid status to application " "cmd %p index %x" " hdr %p " @@ -1993,22 +2790,92 @@ mrsas_complete_pending_cmds(struct mrsas_instance *instance) } mlist_del_init(&cmd->list); } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds:" "NULL command\n")); } - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds:" "looping for more commands\n")); } mutex_exit(&instance->cmd_pend_mtx); - con_log(CL_ANN1, (CE_NOTE, "mrsas_complete_pending_cmds(): DONE\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_complete_pending_cmds(): DONE\n")); return (DDI_SUCCESS); } +void +mrsas_print_cmd_details(struct mrsas_instance *instance, struct mrsas_cmd *cmd, + int detail) +{ + struct scsi_pkt *pkt = cmd->pkt; + Mpi2RaidSCSIIORequest_t *scsi_io = cmd->scsi_io_request; + int i; + int saved_level; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; -static int + if (detail == 0xDD) { + saved_level = debug_level_g; + debug_level_g = CL_ANN1; + } + + + if (instance->tbolt) { + con_log(CL_ANN1, (CE_CONT, "print_cmd_details: cmd %p " + "cmd->index 0x%x SMID 0x%x timer 0x%x sec\n", + (void *)cmd, cmd->index, cmd->SMID, cmd->drv_pkt_time)); + } else { + con_log(CL_ANN1, (CE_CONT, "print_cmd_details: cmd %p " + "cmd->index 0x%x timer 0x%x sec\n", + (void *)cmd, cmd->index, cmd->drv_pkt_time)); + } + + if (pkt) { + con_log(CL_ANN1, (CE_CONT, "scsi_pkt CDB[0]=0x%x", + pkt->pkt_cdbp[0])); + } else { + con_log(CL_ANN1, (CE_CONT, "NO-PKT")); + } + + if ((detail == 0xDD) && instance->tbolt) { + con_log(CL_ANN1, (CE_CONT, "RAID_SCSI_IO_REQUEST\n")); + con_log(CL_ANN1, (CE_CONT, "DevHandle=0x%X Function=0x%X " + "IoFlags=0x%X SGLFlags=0x%X DataLength=0x%X\n", + ddi_get16(acc_handle, &scsi_io->DevHandle), + ddi_get8(acc_handle, &scsi_io->Function), + ddi_get16(acc_handle, &scsi_io->IoFlags), + ddi_get16(acc_handle, &scsi_io->SGLFlags), + ddi_get32(acc_handle, &scsi_io->DataLength))); + + for (i = 0; i < 32; i++) { + con_log(CL_ANN1, (CE_CONT, "CDB[%d]=0x%x ", i, + ddi_get8(acc_handle, &scsi_io->CDB.CDB32[i]))); + } + + con_log(CL_ANN1, (CE_CONT, "RAID-CONTEXT\n")); + con_log(CL_ANN1, (CE_CONT, "status=0x%X extStatus=0x%X " + "ldTargetId=0x%X timeoutValue=0x%X regLockFlags=0x%X " + "RAIDFlags=0x%X regLockRowLBA=0x%" PRIu64 + " regLockLength=0x%X spanArm=0x%X\n", + ddi_get8(acc_handle, &scsi_io->RaidContext.status), + ddi_get8(acc_handle, &scsi_io->RaidContext.extStatus), + ddi_get16(acc_handle, &scsi_io->RaidContext.ldTargetId), + ddi_get16(acc_handle, &scsi_io->RaidContext.timeoutValue), + ddi_get8(acc_handle, &scsi_io->RaidContext.regLockFlags), + ddi_get8(acc_handle, &scsi_io->RaidContext.RAIDFlags), + ddi_get64(acc_handle, &scsi_io->RaidContext.regLockRowLBA), + ddi_get32(acc_handle, &scsi_io->RaidContext.regLockLength), + ddi_get8(acc_handle, &scsi_io->RaidContext.spanArm))); + } + + if (detail == 0xDD) { + debug_level_g = 
saved_level; + } +} + + +int mrsas_issue_pending_cmds(struct mrsas_instance *instance) { mlist_t *head = &instance->cmd_pend_list; @@ -2023,53 +2890,79 @@ mrsas_issue_pending_cmds(struct mrsas_instance *instance) tmp = tmp->next; mutex_exit(&instance->cmd_pend_mtx); if (cmd) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds(): " - "Got a cmd: cmd:%p\n", (void *)cmd)); + "Got a cmd: cmd %p index 0x%x drv_pkt_time 0x%x ", + (void *)cmd, cmd->index, cmd->drv_pkt_time)); + + /* Reset command timeout value */ + if (cmd->drv_pkt_time < debug_timeout_g) + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + cmd->retry_count_for_ocr++; - con_log(CL_ANN1, (CE_NOTE, - "mrsas_issue_pending_cmds(): " - "cmd retry count = %d\n", - cmd->retry_count_for_ocr)); + + cmn_err(CE_CONT, "cmd retry count = %d\n", + cmd->retry_count_for_ocr); + if (cmd->retry_count_for_ocr > IO_RETRY_COUNT) { - con_log(CL_ANN1, (CE_NOTE, + cmn_err(CE_WARN, "mrsas_issue_pending_cmds(): " + "cmd->retry_count exceeded limit >%d\n", + IO_RETRY_COUNT); + mrsas_print_cmd_details(instance, cmd, 0xDD); + + cmn_err(CE_WARN, "mrsas_issue_pending_cmds():" - "Calling Kill Adapter\n")); - (void) mrsas_kill_adapter(instance); + "Calling KILL Adapter\n"); + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } + pkt = cmd->pkt; if (pkt) { - con_log(CL_ANN1, (CE_NOTE, - "PENDING ISSUE: cmd %p index %x " + con_log(CL_ANN1, (CE_CONT, + "PENDING PKT-CMD ISSUE: cmd %p index %x " "pkt %p time %llx", (void *)cmd, cmd->index, (void *)pkt, gethrtime())); + } else { + cmn_err(CE_CONT, + "mrsas_issue_pending_cmds(): NO-PKT, " + "cmd %p index 0x%x drv_pkt_time 0x%x ", + (void *)cmd, cmd->index, cmd->drv_pkt_time); } + + if (cmd->sync_cmd == MRSAS_TRUE) { + cmn_err(CE_CONT, "mrsas_issue_pending_cmds(): " + "SYNC_CMD == TRUE \n"); instance->func_ptr->issue_cmd_in_sync_mode( instance, cmd); } else { instance->func_ptr->issue_cmd(cmd, instance); } } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds: NULL command\n")); } - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds:" "looping for more commands")); } - con_log(CL_ANN1, (CE_NOTE, "mrsas_issue_pending_cmds(): DONE\n")); + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_pending_cmds(): DONE\n")); return (DDI_SUCCESS); } + + /* * destroy_mfi_frame_pool */ -static void +void destroy_mfi_frame_pool(struct mrsas_instance *instance) { int i; @@ -2078,7 +2971,8 @@ destroy_mfi_frame_pool(struct mrsas_instance *instance) struct mrsas_cmd *cmd; /* return all frames to pool */ - for (i = 0; i < max_cmd+1; i++) { + + for (i = 0; i < max_cmd; i++) { cmd = instance->cmd_list[i]; @@ -2093,7 +2987,7 @@ destroy_mfi_frame_pool(struct mrsas_instance *instance) /* * create_mfi_frame_pool */ -static int +int create_mfi_frame_pool(struct mrsas_instance *instance) { int i = 0; @@ -2103,11 +2997,10 @@ create_mfi_frame_pool(struct mrsas_instance *instance) uint32_t sgl_sz; uint32_t tot_frame_size; struct mrsas_cmd *cmd; + int retval = DDI_SUCCESS; max_cmd = instance->max_fw_cmds; - sge_sz = sizeof (struct mrsas_sge_ieee); - /* calculated the number of 64byte frames required for SGL */ sgl_sz = sge_sz * instance->max_num_sge; tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; @@ -2115,7 +3008,7 @@ create_mfi_frame_pool(struct mrsas_instance *instance) con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " "sgl_sz %x tot_frame_size %x", sgl_sz, 
tot_frame_size)); - while (i < max_cmd+1) { + while (i < max_cmd) { cmd = instance->cmd_list[i]; cmd->frame_dma_obj.size = tot_frame_size; @@ -2125,14 +3018,14 @@ create_mfi_frame_pool(struct mrsas_instance *instance) cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; - cookie_cnt = mrsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC); if (cookie_cnt == -1 || cookie_cnt > 1) { - con_log(CL_ANN, (CE_WARN, - "create_mfi_frame_pool: could not alloc.")); - return (DDI_FAILURE); + cmn_err(CE_WARN, + "create_mfi_frame_pool: could not alloc."); + retval = DDI_FAILURE; + goto mrsas_undo_frame_pool; } bzero(cmd->frame_dma_obj.buffer, tot_frame_size); @@ -2150,10 +3043,10 @@ create_mfi_frame_pool(struct mrsas_instance *instance) tot_frame_size - SENSE_LENGTH; if (!cmd->frame || !cmd->sense) { - con_log(CL_ANN, (CE_NOTE, - "mr_sas: pci_pool_alloc failed")); - - return (ENOMEM); + cmn_err(CE_WARN, + "mr_sas: pci_pool_alloc failed"); + retval = ENOMEM; + goto mrsas_undo_frame_pool; } ddi_put32(cmd->frame_dma_obj.acc_handle, @@ -2165,6 +3058,12 @@ create_mfi_frame_pool(struct mrsas_instance *instance) } return (DDI_SUCCESS); + +mrsas_undo_frame_pool: + if (i > 0) + destroy_mfi_frame_pool(instance); + + return (retval); } /* @@ -2207,8 +3106,8 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) if (mrsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, - "mr_sas: could not alloc reply queue")); + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); return (DDI_FAILURE); } @@ -2240,9 +3139,9 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) if (mrsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " - "could not allocate data transfer buffer.")); - return (DDI_FAILURE); + cmn_err(CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer."); + goto mrsas_undo_internal_buff; } bzero(instance->mfi_evt_detail_obj.buffer, @@ -2251,53 +3150,70 @@ alloc_additional_dma_buffer(struct mrsas_instance *instance) instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; return (DDI_SUCCESS); + +mrsas_undo_internal_buff: + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + return (DDI_FAILURE); } -/* - * free_space_for_mfi - */ -static void -free_space_for_mfi(struct mrsas_instance *instance) + +void +mrsas_free_cmd_pool(struct mrsas_instance *instance) { int i; - uint32_t max_cmd = instance->max_fw_cmds; + uint32_t max_cmd; + size_t sz; /* already freed */ if (instance->cmd_list == NULL) { return; } - free_additional_dma_buffer(instance); + max_cmd = instance->max_fw_cmds; - /* first free the MFI frame pool */ - destroy_mfi_frame_pool(instance); + /* size of cmd_list array */ + sz = sizeof (struct mrsas_cmd *) * max_cmd; - /* free all the commands in the cmd_list */ - for (i = 0; i < instance->max_fw_cmds+1; i++) { - kmem_free(instance->cmd_list[i], - sizeof (struct mrsas_cmd)); + /* First free each cmd */ + for (i = 0; i < max_cmd; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } instance->cmd_list[i] = NULL; } - /* free the cmd_list buffer itself */ - kmem_free(instance->cmd_list, - sizeof 
(struct mrsas_cmd *) * (max_cmd+1)); + /* Now, free cmd_list array */ + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); instance->cmd_list = NULL; INIT_LIST_HEAD(&instance->cmd_pool_list); - INIT_LIST_HEAD(&instance->app_cmd_pool_list); INIT_LIST_HEAD(&instance->cmd_pend_list); + if (instance->tbolt) { + INIT_LIST_HEAD(&instance->cmd_app_pool_list); + } else { + INIT_LIST_HEAD(&instance->app_cmd_pool_list); + } + } + /* - * alloc_space_for_mfi + * mrsas_alloc_cmd_pool */ -static int -alloc_space_for_mfi(struct mrsas_instance *instance) +int +mrsas_alloc_cmd_pool(struct mrsas_instance *instance) { int i; + int count; uint32_t max_cmd; uint32_t reserve_cmd; size_t sz; @@ -2305,9 +3221,11 @@ alloc_space_for_mfi(struct mrsas_instance *instance) struct mrsas_cmd *cmd; max_cmd = instance->max_fw_cmds; + con_log(CL_ANN1, (CE_NOTE, "mrsas_alloc_cmd_pool: " + "max_cmd %x", max_cmd)); - /* reserve 1 more slot for flush_cache */ - sz = sizeof (struct mrsas_cmd *) * (max_cmd+1); + + sz = sizeof (struct mrsas_cmd *) * max_cmd; /* * instance->cmd_list is an array of struct mrsas_cmd pointers. @@ -2315,54 +3233,127 @@ alloc_space_for_mfi(struct mrsas_instance *instance) * commands. */ instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); - ASSERT(instance->cmd_list); + if (instance->cmd_list == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for cmd_list")); + return (DDI_FAILURE); + } - for (i = 0; i < max_cmd+1; i++) { - instance->cmd_list[i] = kmem_zalloc(sizeof (struct mrsas_cmd), - KM_SLEEP); - ASSERT(instance->cmd_list[i]); + /* create a frame pool and assign one frame to each cmd */ + for (count = 0; count < max_cmd; count++) { + instance->cmd_list[count] = + kmem_zalloc(sizeof (struct mrsas_cmd), KM_SLEEP); + if (instance->cmd_list[count] == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for mrsas_cmd")); + goto mrsas_undo_cmds; + } } + /* add all the commands to command pool */ + INIT_LIST_HEAD(&instance->cmd_pool_list); INIT_LIST_HEAD(&instance->cmd_pend_list); - /* add all the commands to command pool (instance->cmd_pool) */ - reserve_cmd = APP_RESERVE_CMDS; INIT_LIST_HEAD(&instance->app_cmd_pool_list); - for (i = 0; i < reserve_cmd-1; i++) { - cmd = instance->cmd_list[i]; - cmd->index = i; + + reserve_cmd = MRSAS_APP_RESERVED_CMDS; + + for (i = 0; i < reserve_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; mlist_add_tail(&cmd->list, &instance->app_cmd_pool_list); } - /* - * reserve slot instance->cmd_list[APP_RESERVE_CMDS-1] - * for abort_aen_cmd - */ + + for (i = reserve_cmd; i < max_cmd; i++) { - cmd = instance->cmd_list[i]; - cmd->index = i; + cmd = instance->cmd_list[i]; + cmd->index = i; mlist_add_tail(&cmd->list, &instance->cmd_pool_list); } - /* single slot for flush_cache won't be added in command pool */ - cmd = instance->cmd_list[max_cmd]; - cmd->index = i; + return (DDI_SUCCESS); - /* create a frame pool and assign one frame to each cmd */ - if (create_mfi_frame_pool(instance)) { - con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); +mrsas_undo_cmds: + if (count > 0) { + /* free each cmd */ + for (i = 0; i < count; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } + instance->cmd_list[i] = NULL; + } + } + +mrsas_undo_cmd_list: + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); + instance->cmd_list = NULL; + + return (DDI_FAILURE); +} + + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct mrsas_instance 
*instance) +{ + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + /* Free additional dma buffer */ + free_additional_dma_buffer(instance); + + /* Free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* Free all the commands in the cmd_list */ + /* Free the cmd_list buffer itself */ + mrsas_free_cmd_pool(instance); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct mrsas_instance *instance) +{ + /* Allocate command pool (memory for cmd_list & individual commands) */ + if (mrsas_alloc_cmd_pool(instance)) { + cmn_err(CE_WARN, "error creating cmd pool"); return (DDI_FAILURE); } - /* create a frame pool and assign one frame to each cmd */ + /* Allocate MFI Frame pool */ + if (create_mfi_frame_pool(instance)) { + cmn_err(CE_WARN, "error creating frame DMA pool"); + goto mfi_undo_cmd_pool; + } + + /* Allocate additional DMA buffer */ if (alloc_additional_dma_buffer(instance)) { - con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); - return (DDI_FAILURE); + cmn_err(CE_WARN, "error creating frame DMA pool"); + goto mfi_undo_frame_pool; } return (DDI_SUCCESS); + +mfi_undo_frame_pool: + destroy_mfi_frame_pool(instance); + +mfi_undo_cmd_pool: + mrsas_free_cmd_pool(instance); + + return (DDI_FAILURE); } + /* * get_ctrl_info */ @@ -2376,7 +3367,11 @@ get_ctrl_info(struct mrsas_instance *instance, struct mrsas_dcmd_frame *dcmd; struct mrsas_ctrl_info *ci; - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN, (CE_WARN, @@ -2385,7 +3380,7 @@ get_ctrl_info(struct mrsas_instance *instance, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -2396,8 +3391,8 @@ get_ctrl_info(struct mrsas_instance *instance, ci = (struct mrsas_ctrl_info *)instance->internal_buf; if (!ci) { - con_log(CL_ANN, (CE_WARN, - "Failed to alloc mem for ctrl info")); + cmn_err(CE_WARN, + "Failed to alloc mem for ctrl info"); return_mfi_pkt(instance, cmd); return (DDI_FAILURE); } @@ -2425,33 +3420,40 @@ get_ctrl_info(struct mrsas_instance *instance, cmd->frame_count = 1; - if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { - ret = 0; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } - ctrl_info->max_request_size = ddi_get32( - cmd->frame_dma_obj.acc_handle, &ci->max_request_size); + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; - ctrl_info->ld_present_count = ddi_get16( - cmd->frame_dma_obj.acc_handle, &ci->ld_present_count); + ctrl_info->max_request_size = ddi_get32( + cmd->frame_dma_obj.acc_handle, &ci->max_request_size); - ctrl_info->properties.on_off_properties = - ddi_get32(cmd->frame_dma_obj.acc_handle, - &ci->properties.on_off_properties); + ctrl_info->ld_present_count = ddi_get16( + cmd->frame_dma_obj.acc_handle, &ci->ld_present_count); - ddi_rep_get8(cmd->frame_dma_obj.acc_handle, - (uint8_t *)(ctrl_info->product_name), - (uint8_t *)(ci->product_name), 80 * sizeof (char), - DDI_DEV_AUTOINCR); - /* should get more members of ci with ddi_get when needed */ + ctrl_info->properties.on_off_properties = ddi_get32( + cmd->frame_dma_obj.acc_handle, + &ci->properties.on_off_properties); + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t 
*)(ctrl_info->product_name), + (uint8_t *)(ci->product_name), 80 * sizeof (char), + DDI_DEV_AUTOINCR); + /* should get more members of ci with ddi_get when needed */ } else { - con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + cmn_err(CE_WARN, "get_ctrl_info: Ctrl info failed"); ret = -1; } if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { ret = -1; } - return_mfi_pkt(instance, cmd); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } return (ret); } @@ -2468,7 +3470,13 @@ abort_aen_cmd(struct mrsas_instance *instance, struct mrsas_cmd *cmd; struct mrsas_abort_frame *abort_fr; - cmd = instance->cmd_list[APP_RESERVE_CMDS-1]; + con_log(CL_ANN1, (CE_NOTE, "chkpnt: abort_aen:%d", __LINE__)); + + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN1, (CE_WARN, @@ -2477,7 +3485,7 @@ abort_aen_cmd(struct mrsas_instance *instance, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -2500,9 +3508,13 @@ abort_aen_cmd(struct mrsas_instance *instance, instance->aen_cmd->abort_aen = 1; - cmd->sync_cmd = MRSAS_TRUE; + /* cmd->sync_cmd = MRSAS_TRUE; */ /* KEBE ASKS, inherit? */ cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN1, (CE_WARN, "abort_aen_cmd: issue_cmd_in_poll_mode failed")); @@ -2514,49 +3526,27 @@ abort_aen_cmd(struct mrsas_instance *instance, instance->aen_cmd->abort_aen = 1; instance->aen_cmd = 0; + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } + atomic_add_16(&instance->fw_outstanding, (-1)); return (ret); } -/* - * init_mfi - */ static int -init_mfi(struct mrsas_instance *instance) +mrsas_build_init_cmd(struct mrsas_instance *instance, + struct mrsas_cmd **cmd_ptr) { struct mrsas_cmd *cmd; - struct mrsas_ctrl_info ctrl_info; struct mrsas_init_frame *init_frame; struct mrsas_init_queue_info *initq_info; + struct mrsas_drv_ver drv_ver_info; - /* we expect the FW state to be READY */ - if (mfi_state_transition_to_ready(instance)) { - con_log(CL_ANN, (CE_WARN, "mr_sas: F/W is not ready")); - goto fail_ready_state; - } - - /* get various operational parameters from status register */ - instance->max_num_sge = - (instance->func_ptr->read_fw_status_reg(instance) & - 0xFF0000) >> 0x10; - /* - * Reduce the max supported cmds by 1. This is to ensure that the - * reply_q_sz (1 more than the max cmd that driver may send) - * does not exceed max cmds that the FW can support - */ - instance->max_fw_cmds = - instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; - instance->max_fw_cmds = instance->max_fw_cmds - 1; - - instance->max_num_sge = - (instance->max_num_sge > MRSAS_MAX_SGE_CNT) ? - MRSAS_MAX_SGE_CNT : instance->max_num_sge; - - /* create a pool of commands */ - if (alloc_space_for_mfi(instance) != DDI_SUCCESS) - goto fail_alloc_fw_space; /* * Prepare a init frame. 
Note the init frame points to queue info @@ -2564,8 +3554,8 @@ init_mfi(struct mrsas_instance *instance) * this frame - since we don't need any SGL - we use SGL's space as * queue info structure */ - cmd = get_mfi_pkt(instance); - cmd->retry_count_for_ocr = 0; + cmd = *cmd_ptr; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); @@ -2613,23 +3603,88 @@ init_mfi(struct mrsas_instance *instance) ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->queue_info_new_phys_addr_hi, 0); + + /* fill driver version information */ + fill_up_drv_ver(&drv_ver_info); + + /* allocate the driver version data transfer buffer */ + instance->drv_ver_dma_obj.size = sizeof (drv_ver_info.drv_ver); + instance->drv_ver_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->drv_ver_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->drv_ver_dma_obj.dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->drv_ver_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "init_mfi : Could not allocate driver version buffer.")); + return (DDI_FAILURE); + } + /* copy driver version to dma buffer */ + (void) memset(instance->drv_ver_dma_obj.buffer, 0, + sizeof (drv_ver_info.drv_ver)); + ddi_rep_put8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)drv_ver_info.drv_ver, + (uint8_t *)instance->drv_ver_dma_obj.buffer, + sizeof (drv_ver_info.drv_ver), DDI_DEV_AUTOINCR); + + + /* copy driver version physical address to init frame */ + ddi_put64(cmd->frame_dma_obj.acc_handle, &init_frame->driverversion, + instance->drv_ver_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, sizeof (struct mrsas_init_queue_info)); cmd->frame_count = 1; - /* issue the init frame in polled mode */ + *cmd_ptr = cmd; + + return (DDI_SUCCESS); +} + + +/* + * mrsas_init_adapter_ppc - Initialize MFI interface adapter. + */ +int +mrsas_init_adapter_ppc(struct mrsas_instance *instance) +{ + struct mrsas_cmd *cmd; + + /* + * allocate memory for mfi adapter(cmd pool, individual commands, mfi + * frames etc + */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "Error, failed to allocate memory for MFI adapter")); + return (DDI_FAILURE); + } + + /* Build INIT command */ + cmd = get_mfi_pkt(instance); + + if (mrsas_build_init_cmd(instance, &cmd) != DDI_SUCCESS) { + con_log(CL_ANN, + (CE_NOTE, "Error, failed to build INIT command")); + + goto fail_undo_alloc_mfi_space; + } + + /* + * Disable interrupt before sending init frame ( see linux driver code) + * send INIT MFI frame in polled mode + */ if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "failed to init firmware")); - return_mfi_pkt(instance, cmd); goto fail_fw_init; } - if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { - return_mfi_pkt(instance, cmd); + if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) goto fail_fw_init; - } - return_mfi_pkt(instance, cmd); + /* return_mfi_pkt(instance, cmd); */ /* XXX KEBE ASKS, inherit? 
*/ if (ctio_enable && (instance->func_ptr->read_fw_status_reg(instance) & 0x04000000)) { @@ -2639,8 +3694,67 @@ init_mfi(struct mrsas_instance *instance) instance->flag_ieee = 0; } - instance->disable_online_ctrl_reset = 0; + instance->unroll.alloc_space_mfi = 1; + instance->unroll.verBuff = 1; + + return (DDI_SUCCESS); + + +fail_fw_init: + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + +fail_undo_alloc_mfi_space: + return_mfi_pkt(instance, cmd); + free_space_for_mfi(instance); + + return (DDI_FAILURE); + +} + +/* + * mrsas_init_adapter - Initialize adapter. + */ +int +mrsas_init_adapter(struct mrsas_instance *instance) +{ + struct mrsas_ctrl_info ctrl_info; + + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "mr_sas: F/W is not ready")); + return (DDI_FAILURE); + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + instance->max_num_sge = + (instance->max_num_sge > MRSAS_MAX_SGE_CNT) ? + MRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* + * Reduce the max supported cmds by 1. This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + + + /* Initialize adapter */ + if (instance->func_ptr->init_adapter(instance) != DDI_SUCCESS) { + con_log(CL_ANN, + (CE_WARN, "mr_sas: could not initialize adapter")); + return (DDI_FAILURE); + } + /* gather misc FW related information */ + instance->disable_online_ctrl_reset = 0; + if (!get_ctrl_info(instance, &ctrl_info)) { instance->max_sectors_per_req = ctrl_info.max_request_size; con_log(CL_ANN1, (CE_NOTE, @@ -2651,28 +3765,21 @@ init_mfi(struct mrsas_instance *instance) PAGESIZE / 512; } - if (ctrl_info.properties.on_off_properties & DISABLE_OCR_PROP_FLAG) + if (ctrl_info.properties.on_off_properties & DISABLE_OCR_PROP_FLAG) { instance->disable_online_ctrl_reset = 1; + con_log(CL_ANN1, + (CE_NOTE, "Disable online control Flag is set\n")); + } else { + con_log(CL_ANN1, + (CE_NOTE, "Disable online control Flag is not set\n")); + } return (DDI_SUCCESS); -fail_fw_init: -fail_alloc_fw_space: - - free_space_for_mfi(instance); - -fail_ready_state: - ddi_regs_map_free(&instance->regmap_handle); - -fail_mfi_reg_setup: - return (DDI_FAILURE); } - - - static int mrsas_issue_init_mfi(struct mrsas_instance *instance) { @@ -2691,7 +3798,7 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) cmd = get_mfi_app_pkt(instance); if (!cmd) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_WARN, "mrsas_issue_init_mfi: get_pkt failed\n")); return (DDI_FAILURE); } @@ -2753,8 +3860,15 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) return_mfi_app_pkt(instance, cmd); return (DDI_FAILURE); } + + if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + return_mfi_app_pkt(instance, cmd); - con_log(CL_ANN1, (CE_NOTE, "mrsas_issue_init_mfi: Done")); + con_log(CL_ANN1, (CE_CONT, "mrsas_issue_init_mfi: Done")); + return (DDI_SUCCESS); } /* @@ -2762,31 +3876,32 @@ mrsas_issue_init_mfi(struct mrsas_instance *instance) * * @reg_set : MFI register set */ -static int +int mfi_state_transition_to_ready(struct mrsas_instance *instance) { int i; uint8_t max_wait; - uint32_t fw_ctrl; 
+ uint32_t fw_ctrl = 0; uint32_t fw_state; uint32_t cur_state; uint32_t cur_abs_reg_val; uint32_t prev_abs_reg_val; + uint32_t status; cur_abs_reg_val = instance->func_ptr->read_fw_status_reg(instance); fw_state = cur_abs_reg_val & MFI_STATE_MASK; - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); while (fw_state != MFI_STATE_READY) { - con_log(CL_ANN, (CE_NOTE, + con_log(CL_ANN, (CE_CONT, "mfi_state_transition_to_ready:FW state%x", fw_state)); switch (fw_state) { case MFI_STATE_FAULT: - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN, (CE_NOTE, "mr_sas: FW in FAULT state!!")); return (ENODEV); @@ -2800,10 +3915,14 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * to be set */ /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ - WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | - MFI_INIT_HOTPLUG, instance); - - max_wait = 2; + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + } else { + WR_RESERVED0_REGISTER(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + } + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_WAIT_HANDSHAKE; break; case MFI_STATE_BOOT_MESSAGE_PENDING: @@ -2815,9 +3934,13 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) * to be set */ - WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); - - max_wait = 10; + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + } else { + WR_RESERVED0_REGISTER(MFI_INIT_HOTPLUG, + instance); + } + max_wait = (instance->tbolt == 1) ? 180 : 10; cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; break; case MFI_STATE_OPERATIONAL: @@ -2831,26 +3954,46 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * to be set */ /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ - WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + if (!instance->tbolt) { + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + } else { + WR_RESERVED0_REGISTER(MFI_RESET_FLAGS, + instance); + + for (i = 0; i < (10 * 1000); i++) { + status = + RD_RESERVED0_REGISTER(instance); + if (status & 1) { + delay(1 * + drv_usectohz(MILLISEC)); + } else { + break; + } + } - max_wait = 10; + } + max_wait = (instance->tbolt == 1) ? 180 : 10; cur_state = MFI_STATE_OPERATIONAL; break; case MFI_STATE_UNDEFINED: /* this state should not last for more than 2 seconds */ con_log(CL_ANN1, (CE_NOTE, "FW state undefined")); - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_UNDEFINED; break; case MFI_STATE_BB_INIT: - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 180 : 2; cur_state = MFI_STATE_BB_INIT; break; case MFI_STATE_FW_INIT: - max_wait = 2; + max_wait = (instance->tbolt == 1) ? 
180 : 2; cur_state = MFI_STATE_FW_INIT; break; + case MFI_STATE_FW_INIT_2: + max_wait = 180; + cur_state = MFI_STATE_FW_INIT_2; + break; case MFI_STATE_DEVICE_SCAN: max_wait = 180; cur_state = MFI_STATE_DEVICE_SCAN; @@ -2858,6 +4001,10 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) con_log(CL_NONE, (CE_NOTE, "Device scan in progress ...\n")); break; + case MFI_STATE_FLUSH_CACHE: + max_wait = 180; + cur_state = MFI_STATE_FLUSH_CACHE; + break; default: con_log(CL_ANN1, (CE_NOTE, "mr_sas: Unknown state 0x%x", fw_state)); @@ -2885,17 +4032,19 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) /* return error if fw_state hasn't changed after max_wait */ if (fw_state == cur_state) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_WARN, "FW state hasn't changed in %d secs", max_wait)); return (ENODEV); } }; - fw_ctrl = RD_IB_DOORBELL(instance); - - con_log(CL_ANN1, (CE_NOTE, - "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + if (!instance->tbolt) { + fw_ctrl = RD_IB_DOORBELL(instance); + con_log(CL_ANN1, (CE_CONT, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + } +#if 0 /* XXX KEBE ASKS, remove and use like pre-2208? */ /* * Write 0xF to the doorbell register to do the following. * - Abort all outstanding commands (bit 0). @@ -2904,11 +4053,14 @@ mfi_state_transition_to_ready(struct mrsas_instance *instance) * - Set to release FW to continue running (i.e. BIOS handshake * (bit 3). */ - WR_IB_DOORBELL(0xF, instance); - + if (!instance->tbolt) { + WR_IB_DOORBELL(0xF, instance); + } +#endif if (mrsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { return (ENODEV); } + return (DDI_SUCCESS); } @@ -2925,7 +4077,11 @@ get_seq_num(struct mrsas_instance *instance, struct mrsas_cmd *cmd; struct mrsas_dcmd_frame *dcmd; struct mrsas_evt_log_info *eli_tmp; - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { cmn_err(CE_WARN, "mr_sas: failed to get a cmd"); @@ -2933,13 +4089,13 @@ get_seq_num(struct mrsas_instance *instance, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (ENOMEM); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, cmd->index); - dcmd = &cmd->frame->dcmd; + dcmd = &cmd->frame->dcmd; /* allocate the data transfer buffer */ dcmd_dma_obj.size = sizeof (struct mrsas_evt_log_info); @@ -2951,8 +4107,8 @@ get_seq_num(struct mrsas_instance *instance, if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, - "get_seq_num: could not allocate data transfer buffer.")); + cmn_err(CE_WARN, + "get_seq_num: could not allocate data transfer buffer."); return (DDI_FAILURE); } @@ -2979,6 +4135,10 @@ get_seq_num(struct mrsas_instance *instance, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { cmn_err(CE_WARN, "get_seq_num: " "failed to issue MRSAS_DCMD_CTRL_EVENT_GET_INFO"); @@ -2993,12 +4153,12 @@ get_seq_num(struct mrsas_instance *instance, if (mrsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) ret = DDI_FAILURE; - if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) { - ret = DDI_FAILURE; + if (instance->tbolt) { + 
return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); } - return_mfi_pkt(instance, cmd); - return (ret); } @@ -3034,6 +4194,7 @@ start_mfi_aen(struct mrsas_instance *instance) return (-1); } + return (ret); } @@ -3045,9 +4206,11 @@ flush_cache(struct mrsas_instance *instance) { struct mrsas_cmd *cmd = NULL; struct mrsas_dcmd_frame *dcmd; - uint32_t max_cmd = instance->max_fw_cmds; - - cmd = instance->cmd_list[max_cmd]; + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN1, (CE_WARN, @@ -3056,7 +4219,7 @@ flush_cache(struct mrsas_instance *instance) instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return; } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -3080,11 +4243,21 @@ flush_cache(struct mrsas_instance *instance) cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { con_log(CL_ANN1, (CE_WARN, "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); } - con_log(CL_ANN1, (CE_NOTE, "flush_cache done")); + con_log(CL_ANN1, (CE_CONT, "flush_cache done")); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } + } /* @@ -3093,7 +4266,7 @@ flush_cache(struct mrsas_instance *instance) * @cmd: Command to be completed * */ -static void +void service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { uint32_t seq_num; @@ -3101,12 +4274,16 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) (struct mrsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; int rval = 0; int tgt = 0; + uint8_t dtype; +#ifdef PDSUPPORT + mrsas_pd_address_t *pd_addr; +#endif ddi_acc_handle_t acc_handle; - acc_handle = cmd->frame_dma_obj.acc_handle; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + acc_handle = cmd->frame_dma_obj.acc_handle; cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); - if (cmd->cmd_status == ENODATA) { cmd->cmd_status = 0; } @@ -3125,7 +4302,7 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) * Check for any ld devices that has changed state. i.e. online * or offline. 
*/ - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "AEN: code = %x class = %x locale = %x args = %x", ddi_get32(acc_handle, &evt_detail->code), evt_detail->cl.members.class, @@ -3136,6 +4313,10 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) case MR_EVT_CFG_CLEARED: { for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { if (instance->mr_ld_list[tgt].dip != NULL) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_ld_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); rval = mrsas_service_evt(instance, tgt, 0, MRSAS_EVT_UNCONFIG_TGT, NULL); con_log(CL_ANN1, (CE_WARN, @@ -3147,6 +4328,10 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) } case MR_EVT_LD_DELETED: { + tgt = ddi_get16(acc_handle, &evt_detail->args.ld.target_id); + mutex_enter(&instance->config_dev_mtx); + instance->mr_ld_list[tgt].flag = (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); rval = mrsas_service_evt(instance, ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, MRSAS_EVT_UNCONFIG_TGT, NULL); @@ -3167,6 +4352,86 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); break; } /* End of MR_EVT_LD_CREATED */ + +#ifdef PDSUPPORT + case MR_EVT_PD_REMOVED_EXT: { + if (instance->tbolt) { + pd_addr = &evt_detail->args.pd_addr; + dtype = pd_addr->scsi_dev_type; + con_log(CL_DLEVEL1, (CE_NOTE, + " MR_EVT_PD_REMOVED_EXT: dtype = %x," + " arg_type = %d ", dtype, evt_detail->arg_type)); + tgt = ddi_get16(acc_handle, + &evt_detail->args.pd.device_id); + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + rval = mrsas_service_evt(instance, ddi_get16( + acc_handle, &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_REMOVED:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + } + break; + } /* End of MR_EVT_PD_REMOVED_EXT */ + + case MR_EVT_PD_INSERTED_EXT: { + if (instance->tbolt) { + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_INSERTEDi_EXT:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + } + break; + } /* End of MR_EVT_PD_INSERTED_EXT */ + + case MR_EVT_PD_STATE_CHANGE: { + if (instance->tbolt) { + tgt = ddi_get16(acc_handle, + &evt_detail->args.pd.device_id); + if ((evt_detail->args.pd_state.prevState == + PD_SYSTEM) && + (evt_detail->args.pd_state.newState != PD_SYSTEM)) { + mutex_enter(&instance->config_dev_mtx); + instance->mr_tbolt_pd_list[tgt].flag = + (uint8_t)~MRDRV_TGT_VALID; + mutex_exit(&instance->config_dev_mtx); + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "mr_sas: PD_REMOVED:" + "rval = %d tgt id = %d ", rval, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + break; + } + if ((evt_detail->args.pd_state.prevState + == UNCONFIGURED_GOOD) && + (evt_detail->args.pd_state.newState == PD_SYSTEM)) { + rval = mrsas_service_evt(instance, + ddi_get16(acc_handle, + &evt_detail->args.pd.device_id), + 1, MRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas: PD_INSERTED: rval = %d " + " tgt id = %d ", rval, + 
ddi_get16(acc_handle, + &evt_detail->args.pd.device_id))); + break; + } + } + break; + } +#endif + } /* End of Main Switch */ /* get copy of seq_num and class/locale for re-registration */ @@ -3182,6 +4447,9 @@ service_mfi_aen(struct mrsas_instance *instance, struct mrsas_cmd *cmd) cmd->frame_count = 1; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + /* Issue the aen registration frame */ instance->func_ptr->issue_cmd(cmd, instance); } @@ -3204,14 +4472,16 @@ complete_cmd_in_sync_mode(struct mrsas_instance *instance, cmd->sync_cmd = MRSAS_FALSE; - if (cmd->cmd_status == ENODATA) { - cmd->cmd_status = 0; - } - con_log(CL_ANN1, (CE_NOTE, "complete_cmd_in_sync_mode called %p \n", (void *)cmd)); + mutex_enter(&instance->int_cmd_mtx); + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } cv_broadcast(&instance->int_cmd_cv); + mutex_exit(&instance->int_cmd_mtx); + } /* @@ -3229,20 +4499,22 @@ mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *instance) cur_abs_reg_val = instance->func_ptr->read_fw_status_reg(instance); fw_state = cur_abs_reg_val & MFI_STATE_MASK; if (fw_state == MFI_STATE_FAULT) { - if (instance->disable_online_ctrl_reset == 1) { - con_log(CL_ANN1, (CE_NOTE, - "mrsas_initiate_ocr_if_fw_is_faulty: " - "FW in Fault state, detected in ISR: " - "FW doesn't support ocr ")); - return (ADAPTER_RESET_NOT_REQUIRED); + cmn_err(CE_WARN, + "mrsas_initiate_ocr_if_fw_is_faulty: " + "FW in Fault state, detected in ISR: " + "FW doesn't support ocr "); + + return (ADAPTER_RESET_NOT_REQUIRED); } else { - con_log(CL_ANN1, (CE_NOTE, - "mrsas_initiate_ocr_if_fw_is_faulty: " - "FW in Fault state, detected in ISR: FW supports ocr ")); + con_log(CL_ANN, (CE_NOTE, + "mrsas_initiate_ocr_if_fw_is_faulty: FW in Fault " + "state, detected in ISR: FW supports ocr ")); + return (ADAPTER_RESET_REQUIRED); } } + return (ADAPTER_RESET_NOT_REQUIRED); } @@ -3264,7 +4536,7 @@ mrsas_softintr(struct mrsas_instance *instance) struct mrsas_header *hdr; struct scsi_arq_status *arqstat; - con_log(CL_ANN1, (CE_CONT, "mrsas_softintr called")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_softintr() called.")); ASSERT(instance); @@ -3341,7 +4613,7 @@ mrsas_softintr(struct mrsas_instance *instance) | STATE_GOT_TARGET | STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; - con_log(CL_ANN1, (CE_CONT, + con_log(CL_ANN, (CE_CONT, "CDB[0] = %x completed for %s: size %lx context %x", pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), acmd->cmd_dmacount, hdr->context)); @@ -3394,17 +4666,15 @@ mrsas_softintr(struct mrsas_instance *instance) break; case MFI_STAT_SCSI_DONE_WITH_ERROR: - con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + con_log(CL_ANN, (CE_CONT, "scsi_done error")); pkt->pkt_reason = CMD_CMPLT; ((struct scsi_status *) pkt->pkt_scbp)->sts_chk = 1; if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { - con_log(CL_ANN, (CE_WARN, "TEST_UNIT_READY fail")); - } else { pkt->pkt_state |= STATE_ARQ_DONE; arqstat = (void *)(pkt->pkt_scbp); @@ -3421,14 +4691,13 @@ mrsas_softintr(struct mrsas_instance *instance) (uint8_t *) &(arqstat->sts_sensedata), cmd->sense, - acmd->cmd_scblen - - offsetof(struct scsi_arq_status, - sts_sensedata), DDI_DEV_AUTOINCR); - } + sizeof (struct scsi_extended_sense), + DDI_DEV_AUTOINCR); + } break; case MFI_STAT_LD_OFFLINE: case MFI_STAT_DEVICE_NOT_FOUND: - con_log(CL_ANN1, (CE_CONT, + con_log(CL_ANN, (CE_CONT, "mrsas_softintr:device not found error")); pkt->pkt_reason = CMD_DEV_GONE; pkt->pkt_statistics = STAT_DISCON; @@ -3488,19 +4757,22 @@ mrsas_softintr(struct mrsas_instance *instance) if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_softintr: " + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_softintr: " "posting to scsa cmd %p index %x pkt %p " "time %llx", (void *)cmd, cmd->index, (void *)pkt, gethrtime())); (*pkt->pkt_comp)(pkt); } + return_mfi_pkt(instance, cmd); break; + case MFI_CMD_OP_SMP: case MFI_CMD_OP_STP: complete_cmd_in_sync_mode(instance, cmd); break; + case MFI_CMD_OP_DCMD: /* see if got an event notification */ if (ddi_get32(cmd->frame_dma_obj.acc_handle, @@ -3521,14 +4793,16 @@ mrsas_softintr(struct mrsas_instance *instance) } break; + case MFI_CMD_OP_ABORT: - con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + con_log(CL_ANN, (CE_NOTE, "MFI_CMD_OP_ABORT complete")); /* * MFI_CMD_OP_ABORT successfully completed * in the synchronous mode */ complete_cmd_in_sync_mode(instance, cmd); break; + default: mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -3563,7 +4837,7 @@ mrsas_softintr(struct mrsas_instance *instance) * * Allocate the memory and other resources for an dma object. */ -static int +int mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, uchar_t endian_flags) { @@ -3610,6 +4884,11 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, return (-1); } + if (obj->dma_handle == NULL) { + /* XXX KEBE ASKS --> fm_service_impact()? */ + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + return (-1); + } if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, @@ -3622,6 +4901,14 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, return (-1); } + if (obj->acc_handle == NULL) { + /* XXX KEBE ASKS --> fm_service_impact()? 
*/ + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + return (-1); + } if (mrsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); @@ -3642,10 +4929,19 @@ mrsas_alloc_dma_obj(struct mrsas_instance *instance, dma_obj_t *obj, * De-allocate the memory and other resources for an dma object, which must * have been alloated by a previous call to mrsas_alloc_dma_obj() */ -static int +/* ARGSUSED */ +int mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) { + if ((obj.dma_handle == NULL) || (obj.acc_handle == NULL)) { + return (DDI_SUCCESS); + } + + /* + * NOTE: These check-handle functions fail if *_handle == NULL, but + * this function succeeds because of the previous check. + */ if (mrsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); return (DDI_FAILURE); @@ -3659,7 +4955,7 @@ mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) (void) ddi_dma_unbind_handle(obj.dma_handle); ddi_dma_mem_free(&obj.acc_handle); ddi_dma_free_handle(&obj.dma_handle); - + obj.acc_handle = NULL; return (DDI_SUCCESS); } @@ -3669,7 +4965,7 @@ mrsas_free_dma_obj(struct mrsas_instance *instance, dma_obj_t obj) * * Allocate dma resources for a new scsi command */ -static int +int mrsas_dma_alloc(struct mrsas_instance *instance, struct scsi_pkt *pkt, struct buf *bp, int flags, int (*callback)()) { @@ -3705,6 +5001,13 @@ mrsas_dma_alloc(struct mrsas_instance *instance, struct scsi_pkt *pkt, tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + if (instance->tbolt) { + /* OCR-RESET FIX */ + tmp_dma_attr.dma_attr_count_max = + (U64)mrsas_tbolt_max_cap_maxxfer; /* limit to 256K */ + tmp_dma_attr.dma_attr_maxxfer = + (U64)mrsas_tbolt_max_cap_maxxfer; /* limit to 256K */ + } if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { @@ -3816,7 +5119,7 @@ no_dma_cookies: * move dma resources to next dma window * */ -static int +int mrsas_dma_move(struct mrsas_instance *instance, struct scsi_pkt *pkt, struct buf *bp) { @@ -3886,14 +5189,15 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, { uint16_t flags = 0; uint32_t i; - uint32_t context; + uint32_t context; uint32_t sge_bytes; + uint32_t tmp_data_xfer_len; ddi_acc_handle_t acc_handle; struct mrsas_cmd *cmd; struct mrsas_sge64 *mfi_sgl; struct mrsas_sge_ieee *mfi_sgl_ieee; struct scsa_cmd *acmd = PKT2CMD(pkt); - struct mrsas_pthru_frame *pthru; + struct mrsas_pthru_frame *pthru; struct mrsas_io_frame *ldio; /* find out if this is logical or physical drive command. 
*/ @@ -3908,8 +5212,6 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, return (NULL); } - cmd->retry_count_for_ocr = 0; - acc_handle = cmd->frame_dma_obj.acc_handle; /* Clear the frame buffer and assign back the context id */ @@ -3951,7 +5253,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, /* * case SCMD_SYNCHRONIZE_CACHE: - * flush_cache(instance); + * flush_cache(instance); * return_mfi_pkt(instance, cmd); * *cmd_done = 1; * @@ -3962,6 +5264,10 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, case SCMD_WRITE: case SCMD_READ_G1: case SCMD_WRITE_G1: + case SCMD_READ_G4: + case SCMD_WRITE_G4: + case SCMD_READ_G5: + case SCMD_WRITE_G5: if (acmd->islogical) { ldio = (struct mrsas_io_frame *)cmd->frame; @@ -4001,6 +5307,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, context = ddi_get32(acc_handle, &ldio->context); if (acmd->cmd_cdblen == CDB_GROUP0) { + /* 6-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( (uint16_t)(pkt->pkt_cdbp[4]))); @@ -4010,6 +5317,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) << 16))); } else if (acmd->cmd_cdblen == CDB_GROUP1) { + /* 10-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( ((uint16_t)(pkt->pkt_cdbp[8])) | ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); @@ -4019,24 +5327,26 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); - } else if (acmd->cmd_cdblen == CDB_GROUP2) { + } else if (acmd->cmd_cdblen == CDB_GROUP5) { + /* 12-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( - ((uint16_t)(pkt->pkt_cdbp[9])) | - ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | - ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | - ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); ddi_put32(acc_handle, &ldio->start_lba_lo, ( ((uint32_t)(pkt->pkt_cdbp[5])) | ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); - } else if (acmd->cmd_cdblen == CDB_GROUP3) { + } else if (acmd->cmd_cdblen == CDB_GROUP4) { + /* 16-byte cdb */ ddi_put32(acc_handle, &ldio->lba_count, ( - ((uint16_t)(pkt->pkt_cdbp[13])) | - ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | - ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | - ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + ((uint32_t)(pkt->pkt_cdbp[13])) | + ((uint32_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[10]) << 24))); ddi_put32(acc_handle, &ldio->start_lba_lo, ( ((uint32_t)(pkt->pkt_cdbp[9])) | @@ -4044,7 +5354,7 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); - ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ddi_put32(acc_handle, &ldio->start_lba_hi, ( ((uint32_t)(pkt->pkt_cdbp[5])) | ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | @@ -4090,8 +5400,12 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); ddi_put16(acc_handle, &pthru->timeout, 0); ddi_put16(acc_handle, &pthru->flags, flags); + tmp_data_xfer_len = 0; + for (i = 0; i < acmd->cmd_cookiecnt; i++) { + tmp_data_xfer_len += acmd->cmd_dmacookies[i].dmac_size; + } 
ddi_put32(acc_handle, &pthru->data_xfer_len, - acmd->cmd_dmacount); + tmp_data_xfer_len); ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); if (instance->flag_ieee) { mfi_sgl_ieee = (struct mrsas_sge_ieee *)&pthru->sgl; @@ -4142,7 +5456,16 @@ build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, return (cmd); } + #ifndef __sparc +/* + * wait_for_outstanding - Wait for all outstanding cmds + * @instance: Adapter soft state + * + * This function waits for upto MRDRV_RESET_WAIT_TIME seconds for FW to + * complete all its outstanding commands. Returns error if one or more IOs + * are pending after this time period. + */ static int wait_for_outstanding(struct mrsas_instance *instance) { @@ -4153,6 +5476,7 @@ wait_for_outstanding(struct mrsas_instance *instance) if (!instance->fw_outstanding) { break; } + drv_usecwait(MILLISEC); /* wait for 1000 usecs */; } @@ -4162,7 +5486,8 @@ wait_for_outstanding(struct mrsas_instance *instance) return (0); } -#endif /* __sparc */ +#endif /* __sparc */ + /* * issue_mfi_pthru */ @@ -4173,6 +5498,7 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *ubuf; uint32_t kphys_addr = 0; uint32_t xferlen = 0; + uint32_t new_xfer_length = 0; uint_t model; ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; dma_obj_t pthru_dma_obj; @@ -4183,24 +5509,24 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kpthru = (struct mrsas_pthru_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "issue_mfi_pthru: Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP32")); xferlen = kpthru->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP32")); xferlen = kpthru->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_pthru: DDI_MODEL_LP64")); xferlen = kpthru->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; #endif @@ -4209,7 +5535,10 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - pthru_dma_obj.size = xferlen; + /* pthru_dma_obj.size = xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(xferlen, new_xfer_length, + PAGESIZE); + pthru_dma_obj.size = new_xfer_length; pthru_dma_obj.dma_attr = mrsas_generic_dma_attr; pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4243,7 +5572,7 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); - ddi_put8(acc_handle, &pthru->sense_len, 0); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); ddi_put8(acc_handle, &pthru->cmd_status, 0); ddi_put8(acc_handle, &pthru->scsi_status, 0); ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); @@ -4254,8 +5583,8 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl 
*ioctl, ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); - /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ - ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; + /* ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); */ ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, pthru->cdb_len, DDI_DEV_AUTOINCR); @@ -4267,6 +5596,10 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: fw_ioctl failed")); @@ -4288,11 +5621,35 @@ issue_mfi_pthru(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); - con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + con_log(CL_ANN, (CE_CONT, "issue_mfi_pthru: cmd_status %x, " "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); DTRACE_PROBE3(issue_pthru, uint8_t, kpthru->cmd, uint8_t, kpthru->cmd_status, uint8_t, kpthru->scsi_status); + if (kpthru->sense_len) { + uint_t sense_len = SENSE_LENGTH; + void *sense_ubuf = + (void *)(ulong_t)kpthru->sense_buf_phys_addr_lo; + if (kpthru->sense_len <= SENSE_LENGTH) { + sense_len = kpthru->sense_len; + } + + for (i = 0; i < sense_len; i++) { + if (ddi_copyout( + (uint8_t *)cmd->sense+i, + (uint8_t *)sense_ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + } + con_log(CL_DLEVEL1, (CE_WARN, + "Copying Sense info sense_buff[%d] = 0x%X\n", + i, *((uint8_t *)cmd->sense + i))); + } + } + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + if (xferlen) { /* free kernel buffer */ if (mrsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) @@ -4312,6 +5669,7 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *ubuf; uint32_t kphys_addr = 0; uint32_t xferlen = 0; + uint32_t new_xfer_length = 0; uint32_t model; dma_obj_t dcmd_dma_obj; struct mrsas_dcmd_frame *kdcmd; @@ -4320,25 +5678,26 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, int i; dcmd = &cmd->frame->dcmd; kdcmd = (struct mrsas_dcmd_frame *)&ioctl->frame[0]; + if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_dcmd: DDI_MODEL_LP64")); xferlen = kdcmd->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; #endif @@ 
-4346,7 +5705,10 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - dcmd_dma_obj.size = xferlen; + /* dcmd_dma_obj.size = xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(xferlen, new_xfer_length, + PAGESIZE); + dcmd_dma_obj.size = new_xfer_length; dcmd_dma_obj.dma_attr = mrsas_generic_dma_attr; dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4354,12 +5716,13 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, dcmd_dma_obj.dma_attr.dma_attr_align = 1; /* allocate kernel buffer for DMA */ - if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, - (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { - con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " - "could not allocate data transfer buffer.")); - return (DDI_FAILURE); - } + if (mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, + (CE_WARN, "issue_mfi_dcmd: could not " + "allocate data transfer buffer.")); + return (DDI_FAILURE); + } (void) memset(dcmd_dma_obj.buffer, 0, xferlen); /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ @@ -4396,6 +5759,10 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); } else { @@ -4415,6 +5782,8 @@ issue_mfi_dcmd(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + con_log(CL_ANN, + (CE_CONT, "issue_mfi_dcmd: cmd_status %x", kdcmd->cmd_status)); DTRACE_PROBE3(issue_dcmd, uint32_t, kdcmd->opcode, uint8_t, kdcmd->cmd, uint8_t, kdcmd->cmd_status); @@ -4438,6 +5807,8 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *response_ubuf; uint32_t request_xferlen = 0; uint32_t response_xferlen = 0; + uint32_t new_xfer_length1 = 0; + uint32_t new_xfer_length2 = 0; uint_t model; dma_obj_t request_dma_obj; dma_obj_t response_dma_obj; @@ -4455,44 +5826,44 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ksmp = (struct mrsas_smp_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &ksmp->sgl[0].sge32[0]; response_xferlen = sge32[0].length; request_xferlen = sge32[1].length; - con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_smp: " "response_xferlen = %x, request_xferlen = %x", response_xferlen, request_xferlen)); response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: " "response_ubuf = %p, request_ubuf = %p", response_ubuf, request_ubuf)); } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = 
&ksmp->sgl[0].sge32[0]; response_xferlen = sge32[0].length; request_xferlen = sge32[1].length; - con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_smp: " "response_xferlen = %x, request_xferlen = %x", response_xferlen, request_xferlen)); response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: " "response_ubuf = %p, request_ubuf = %p", response_ubuf, request_ubuf)); #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_LP64")); sge64 = &ksmp->sgl[0].sge64[0]; response_xferlen = sge64[0].length; @@ -4505,7 +5876,10 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (request_xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - request_dma_obj.size = request_xferlen; + /* request_dma_obj.size = request_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(request_xferlen, + new_xfer_length1, PAGESIZE); + request_dma_obj.size = new_xfer_length1; request_dma_obj.dma_attr = mrsas_generic_dma_attr; request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4536,7 +5910,10 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (response_xferlen) { /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - response_dma_obj.size = response_xferlen; + /* response_dma_obj.size = response_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(response_xferlen, + new_xfer_length2, PAGESIZE); + response_dma_obj.size = new_xfer_length2; response_dma_obj.dma_attr = mrsas_generic_dma_attr; response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4580,7 +5957,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &smp->sgl[0].sge32[0]; @@ -4592,7 +5969,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, request_dma_obj.dma_cookie[0].dmac_address); } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_ILP32")); sge32 = &smp->sgl[0].sge32[0]; ddi_put32(acc_handle, &sge32[0].length, response_xferlen); @@ -4602,7 +5979,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ddi_put32(acc_handle, &sge32[1].phys_addr, request_dma_obj.dma_cookie[0].dmac_address); #else - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: DDI_MODEL_LP64")); sge64 = &smp->sgl[0].sge64[0]; ddi_put32(acc_handle, &sge64[0].length, response_xferlen); @@ -4613,7 +5990,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, request_dma_obj.dma_cookie[0].dmac_address); #endif } - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp : " "smp->response_xferlen = %d, smp->request_xferlen = %d " "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), ddi_get32(acc_handle, &sge32[1].length), @@ -4622,11 +5999,15 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } 
+ if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: fw_ioctl failed")); } else { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "issue_mfi_smp: copy to user space")); if (request_xferlen) { @@ -4660,7 +6041,7 @@ issue_mfi_smp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", - ddi_get8(acc_handle, &smp->cmd_status))); + ksmp->cmd_status)); DTRACE_PROBE2(issue_smp, uint8_t, ksmp->cmd, uint8_t, ksmp->cmd_status); if (request_xferlen) { @@ -4690,6 +6071,8 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, void *fis_ubuf; void *data_ubuf; uint32_t fis_xferlen = 0; + uint32_t new_xfer_length1 = 0; + uint32_t new_xfer_length2 = 0; uint32_t data_xferlen = 0; uint_t model; dma_obj_t fis_dma_obj; @@ -4703,24 +6086,22 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, kstp = (struct mrsas_stp_frame *)&ioctl->frame[0]; if (instance->adapterresetinprogress) { - con_log(CL_ANN1, (CE_NOTE, "Reset flag set, " + con_log(CL_ANN1, (CE_WARN, "Reset flag set, " "returning mfi_pkt and setting TRAN_BUSY\n")); return (DDI_FAILURE); } model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_ILP32")); fis_xferlen = kstp->sgl.sge32[0].length; data_xferlen = kstp->sgl.sge32[1].length; fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; - } - else - { + } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_ILP32")); fis_xferlen = kstp->sgl.sge32[0].length; data_xferlen = kstp->sgl.sge32[1].length; @@ -4728,7 +6109,7 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + con_log(CL_ANN1, (CE_CONT, "issue_mfi_stp: DDI_MODEL_LP64")); fis_xferlen = kstp->sgl.sge64[0].length; data_xferlen = kstp->sgl.sge64[1].length; @@ -4740,12 +6121,15 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (fis_xferlen) { - con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + con_log(CL_ANN, (CE_CONT, "issue_mfi_stp: " "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - fis_dma_obj.size = fis_xferlen; + /* fis_dma_obj.size = fis_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(fis_xferlen, + new_xfer_length1, PAGESIZE); + fis_dma_obj.size = new_xfer_length1; fis_dma_obj.dma_attr = mrsas_generic_dma_attr; fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; @@ -4773,19 +6157,22 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } if (data_xferlen) { - con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + con_log(CL_ANN, (CE_CONT, "issue_mfi_stp: data_ubuf = %p " "data_xferlen = %x", data_ubuf, data_xferlen)); /* means IOCTL requires DMA */ /* allocate the data transfer buffer */ - data_dma_obj.size = data_xferlen; + /* data_dma_obj.size = data_xferlen; */ + MRSAS_GET_BOUNDARY_ALIGNED_LEN(data_xferlen, 
new_xfer_length2, + PAGESIZE); + data_dma_obj.size = new_xfer_length2; data_dma_obj.dma_attr = mrsas_generic_dma_attr; data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; data_dma_obj.dma_attr.dma_attr_sgllen = 1; data_dma_obj.dma_attr.dma_attr_align = 1; -/* allocate kernel buffer for DMA */ + /* allocate kernel buffer for DMA */ if (mrsas_alloc_dma_obj(instance, &data_dma_obj, (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " @@ -4829,6 +6216,10 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, cmd->sync_cmd = MRSAS_TRUE; cmd->frame_count = 1; + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); } else { @@ -4860,6 +6251,8 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, } kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: stp->cmd_status = %d", + kstp->cmd_status)); DTRACE_PROBE2(issue_stp, uint8_t, kstp->cmd, uint8_t, kstp->cmd_status); if (fis_xferlen) { @@ -4880,7 +6273,7 @@ issue_mfi_stp(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, /* * fill_up_drv_ver */ -static void +void fill_up_drv_ver(struct mrsas_drv_ver *dv) { (void) memset(dv, 0, sizeof (struct mrsas_drv_ver)); @@ -4891,6 +6284,7 @@ fill_up_drv_ver(struct mrsas_drv_ver *dv) (void) memcpy(dv->drv_ver, MRSAS_VERSION, strlen(MRSAS_VERSION)); (void) memcpy(dv->drv_rel_date, MRSAS_RELDATE, strlen(MRSAS_RELDATE)); + } /* @@ -4917,7 +6311,7 @@ handle_drv_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, model = ddi_model_convert_from(mode & FMODELS); if (model == DDI_MODEL_ILP32) { - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; @@ -4925,23 +6319,23 @@ handle_drv_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; } else { #ifdef _ILP32 - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_ILP32")); xferlen = kdcmd->sgl.sge32[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; #else - con_log(CL_ANN1, (CE_NOTE, + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: DDI_MODEL_LP64")); xferlen = kdcmd->sgl.sge64[0].length; ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; #endif } - con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: " "dataBuf=%p size=%d bytes", ubuf, xferlen)); switch (kdcmd->opcode) { case MRSAS_DRIVER_IOCTL_DRIVER_VERSION: - con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + con_log(CL_ANN1, (CE_CONT, "handle_drv_ioctl: " "MRSAS_DRIVER_IOCTL_DRIVER_VERSION")); fill_up_drv_ver(&dv); @@ -5017,8 +6411,11 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, struct mrsas_header *hdr; struct mrsas_cmd *cmd; - cmd = get_mfi_pkt(instance); - + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { con_log(CL_ANN, (CE_WARN, "mr_sas: " "failed to get a cmd packet")); @@ -5026,7 +6423,6 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (DDI_FAILURE); } - cmd->retry_count_for_ocr = 0; /* Clear the frame buffer and assign back 
the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); @@ -5059,7 +6455,11 @@ handle_mfi_ioctl(struct mrsas_instance *instance, struct mrsas_ioctl *ioctl, if (mrsas_common_check(instance, cmd) != DDI_SUCCESS) rval = DDI_FAILURE; - return_mfi_pkt(instance, cmd); + if (instance->tbolt) { + return_raid_msg_mfi_pkt(instance, cmd); + } else { + return_mfi_pkt(instance, cmd); + } return (rval); } @@ -5091,6 +6491,7 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, union mrsas_evt_class_locale curr_aen; union mrsas_evt_class_locale prev_aen; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); /* * If there an AEN pending already (aen_cmd), check if the * class_locale of that pending AEN is inclusive of the new @@ -5151,14 +6552,18 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, curr_aen.members.locale = LE_16(curr_aen.members.locale); } - cmd = get_mfi_pkt(instance); + if (instance->tbolt) { + cmd = get_raid_msg_mfi_pkt(instance); + } else { + cmd = get_mfi_pkt(instance); + } if (!cmd) { DTRACE_PROBE2(mfi_aen_err, uint16_t, instance->fw_outstanding, uint16_t, instance->max_fw_cmds); return (ENOMEM); } - cmd->retry_count_for_ocr = 0; + /* Clear the frame buffer and assign back the context id */ (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, @@ -5207,12 +6612,15 @@ register_mfi_aen(struct mrsas_instance *instance, uint32_t seq_num, /* Issue the aen registration frame */ /* atomic_add_16 (&instance->fw_outstanding, 1); */ + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } instance->func_ptr->issue_cmd(cmd, instance); return (0); } -static void +void display_scsi_inquiry(caddr_t scsi_inq) { #define MAX_SCSI_DEVICE_CODE 14 @@ -5220,38 +6628,38 @@ display_scsi_inquiry(caddr_t scsi_inq) char inquiry_buf[256] = {0}; int len; const char *const scsi_device_types[] = { - "Direct-Access ", + "Direct-Access ", "Sequential-Access", - "Printer ", - "Processor ", - "WORM ", - "CD-ROM ", - "Scanner ", - "Optical Device ", - "Medium Changer ", - "Communications ", - "Unknown ", - "Unknown ", - "Unknown ", - "Enclosure ", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", }; len = 0; - len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); for (i = 8; i < 16; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", scsi_inq[i]); } - len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); for (i = 16; i < 32; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", scsi_inq[i]); } - len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); for (i = 32; i < 36; i++) { len += snprintf(inquiry_buf + len, 265 - len, "%c", @@ -5264,13 +6672,13 @@ display_scsi_inquiry(caddr_t scsi_inq) i = scsi_inq[0] & 0x1f; - len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", i < MAX_SCSI_DEVICE_CODE ? 
scsi_device_types[i] : - "Unknown "); + "Unknown "); len += snprintf(inquiry_buf + len, 265 - len, - " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); @@ -5278,7 +6686,7 @@ display_scsi_inquiry(caddr_t scsi_inq) len += snprintf(inquiry_buf + len, 265 - len, "\n"); } - con_log(CL_ANN1, (CE_CONT, inquiry_buf)); + con_log(CL_DLEVEL2, (CE_CONT, inquiry_buf)); } static void @@ -5294,8 +6702,9 @@ io_timeout_checker(void *arg) mlist_t process_list; if (instance->adapterresetinprogress == 1) { - con_log(CL_ANN1, (CE_NOTE, "io_timeout_checker" + con_log(CL_ANN, (CE_NOTE, "io_timeout_checker:" " reset in progress")); + instance->timeout_id = timeout(io_timeout_checker, (void *) instance, drv_usectohz(MRSAS_1_SECOND)); return; @@ -5303,10 +6712,18 @@ io_timeout_checker(void *arg) /* See if this check needs to be in the beginning or last in ISR */ if (mrsas_initiate_ocr_if_fw_is_faulty(instance) == 1) { - con_log(CL_ANN1, (CE_NOTE, - "Fw Fault state Handling in io_timeout_checker")); + cmn_err(CE_WARN, "io_timeout_checker: " + "FW Fault, calling reset adapter"); + cmn_err(CE_CONT, "io_timeout_checker: " + "fw_outstanding 0x%X max_fw_cmds 0x%X", + instance->fw_outstanding, instance->max_fw_cmds); if (instance->adapterresetinprogress == 0) { - (void) mrsas_reset_ppc(instance); + instance->adapterresetinprogress = 1; + if (instance->tbolt) + (void) mrsas_tbolt_reset_ppc(instance); + else + (void) mrsas_reset_ppc(instance); + instance->adapterresetinprogress = 0; } instance->timeout_id = timeout(io_timeout_checker, (void *) instance, drv_usectohz(MRSAS_1_SECOND)); @@ -5337,10 +6754,12 @@ io_timeout_checker(void *arg) time = --cmd->drv_pkt_time; } if (time <= 0) { - con_log(CL_ANN1, (CE_NOTE, "%llx: " - "io_timeout_checker: TIMING OUT: pkt " - ": %p, cmd %p", gethrtime(), (void *)pkt, - (void *)cmd)); + cmn_err(CE_WARN, "%llx: " + "io_timeout_checker: TIMING OUT: pkt: %p, " + "cmd %p fw_outstanding 0x%X max_fw_cmds 0x%X\n", + gethrtime(), (void *)pkt, (void *)cmd, + instance->fw_outstanding, instance->max_fw_cmds); + counter++; break; } @@ -5348,52 +6767,57 @@ io_timeout_checker(void *arg) mutex_exit(&instance->cmd_pend_mtx); if (counter) { - con_log(CL_ANN1, (CE_NOTE, - "io_timeout_checker " - "cmd->retrycount_for_ocr %d, " - "cmd index %d , cmd address %p ", - cmd->retry_count_for_ocr+1, cmd->index, (void *)cmd)); - if (instance->disable_online_ctrl_reset == 1) { - con_log(CL_ANN1, (CE_NOTE, "mrsas: " - "OCR is not supported by the Firmware " - "Failing all the queued packets \n")); + cmn_err(CE_WARN, "mr_sas %d: %s(): OCR is NOT " + "supported by Firmware, KILL adapter!!!", + instance->instance, __func__); + + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); - (void) mrsas_kill_adapter(instance); return; } else { - if (cmd->retry_count_for_ocr <= IO_RETRY_COUNT) { + if (cmd->retry_count_for_ocr <= IO_RETRY_COUNT) { if (instance->adapterresetinprogress == 0) { - con_log(CL_ANN1, (CE_NOTE, "mrsas: " - "OCR is supported by FW " - "triggering mrsas_reset_ppc")); - (void) mrsas_reset_ppc(instance); + if (instance->tbolt) { + (void) mrsas_tbolt_reset_ppc( + instance); + } else { + (void) mrsas_reset_ppc( + instance); + } } } else { - con_log(CL_ANN1, (CE_NOTE, - "io_timeout_checker:" - " cmdindex: %d,cmd address: %p " + cmn_err(CE_WARN, + "io_timeout_checker: " + "cmd %p cmd->index %d " 
"timed out even after 3 resets: " - "so kill adapter", cmd->index, - (void *)cmd)); - (void) mrsas_kill_adapter(instance); + "so KILL adapter", (void *)cmd, cmd->index); + + mrsas_print_cmd_details(instance, cmd, 0xDD); + + if (instance->tbolt) + mrsas_tbolt_kill_adapter(instance); + else + (void) mrsas_kill_adapter(instance); return; } } } - - - con_log(CL_ANN1, (CE_NOTE, "mrsas: " + con_log(CL_ANN, (CE_NOTE, "mrsas: " "schedule next timeout check: " "do timeout \n")); instance->timeout_id = timeout(io_timeout_checker, (void *)instance, drv_usectohz(MRSAS_1_SECOND)); } -static int + +static uint32_t read_fw_status_reg_ppc(struct mrsas_instance *instance) { - return ((int)RD_OB_SCRATCH_PAD_0(instance)); + return ((uint32_t)RD_OB_SCRATCH_PAD_0(instance)); } static void @@ -5404,7 +6828,7 @@ issue_cmd_ppc(struct mrsas_cmd *cmd, struct mrsas_instance *instance) pkt = cmd->pkt; if (pkt) { - con_log(CL_ANN1, (CE_CONT, "%llx : issue_cmd_ppc:" + con_log(CL_DLEVEL1, (CE_NOTE, "%llx : issue_cmd_ppc:" "ISSUED CMD TO FW : called : cmd:" ": %p instance : %p pkt : %p pkt_time : %x\n", gethrtime(), (void *)cmd, (void *)instance, @@ -5417,13 +6841,18 @@ issue_cmd_ppc(struct mrsas_cmd *cmd, struct mrsas_instance *instance) } } else { - con_log(CL_ANN1, (CE_CONT, "%llx : issue_cmd_ppc:" + con_log(CL_DLEVEL1, (CE_NOTE, "%llx : issue_cmd_ppc:" "ISSUED CMD TO FW : called : cmd : %p, instance: %p" "(NO PKT)\n", gethrtime(), (void *)cmd, (void *)instance)); } + + mutex_enter(&instance->reg_write_mtx); + ASSERT(mutex_owned(&instance->reg_write_mtx)); /* Issue the command to the FW */ WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + mutex_exit(&instance->reg_write_mtx); + } /* @@ -5444,10 +6873,12 @@ struct mrsas_cmd *cmd) cmd->frame_dma_obj.acc_handle, &hdr->timeout); if (cmd->drv_pkt_time < debug_timeout_g) cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN1, (CE_NOTE, "sync_mode_ppc: " "issue and return in reset case\n")); WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + return (DDI_SUCCESS); } else { con_log(CL_ANN1, (CE_NOTE, "sync_mode_ppc: pushing the pkt\n")); @@ -5456,15 +6887,17 @@ struct mrsas_cmd *cmd) cmd->cmd_status = ENODATA; + mutex_enter(&instance->reg_write_mtx); + ASSERT(mutex_owned(&instance->reg_write_mtx)); + /* Issue the command to the FW */ WR_IB_QPORT((cmd->frame_phys_addr) | (((cmd->frame_count - 1) << 1) | 1), instance); + mutex_exit(&instance->reg_write_mtx); mutex_enter(&instance->int_cmd_mtx); - for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); } - mutex_exit(&instance->int_cmd_mtx); con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); @@ -5494,7 +6927,7 @@ issue_cmd_in_poll_mode_ppc(struct mrsas_instance *instance, ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, MFI_CMD_STATUS_POLL_MODE); flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); - flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); @@ -5511,7 +6944,7 @@ issue_cmd_in_poll_mode_ppc(struct mrsas_instance *instance, if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) == MFI_CMD_STATUS_POLL_MODE) { - con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode: " + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " "cmd polling timed out")); return (DDI_FAILURE); } @@ -5607,18 +7040,18 @@ intr_ack_ppc(struct 
mrsas_instance *instance) static int mrsas_kill_adapter(struct mrsas_instance *instance) { - if (instance->deadadapter == 1) - return (DDI_FAILURE); + if (instance->deadadapter == 1) + return (DDI_FAILURE); - con_log(CL_ANN1, (CE_NOTE, "mrsas_kill_adapter: " - "Writing to doorbell with MFI_STOP_ADP ")); - mutex_enter(&instance->ocr_flags_mtx); - instance->deadadapter = 1; - mutex_exit(&instance->ocr_flags_mtx); - instance->func_ptr->disable_intr(instance); - WR_IB_DOORBELL(MFI_STOP_ADP, instance); - (void) mrsas_complete_pending_cmds(instance); - return (DDI_SUCCESS); + con_log(CL_ANN1, (CE_NOTE, "mrsas_kill_adapter: " + "Writing to doorbell with MFI_STOP_ADP ")); + mutex_enter(&instance->ocr_flags_mtx); + instance->deadadapter = 1; + mutex_exit(&instance->ocr_flags_mtx); + instance->func_ptr->disable_intr(instance); + WR_IB_DOORBELL(MFI_STOP_ADP, instance); + (void) mrsas_complete_pending_cmds(instance); + return (DDI_SUCCESS); } @@ -5630,9 +7063,11 @@ mrsas_reset_ppc(struct mrsas_instance *instance) uint32_t cur_abs_reg_val; uint32_t fw_state; + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (instance->deadadapter == 1) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " - "no more resets as HBA has been marked dead ")); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "no more resets as HBA has been marked dead "); return (DDI_FAILURE); } mutex_enter(&instance->ocr_flags_mtx); @@ -5640,6 +7075,7 @@ mrsas_reset_ppc(struct mrsas_instance *instance) mutex_exit(&instance->ocr_flags_mtx); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: adpterresetinprogress " "flag set, time %llx", gethrtime())); + instance->func_ptr->disable_intr(instance); retry_reset: WR_IB_WRITE_SEQ(0, instance); @@ -5657,8 +7093,8 @@ retry_reset: delay(100 * drv_usectohz(MILLISEC)); status = RD_OB_DRWE(instance); if (retry++ == 100) { - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: DRWE bit " - "check retry count %d\n", retry)); + cmn_err(CE_WARN, "mrsas_reset_ppc: DRWE bit " + "check retry count %d\n", retry); return (DDI_FAILURE); } } @@ -5669,11 +7105,14 @@ retry_reset: delay(100 * drv_usectohz(MILLISEC)); status = RD_OB_DRWE(instance); if (retry++ == 100) { + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "RESET FAILED. 
KILL adapter called\n."); + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } } - con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: Adapter reset complete")); + con_log(CL_ANN, (CE_NOTE, "mrsas_reset_ppc: Adapter reset complete")); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mfi_state_transition_to_ready")); @@ -5700,15 +7139,18 @@ retry_reset: instance->fw_fault_count_after_ocr++; if (instance->fw_fault_count_after_ocr < MAX_FW_RESET_COUNT) { - con_log(CL_ANN1, (CE_WARN, "mrsas_reset_ppc: " - "FW is in fault after OCR count %d ", - instance->fw_fault_count_after_ocr)); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "FW is in fault after OCR count %d " + "Retry Reset", + instance->fw_fault_count_after_ocr); goto retry_reset; } else { - con_log(CL_ANN1, (CE_WARN, "mrsas_reset_ppc: " - "Max Reset Count exceeded " - "Mark HBA as bad")); + cmn_err(CE_WARN, "mrsas_reset_ppc: " + "Max Reset Count exceeded >%d" + "Mark HBA as bad, KILL adapter", + MAX_FW_RESET_COUNT); + (void) mrsas_kill_adapter(instance); return (DDI_FAILURE); } @@ -5734,37 +7176,52 @@ retry_reset: (void) mrsas_issue_init_mfi(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "mrsas_issue_init_mfi Done")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mrsas_print_pending_cmd\n")); (void) mrsas_print_pending_cmds(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "mrsas_print_pending_cmd done\n")); + instance->func_ptr->enable_intr(instance); instance->fw_outstanding = 0; + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling mrsas_issue_pending_cmds")); (void) mrsas_issue_pending_cmds(instance); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " - "Complete")); + "issue_pending_cmds done.\n")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "Calling aen registration")); + + + instance->aen_cmd->retry_count_for_ocr = 0; + instance->aen_cmd->drv_pkt_time = 0; + instance->func_ptr->issue_cmd(instance->aen_cmd, instance); con_log(CL_ANN1, (CE_NOTE, "Unsetting adpresetinprogress flag.\n")); + mutex_enter(&instance->ocr_flags_mtx); instance->adapterresetinprogress = 0; mutex_exit(&instance->ocr_flags_mtx); con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc: " "adpterresetinprogress flag unset")); + con_log(CL_ANN1, (CE_NOTE, "mrsas_reset_ppc done\n")); return (DDI_SUCCESS); } -static int -mrsas_common_check(struct mrsas_instance *instance, - struct mrsas_cmd *cmd) + +/* + * FMA functions. 
+ */ +int +mrsas_common_check(struct mrsas_instance *instance, struct mrsas_cmd *cmd) { int ret = DDI_SUCCESS; - if (mrsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + if (cmd != NULL && + mrsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); if (cmd->pkt != NULL) { @@ -5776,7 +7233,7 @@ mrsas_common_check(struct mrsas_instance *instance, if (mrsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5785,7 +7242,7 @@ mrsas_common_check(struct mrsas_instance *instance, if (mrsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != DDI_SUCCESS) { ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5796,7 +7253,7 @@ mrsas_common_check(struct mrsas_instance *instance, ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); - if (cmd->pkt != NULL) { + if (cmd != NULL && cmd->pkt != NULL) { cmd->pkt->pkt_reason = CMD_TRAN_ERR; cmd->pkt->pkt_statistics = 0; } @@ -5940,7 +7397,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) int avail, actual, count; int i, flag, ret; - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: intr_type = %x", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: intr_type = %x", intr_type)); /* Get number of interrupts */ @@ -5952,7 +7409,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_FAILURE); } - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: count = %d ", count)); + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: count = %d ", count)); /* Get number of available interrupts */ ret = ddi_intr_get_navail(dip, intr_type, &avail); @@ -5962,7 +7419,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_FAILURE); } - con_log(CL_DLEVEL1, (CE_WARN, "mrsas_add_intrs: avail = %d ", avail)); + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: avail = %d ", avail)); /* Only one interrupt routine. So limit the count to 1 */ if (count > 1) { @@ -5973,12 +7430,19 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) * Allocate an array of interrupt handlers. Currently we support * only one interrupt. The framework can be extended later. */ - instance->intr_size = count * sizeof (ddi_intr_handle_t); - instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); - ASSERT(instance->intr_htable); + instance->intr_htable_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_htable_size, + KM_SLEEP); + if (instance->intr_htable == NULL) { + con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " + "failed to allocate memory for intr-handle table")); + instance->intr_htable_size = 0; + return (DDI_FAILURE); + } - flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == - DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + flag = ((intr_type == DDI_INTR_TYPE_MSI) || + (intr_type == DDI_INTR_TYPE_MSIX)) ? 
+ DDI_INTR_ALLOC_STRICT : DDI_INTR_ALLOC_NORMAL; /* Allocate interrupt */ ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, @@ -5987,9 +7451,9 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if ((ret != DDI_SUCCESS) || (actual == 0)) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "avail = %d", avail)); - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_htable; } + if (actual < count) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "Requested = %d Received = %d", count, actual)); @@ -6003,12 +7467,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) &instance->intr_pri)) != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "get priority call failed")); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } /* @@ -6017,12 +7476,7 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs: " "High level interrupts not supported.")); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_add_intrs: intr_pri = 0x%x ", @@ -6037,31 +7491,18 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) if (ret != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "mrsas_add_intrs:" "failed %d", ret)); - - for (i = 0; i < actual; i++) { - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handles; } } - con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + con_log(CL_DLEVEL1, (CE_NOTE, " ddi_intr_add_handler done")); if ((ret = ddi_intr_get_cap(instance->intr_htable[0], &instance->intr_cap)) != DDI_SUCCESS) { con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", ret)); - - /* Free already allocated intr */ - for (i = 0; i < actual; i++) { - (void) ddi_intr_remove_handler( - instance->intr_htable[i]); - (void) ddi_intr_free(instance->intr_htable[i]); - } - kmem_free(instance->intr_htable, instance->intr_size); - return (DDI_FAILURE); + goto mrsas_free_handlers; } if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { @@ -6081,6 +7522,23 @@ mrsas_add_intrs(struct mrsas_instance *instance, int intr_type) return (DDI_SUCCESS); +mrsas_free_handlers: + for (i = 0; i < actual; i++) + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + +mrsas_free_handles: + for (i = 0; i < actual; i++) + (void) ddi_intr_free(instance->intr_htable[i]); + +mrsas_free_htable: + if (instance->intr_htable != NULL) + kmem_free(instance->intr_htable, instance->intr_htable_size); + + instance->intr_htable = NULL; + instance->intr_htable_size = 0; + + return (DDI_FAILURE); + } @@ -6108,7 +7566,12 @@ mrsas_rem_intrs(struct mrsas_instance *instance) (void) ddi_intr_free(instance->intr_htable[i]); } - kmem_free(instance->intr_htable, instance->intr_size); + if (instance->intr_htable != NULL) + kmem_free(instance->intr_htable, instance->intr_htable_size); + + instance->intr_htable = NULL; + instance->intr_htable_size = 0; + } static int @@ -6117,7 +7580,7 @@ mrsas_tran_bus_config(dev_info_t *parent, uint_t flags, { struct mrsas_instance *instance; int config; - int rval; + int rval = NDI_SUCCESS; 
char *ptr = NULL; int tgt, lun; @@ -6148,6 +7611,11 @@ mrsas_tran_bus_config(dev_info_t *parent, uint_t flags, if (lun == 0) { rval = mrsas_config_ld(instance, tgt, lun, childp); +#ifdef PDSUPPORT + } else if (instance->tbolt == 1 && lun != 0) { + rval = mrsas_tbolt_config_pd(instance, + tgt, lun, childp); +#endif } else { rval = NDI_FAILURE; } @@ -6185,6 +7653,15 @@ mrsas_config_all_devices(struct mrsas_instance *instance) } +#ifdef PDSUPPORT + /* Config PD devices connected to the card */ + if (instance->tbolt) { + for (tgt = 0; tgt < instance->mr_tbolt_pd_max; tgt++) { + (void) mrsas_tbolt_config_pd(instance, tgt, 1, NULL); + } + } +#endif + rval = NDI_SUCCESS; return (rval); } @@ -6241,20 +7718,30 @@ mrsas_config_ld(struct mrsas_instance *instance, uint16_t tgt, dev_info_t *child; int rval; - con_log(CL_ANN1, (CE_NOTE, "mrsas_config_ld: t = %d l = %d", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_config_ld: t = %d l = %d", tgt, lun)); if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { if (ldip) { *ldip = child; } - con_log(CL_ANN1, (CE_NOTE, - "mrsas_config_ld: Child = %p found t = %d l = %d", - (void *)child, tgt, lun)); + if (instance->mr_ld_list[tgt].flag != MRDRV_TGT_VALID) { + rval = mrsas_service_evt(instance, tgt, 0, + MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas: DELETING STALE ENTRY rval = %d " + "tgt id = %d ", rval, tgt)); + return (NDI_FAILURE); + } return (NDI_SUCCESS); } sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + if (sd == NULL) { + con_log(CL_ANN1, (CE_WARN, "mrsas_config_ld: " + "failed to allocate mem for scsi_device")); + return (NDI_FAILURE); + } sd->sd_address.a_hba_tran = instance->tran; sd->sd_address.a_target = (uint16_t)tgt; sd->sd_address.a_lun = (uint8_t)lun; @@ -6271,12 +7758,12 @@ mrsas_config_ld(struct mrsas_instance *instance, uint16_t tgt, } kmem_free(sd, sizeof (struct scsi_device)); - con_log(CL_ANN1, (CE_NOTE, "mrsas_config_ld: return rval = %d", + con_log(CL_DLEVEL1, (CE_NOTE, "mrsas_config_ld: return rval = %d", rval)); return (rval); } -static int +int mrsas_config_scsi_device(struct mrsas_instance *instance, struct scsi_device *sd, dev_info_t **dipp) { @@ -6290,7 +7777,7 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; int rval; - con_log(CL_ANN1, (CE_WARN, "mr_sas: scsi_device t%dL%d", tgt, lun)); + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: scsi_device t%dL%d", tgt, lun)); scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, NULL, &nodename, &compatible, &ncompatible); @@ -6302,12 +7789,12 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, } childname = (dtype == DTYPE_DIRECT) ? 
"sd" : nodename; - con_log(CL_ANN1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: Childname = %2s nodename = %s", childname, nodename)); /* Create a dev node */ rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); - con_log(CL_ANN1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); if (rval == NDI_SUCCESS) { if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != @@ -6341,7 +7828,7 @@ mrsas_config_scsi_device(struct mrsas_instance *instance, ndi_prop_remove_all(ldip); (void) ndi_devi_free(ldip); } else { - con_log(CL_ANN1, (CE_WARN, "mr_sas: online Done :" + con_log(CL_ANN1, (CE_CONT, "mr_sas: online Done :" "0 t%dl%d", tgt, lun)); } @@ -6351,7 +7838,7 @@ finish: *dipp = ldip; } - con_log(CL_DLEVEL1, (CE_WARN, + con_log(CL_DLEVEL1, (CE_NOTE, "mr_sas: config_scsi_device rval = %d t%dL%d", rval, tgt, lun)); scsi_hba_nodename_compatible_free(nodename, compatible); @@ -6359,7 +7846,7 @@ finish: } /*ARGSUSED*/ -static int +int mrsas_service_evt(struct mrsas_instance *instance, int tgt, int lun, int event, uint64_t wwn) { @@ -6378,6 +7865,7 @@ mrsas_service_evt(struct mrsas_instance *instance, int tgt, int lun, int event, mrevt->tgt = tgt; mrevt->lun = lun; mrevt->event = event; + mrevt->wwn = wwn; if ((ddi_taskq_dispatch(instance->taskq, (void (*)(void *))mrsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != @@ -6405,11 +7893,18 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) mrevt->tgt, mrevt->lun, mrevt->event)); if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + mutex_enter(&instance->config_dev_mtx); dip = instance->mr_ld_list[mrevt->tgt].dip; + mutex_exit(&instance->config_dev_mtx); +#ifdef PDSUPPORT } else { - return; + mutex_enter(&instance->config_dev_mtx); + dip = instance->mr_tbolt_pd_list[mrevt->tgt].dip; + mutex_exit(&instance->config_dev_mtx); +#endif } + ndi_devi_enter(instance->dip, &circ1); switch (mrevt->event) { case MRSAS_EVT_CONFIG_TGT: @@ -6418,6 +7913,12 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) if (mrevt->lun == 0) { (void) mrsas_config_ld(instance, mrevt->tgt, 0, NULL); +#ifdef PDSUPPORT + } else if (instance->tbolt) { + (void) mrsas_tbolt_config_pd(instance, + mrevt->tgt, + 1, NULL); +#endif } con_log(CL_ANN1, (CE_NOTE, "mr_sas: EVT_CONFIG_TGT called:" @@ -6461,11 +7962,12 @@ mrsas_issue_evt_taskq(struct mrsas_eventinfo *mrevt) ndi_devi_exit(instance->dip, circ1); } -static int + +int mrsas_mode_sense_build(struct scsi_pkt *pkt) { union scsi_cdb *cdbp; - uint16_t page_code; + uint16_t page_code; struct scsa_cmd *acmd; struct buf *bp; struct mode_header *modehdrp; diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index 73bc8253d7..73cb981b48 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -1,13 +1,18 @@ # -# Copyright (c) 2008-2009, LSI Logic Corporation. +# Copyright (c) 2008-2012, LSI Logic Corporation. # All rights reserved. # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# # # mr_sas.conf for sol 10 (and later) for all supported architectures # # global definitions +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. 
To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.h b/usr/src/uts/common/io/mr_sas/mr_sas.h index e56bb68d15..3e297baaed 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.h +++ b/usr/src/uts/common/io/mr_sas/mr_sas.h @@ -2,9 +2,17 @@ * mr_sas.h: header for mr_sas * * Solaris MegaRAID driver for SAS2.0 controllers - * Copyright (c) 2008-2009, LSI Logic Corporation. + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * @@ -36,6 +44,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ + #ifndef _MR_SAS_H_ #define _MR_SAS_H_ @@ -45,12 +54,13 @@ extern "C" { #include <sys/scsi/scsi.h> #include "mr_sas_list.h" +#include "ld_pd_map.h" /* * MegaRAID SAS2.0 Driver meta data */ -#define MRSAS_VERSION "LSIv2.7" -#define MRSAS_RELDATE "Apr 21, 2010" +#define MRSAS_VERSION "6.503.00.00ILLUMOS" +#define MRSAS_RELDATE "July 30, 2012" #define MRSAS_TRUE 1 #define MRSAS_FALSE 0 @@ -58,16 +68,32 @@ extern "C" { #define ADAPTER_RESET_NOT_REQUIRED 0 #define ADAPTER_RESET_REQUIRED 1 +#define PDSUPPORT 1 + +#define SWAP_BYTES(w) ((((w)>>8)&0xFF) | (((w)&0xFF)<<8)) +#define BIG_ENDIAN(d) (SWAP_BYTES((d) >> 16) | (SWAP_BYTES(d) << 16)) /* * MegaRAID SAS2.0 device id conversion definitions. */ #define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) +#define MRSAS_GET_BOUNDARY_ALIGNED_LEN(len, new_len, boundary_len) { \ + int rem; \ + rem = (len / boundary_len); \ + if ((rem * boundary_len) != len) { \ + new_len = len + ((rem + 1) * boundary_len - len); \ + } else { \ + new_len = len; \ + } \ +} + /* * MegaRAID SAS2.0 supported controllers */ #define PCI_DEVICE_ID_LSI_2108VDE 0x0078 #define PCI_DEVICE_ID_LSI_2108V 0x0079 +#define PCI_DEVICE_ID_LSI_TBOLT 0x005b +#define PCI_DEVICE_ID_LSI_INVADER 0x005d /* * Register Index for 2108 Controllers. 
@@ -75,6 +101,7 @@ extern "C" { #define REGISTER_SET_IO_2108 (2) #define MRSAS_MAX_SGE_CNT 0x50 +#define MRSAS_APP_RESERVED_CMDS 32 #define MRSAS_IOCTL_DRIVER 0x12341234 #define MRSAS_IOCTL_FIRMWARE 0x12345678 @@ -82,13 +109,50 @@ extern "C" { #define MRSAS_1_SECOND 1000000 +#ifdef PDSUPPORT + +#define UNCONFIGURED_GOOD 0x0 +#define PD_SYSTEM 0x40 +#define MR_EVT_PD_STATE_CHANGE 0x0072 +#define MR_EVT_PD_REMOVED_EXT 0x00f8 +#define MR_EVT_PD_INSERTED_EXT 0x00f7 +#define MR_DCMD_PD_GET_INFO 0x02020000 +#define MRSAS_TBOLT_PD_LUN 1 +#define MRSAS_TBOLT_PD_TGT_MAX 255 +#define MRSAS_TBOLT_GET_PD_MAX(s) ((s)->mr_tbolt_pd_max) + +#endif + +/* Raid Context Flags */ +#define MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_SHIFT 0x4 +#define MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_MASK 0x30 +typedef enum MR_RAID_FLAGS_IO_SUB_TYPE { + MR_RAID_FLAGS_IO_SUB_TYPE_NONE = 0, + MR_RAID_FLAGS_IO_SUB_TYPE_SYSTEM_PD = 1 +} MR_RAID_FLAGS_IO_SUB_TYPE; + /* Dynamic Enumeration Flags */ -#define MRSAS_PD_LUN 1 #define MRSAS_LD_LUN 0 -#define MRSAS_PD_TGT_MAX 255 -#define MRSAS_GET_PD_MAX(s) ((s)->mr_pd_max) #define WWN_STRLEN 17 -#define APP_RESERVE_CMDS 32 +#define LD_SYNC_BIT 1 +#define LD_SYNC_SHIFT 14 +/* ThunderBolt (TB) specific */ +#define MRSAS_THUNDERBOLT_MSG_SIZE 256 +#define MRSAS_THUNDERBOLT_MAX_COMMANDS 1024 +#define MRSAS_THUNDERBOLT_MAX_REPLY_COUNT 1024 +#define MRSAS_THUNDERBOLT_REPLY_SIZE 8 +#define MRSAS_THUNDERBOLT_MAX_CHAIN_COUNT 1 + +#define MPI2_FUNCTION_PASSTHRU_IO_REQUEST 0xF0 +#define MPI2_FUNCTION_LD_IO_REQUEST 0xF1 + +#define MR_EVT_LD_FAST_PATH_IO_STATUS_CHANGED (0xFFFF) + +#define MR_INTERNAL_MFI_FRAMES_SMID 1 +#define MR_CTRL_EVENT_WAIT_SMID 2 +#define MR_INTERNAL_DRIVER_RESET_SMID 3 + + /* * ===================================== * MegaRAID SAS2.0 MFI firmware definitions @@ -103,19 +167,18 @@ extern "C" { /* * FW posts its state in upper 4 bits of outbound_msg_0 register */ -#define MFI_STATE_SHIFT 28 -#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) -#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) -#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) -#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) -#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) -#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) -#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) -#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) -#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) -#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) -#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) -#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_MASK 0xF0000000 +#define MFI_STATE_UNDEFINED 0x00000000 +#define MFI_STATE_BB_INIT 0x10000000 +#define MFI_STATE_FW_INIT 0x40000000 +#define MFI_STATE_WAIT_HANDSHAKE 0x60000000 +#define MFI_STATE_FW_INIT_2 0x70000000 +#define MFI_STATE_DEVICE_SCAN 0x80000000 +#define MFI_STATE_BOOT_MESSAGE_PENDING 0x90000000 +#define MFI_STATE_FLUSH_CACHE 0xA0000000 +#define MFI_STATE_READY 0xB0000000 +#define MFI_STATE_OPERATIONAL 0xC0000000 +#define MFI_STATE_FAULT 0xF0000000 #define MRMFI_FRAME_SIZE 64 @@ -148,7 +211,7 @@ extern "C" { #define MFI_FRAME_DIR_WRITE 0x0008 #define MFI_FRAME_DIR_READ 0x0010 #define MFI_FRAME_DIR_BOTH 0x0018 -#define MFI_FRAME_IEEE 0x0020 +#define MFI_FRAME_IEEE 0x0020 /* * Definition for cmd_status @@ -182,12 +245,12 @@ extern "C" { #define MR_DCMD_CTRL_EVENT_GET 0x01040300 #define MR_DCMD_CTRL_EVENT_WAIT 0x01040500 #define 
MR_DCMD_LD_GET_PROPERTIES 0x03030000 -#define MR_DCMD_PD_GET_INFO 0x02020000 /* * Solaris Specific MAX values */ #define MAX_SGL 24 + /* * MFI command completion codes */ @@ -244,7 +307,6 @@ enum MFI_STAT { MFI_STAT_TIME_NOT_SET = 0x31, MFI_STAT_WRONG_STATE = 0x32, MFI_STAT_LD_OFFLINE = 0x33, - /* UNUSED: 0x34 to 0xfe */ MFI_STAT_INVALID_STATUS = 0xFF }; @@ -270,11 +332,34 @@ enum MR_EVT_LOCALE { MR_EVT_LOCALE_ALL = 0xffff }; +enum MR_EVT_ARGS { + MR_EVT_ARGS_NONE, + MR_EVT_ARGS_CDB_SENSE, + MR_EVT_ARGS_LD, + MR_EVT_ARGS_LD_COUNT, + MR_EVT_ARGS_LD_LBA, + MR_EVT_ARGS_LD_OWNER, + MR_EVT_ARGS_LD_LBA_PD_LBA, + MR_EVT_ARGS_LD_PROG, + MR_EVT_ARGS_LD_STATE, + MR_EVT_ARGS_LD_STRIP, + MR_EVT_ARGS_PD, + MR_EVT_ARGS_PD_ERR, + MR_EVT_ARGS_PD_LBA, + MR_EVT_ARGS_PD_LBA_LD, + MR_EVT_ARGS_PD_PROG, + MR_EVT_ARGS_PD_STATE, + MR_EVT_ARGS_PCI, + MR_EVT_ARGS_RATE, + MR_EVT_ARGS_STR, + MR_EVT_ARGS_TIME, + MR_EVT_ARGS_ECC +}; + #define MR_EVT_CFG_CLEARED 0x0004 #define MR_EVT_LD_CREATED 0x008a #define MR_EVT_LD_DELETED 0x008b -#define MR_EVT_PD_REMOVED_EXT 0x00f8 -#define MR_EVT_PD_INSERTED_EXT 0x00f7 +#define MR_EVT_CFG_FP_CHANGE 0x017B enum LD_STATE { LD_OFFLINE = 0, @@ -302,6 +387,7 @@ enum MRSAS_EVT { * @param dma_handle : dma handle * @param dma_cookie : scatter-gather list * @param dma_attr : dma attributes for this buffer + * * Our DMA object. The caller must initialize the size and dma attributes * (dma_attr) fields before allocating the resources. */ @@ -321,23 +407,26 @@ struct mrsas_eventinfo { int tgt; int lun; int event; + uint64_t wwn; }; struct mrsas_ld { dev_info_t *dip; uint8_t lun_type; - uint8_t reserved[3]; + uint8_t flag; + uint8_t reserved[2]; }; -struct mrsas_pd { + +#ifdef PDSUPPORT +struct mrsas_tbolt_pd { dev_info_t *dip; uint8_t lun_type; uint8_t dev_id; - uint8_t flags; + uint8_t flag; uint8_t reserved; }; - -struct mrsas_pd_info { +struct mrsas_tbolt_pd_info { uint16_t deviceId; uint16_t seqNum; uint8_t inquiryData[96]; @@ -363,6 +452,7 @@ struct mrsas_pd_info { uint8_t reserved2[16]; } pathInfo; }; +#endif typedef struct mrsas_instance { uint32_t *producer; @@ -372,6 +462,12 @@ typedef struct mrsas_instance { dma_obj_t mfi_internal_dma_obj; uint16_t adapterresetinprogress; uint16_t deadadapter; + /* ThunderBolt (TB) specific */ + dma_obj_t mpi2_frame_pool_dma_obj; + dma_obj_t request_desc_dma_obj; + dma_obj_t reply_desc_dma_obj; + dma_obj_t ld_map_obj[2]; + uint8_t init_id; uint8_t flag_ieee; uint8_t disable_online_ctrl_reset; @@ -382,11 +478,17 @@ typedef struct mrsas_instance { uint32_t max_sectors_per_req; struct mrsas_cmd **cmd_list; + mlist_t cmd_pool_list; kmutex_t cmd_pool_mtx; + kmutex_t sync_map_mtx; mlist_t app_cmd_pool_list; kmutex_t app_cmd_pool_mtx; + mlist_t cmd_app_pool_list; + kmutex_t cmd_app_pool_mtx; + + mlist_t cmd_pend_list; kmutex_t cmd_pend_mtx; @@ -407,6 +509,9 @@ typedef struct mrsas_instance { kcondvar_t abort_cmd_cv; kmutex_t abort_cmd_mtx; + kmutex_t reg_write_mtx; + kmutex_t chip_mtx; + dev_info_t *dip; ddi_acc_handle_t pci_handle; @@ -420,6 +525,7 @@ typedef struct mrsas_instance { ddi_iblock_cookie_t soft_iblock_cookie; ddi_softintr_t soft_intr_id; uint8_t softint_running; + uint8_t tbolt_softint_running; kmutex_t completed_pool_mtx; mlist_t completed_pool_list; @@ -436,23 +542,99 @@ typedef struct mrsas_instance { char iocnode[16]; int fm_capabilities; + /* + * Driver resources unroll flags. The flag is set for resources that + * are needed to be free'd at detach() time. + */ + struct _unroll { + uint8_t softs; /* The software state was allocated. 
*/ + uint8_t regs; /* Controller registers mapped. */ + uint8_t intr; /* Interrupt handler added. */ + uint8_t reqs; /* Request structs allocated. */ + uint8_t mutexs; /* Mutex's allocated. */ + uint8_t taskq; /* Task q's created. */ + uint8_t tran; /* Tran struct allocated */ + uint8_t tranSetup; /* Tran attached to the ddi. */ + uint8_t devctl; /* Device nodes for cfgadm created. */ + uint8_t scsictl; /* Device nodes for cfgadm created. */ + uint8_t ioctl; /* Device nodes for ioctl's created. */ + uint8_t timer; /* Timer started. */ + uint8_t aenPend; /* AEN cmd pending f/w. */ + uint8_t mapUpdate_pend; /* LD MAP update cmd pending f/w. */ + uint8_t soft_isr; + uint8_t ldlist_buff; + uint8_t pdlist_buff; + uint8_t syncCmd; + uint8_t verBuff; + uint8_t alloc_space_mfi; + uint8_t alloc_space_mpi2; + } unroll; + + + /* function template pointer */ + struct mrsas_function_template *func_ptr; + - struct mrsas_func_ptr *func_ptr; /* MSI interrupts specific */ - ddi_intr_handle_t *intr_htable; + ddi_intr_handle_t *intr_htable; /* Interrupt handle array */ + size_t intr_htable_size; /* Int. handle array size */ int intr_type; int intr_cnt; - size_t intr_size; uint_t intr_pri; int intr_cap; ddi_taskq_t *taskq; struct mrsas_ld *mr_ld_list; + kmutex_t config_dev_mtx; + /* ThunderBolt (TB) specific */ + ddi_softintr_t tbolt_soft_intr_id; + +#ifdef PDSUPPORT + uint32_t mr_tbolt_pd_max; + struct mrsas_tbolt_pd *mr_tbolt_pd_list; +#endif + + uint8_t fast_path_io; + + uint16_t tbolt; + uint16_t reply_read_index; + uint16_t reply_size; /* Single Reply struct size */ + uint16_t raid_io_msg_size; /* Single message size */ + uint32_t io_request_frames_phy; + uint8_t *io_request_frames; + /* Virtual address of request desc frame pool */ + MRSAS_REQUEST_DESCRIPTOR_UNION *request_message_pool; + /* Physical address of request desc frame pool */ + uint32_t request_message_pool_phy; + /* Virtual address of reply Frame */ + MPI2_REPLY_DESCRIPTORS_UNION *reply_frame_pool; + /* Physical address of reply Frame */ + uint32_t reply_frame_pool_phy; + uint8_t *reply_pool_limit; /* Last reply frame address */ + /* Physical address of Last reply frame */ + uint32_t reply_pool_limit_phy; + uint32_t reply_q_depth; /* Reply Queue Depth */ + uint8_t max_sge_in_main_msg; + uint8_t max_sge_in_chain; + uint8_t chain_offset_io_req; + uint8_t chain_offset_mpt_msg; + MR_FW_RAID_MAP_ALL *ld_map[2]; + uint32_t ld_map_phy[2]; + uint32_t size_map_info; + uint64_t map_id; + LD_LOAD_BALANCE_INFO load_balance_info[MAX_LOGICAL_DRIVES]; + struct mrsas_cmd *map_update_cmd; + uint32_t SyncRequired; kmutex_t ocr_flags_mtx; + dma_obj_t drv_ver_dma_obj; } mrsas_t; -struct mrsas_func_ptr { - int (*read_fw_status_reg)(struct mrsas_instance *); + +/* + * Function templates for various controller specific functions + */ +struct mrsas_function_template { + uint32_t (*read_fw_status_reg)(struct mrsas_instance *); void (*issue_cmd)(struct mrsas_cmd *, struct mrsas_instance *); int (*issue_cmd_in_sync_mode)(struct mrsas_instance *, struct mrsas_cmd *); @@ -461,6 +643,8 @@ struct mrsas_func_ptr { void (*enable_intr)(struct mrsas_instance *); void (*disable_intr)(struct mrsas_instance *); int (*intr_ack)(struct mrsas_instance *); + int (*init_adapter)(struct mrsas_instance *); +/* int (*reset_adapter)(struct mrsas_instance *); */ }; /* @@ -480,13 +664,11 @@ struct mrsas_func_ptr { * console messages debug levels */ #define CL_NONE 0 /* No debug information */ -#define CL_TEST_OCR 1 -#define CL_ANN 2 /* print unconditionally, announcements */ -#define 
CL_ANN1 3 /* No o/p */ -#define CL_DLEVEL1 4 /* debug level 1, informative */ -#define CL_DLEVEL2 5 /* debug level 2, verbose */ -#define CL_DLEVEL3 6 /* debug level 3, very verbose */ - +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ #ifdef __SUNPRO_C #define __func__ "" @@ -547,9 +729,9 @@ struct mrsas_func_ptr { #define HIGH_LEVEL_INTR 1 #define NORMAL_LEVEL_INTR 0 +#define IO_TIMEOUT_VAL 0 #define IO_RETRY_COUNT 3 #define MAX_FW_RESET_COUNT 3 - /* * scsa_cmd - Per-command mr private data * @param cmd_dmahandle : dma handle @@ -598,13 +780,20 @@ struct scsa_cmd { struct mrsas_cmd { + /* + * ThunderBolt(TB) We would be needing to have a placeholder + * for RAID_MSG_IO_REQUEST inside this structure. We are + * supposed to embed the mr_frame inside the RAID_MSG and post + * it down to the firmware. + */ union mrsas_frame *frame; uint32_t frame_phys_addr; uint8_t *sense; + uint8_t *sense1; uint32_t sense_phys_addr; + uint32_t sense_phys_addr1; dma_obj_t frame_dma_obj; uint8_t frame_dma_obj_status; - uint32_t index; uint8_t sync_cmd; uint8_t cmd_status; @@ -613,8 +802,16 @@ struct mrsas_cmd { uint32_t frame_count; struct scsa_cmd *cmd; struct scsi_pkt *pkt; + Mpi2RaidSCSIIORequest_t *scsi_io_request; + Mpi2SGEIOUnion_t *sgl; + uint32_t sgl_phys_addr; + uint32_t scsi_io_request_phys_addr; + MRSAS_REQUEST_DESCRIPTOR_UNION *request_desc; + uint16_t SMID; uint16_t retry_count_for_ocr; uint16_t drv_pkt_time; + uint16_t load_balance_flag; + }; #define MAX_MGMT_ADAPTERS 1024 @@ -637,8 +834,8 @@ struct mrsas_mgmt_info { int max_index; }; -#pragma pack(1) +#pragma pack(1) /* * SAS controller properties */ @@ -662,6 +859,7 @@ struct mrsas_ctrl_prop { uint8_t cluster_enable; uint8_t coercion_mode; uint8_t alarm_enable; + uint8_t reserved_1[13]; uint32_t on_off_properties; uint8_t reserved_4[28]; @@ -867,12 +1065,15 @@ struct mrsas_ctrl_info { #define MRSAS_IOCTL_CMD 0 +#define MRDRV_TGT_VALID 1 + /* * FW can accept both 32 and 64 bit SGLs. 
We want to allocate 32/64 bit * SGLs based on the size of dma_addr_t */ #define IS_DMA64 (sizeof (dma_addr_t) == 8) +#define RESERVED0_REGISTER 0x00 /* XScale */ #define IB_MSG_0_OFF 0x10 /* XScale */ #define OB_MSG_0_OFF 0x18 /* XScale */ #define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ @@ -883,13 +1084,18 @@ struct mrsas_ctrl_info { #define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ #define OB_INTR_MASK 0xFFFFFFFF #define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF -#define WRITE_SEQ_OFF 0x000000FC -#define HOST_DIAG_OFF 0x000000F8 -#define DIAG_RESET_ADAPTER 0x00000004 -#define DIAG_WRITE_ENABLE 0x00000080 -/* - * All MFI register set macros accept mrsas_register_set* - */ +#define SYSTOIOP_INTERRUPT_MASK 0x80000000 +#define OB_SCRATCH_PAD_2_OFF 0xB4 +#define WRITE_TBOLT_SEQ_OFF 0x00000004 +#define DIAG_TBOLT_RESET_ADAPTER 0x00000004 +#define HOST_TBOLT_DIAG_OFF 0x00000008 +#define RESET_TBOLT_STATUS_OFF 0x000003C3 +#define WRITE_SEQ_OFF 0x000000FC +#define HOST_DIAG_OFF 0x000000F8 +#define DIAG_RESET_ADAPTER 0x00000004 +#define DIAG_WRITE_ENABLE 0x00000080 +#define SYSTOIOP_INTERRUPT_MASK 0x80000000 + #define WR_IB_WRITE_SEQ(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + WRITE_SEQ_OFF), (v)) @@ -899,6 +1105,13 @@ struct mrsas_ctrl_info { #define WR_IB_DRWE(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + HOST_DIAG_OFF), (v)) +#define IB_LOW_QPORT 0xC0 +#define IB_HIGH_QPORT 0xC4 +#define OB_DOORBELL_REGISTER 0x9C /* 1078 implementation */ + +/* + * All MFI register set macros accept mrsas_register_set* + */ #define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) @@ -933,6 +1146,56 @@ struct mrsas_ctrl_info { #define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) +/* Thunderbolt specific registers */ +#define RD_OB_SCRATCH_PAD_2(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_2_OFF)) + +#define WR_TBOLT_IB_WRITE_SEQ(v, instance) \ + ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + WRITE_TBOLT_SEQ_OFF), (v)) + +#define RD_TBOLT_HOST_DIAG(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + HOST_TBOLT_DIAG_OFF)) + +#define WR_TBOLT_HOST_DIAG(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + HOST_TBOLT_DIAG_OFF), (v)) + +#define RD_TBOLT_RESET_STAT(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + RESET_TBOLT_STATUS_OFF)) + + +#define WR_MPI2_REPLY_POST_INDEX(v, instance)\ + ddi_put32((instance)->regmap_handle,\ + (uint32_t *)\ + ((uintptr_t)(instance)->regmap + MPI2_REPLY_POST_HOST_INDEX_OFFSET),\ + (v)) + + +#define RD_MPI2_REPLY_POST_INDEX(instance)\ + ddi_get32((instance)->regmap_handle,\ + (uint32_t *)\ + ((uintptr_t)(instance)->regmap + MPI2_REPLY_POST_HOST_INDEX_OFFSET)) + +#define WR_IB_LOW_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_LOW_QPORT), (v)) + +#define WR_IB_HIGH_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_HIGH_QPORT), (v)) + +#define WR_OB_DOORBELL_REGISTER_CLEAR(v, instance)\ + ddi_put32((instance)->regmap_handle,\ + (uint32_t *)((uintptr_t)(instance)->regmap + 
OB_DOORBELL_REGISTER), \ + (v)) + +#define WR_RESERVED0_REGISTER(v, instance) ddi_put32((instance)->regmap_handle,\ + (uint32_t *)((uintptr_t)(instance)->regmap + RESERVED0_REGISTER), \ + (v)) + +#define RD_RESERVED0_REGISTER(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + RESERVED0_REGISTER)) + + + /* * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs @@ -948,6 +1211,9 @@ struct mrsas_ctrl_info { #define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 #define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 +/* Fusion interrupt mask */ +#define MFI_FUSION_ENABLE_INTERRUPT_MASK (0x00000008) + #define MFI_POLL_TIMEOUT_SECS 60 #define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ @@ -973,45 +1239,45 @@ struct mrsas_ctrl_info { * on_off_property of mrsas_ctrl_prop * bit0-9, 11-31 are reserved */ -#define DISABLE_OCR_PROP_FLAG 0x00000400 /* bit 10 */ +#define DISABLE_OCR_PROP_FLAG 0x00000400 /* bit 10 */ struct mrsas_register_set { - uint32_t reserved_0[4]; + uint32_t reserved_0[4]; /* 0000h */ - uint32_t inbound_msg_0; - uint32_t inbound_msg_1; - uint32_t outbound_msg_0; - uint32_t outbound_msg_1; + uint32_t inbound_msg_0; /* 0010h */ + uint32_t inbound_msg_1; /* 0014h */ + uint32_t outbound_msg_0; /* 0018h */ + uint32_t outbound_msg_1; /* 001Ch */ - uint32_t inbound_doorbell; - uint32_t inbound_intr_status; - uint32_t inbound_intr_mask; + uint32_t inbound_doorbell; /* 0020h */ + uint32_t inbound_intr_status; /* 0024h */ + uint32_t inbound_intr_mask; /* 0028h */ - uint32_t outbound_doorbell; - uint32_t outbound_intr_status; - uint32_t outbound_intr_mask; + uint32_t outbound_doorbell; /* 002Ch */ + uint32_t outbound_intr_status; /* 0030h */ + uint32_t outbound_intr_mask; /* 0034h */ - uint32_t reserved_1[2]; + uint32_t reserved_1[2]; /* 0038h */ - uint32_t inbound_queue_port; - uint32_t outbound_queue_port; + uint32_t inbound_queue_port; /* 0040h */ + uint32_t outbound_queue_port; /* 0044h */ - uint32_t reserved_2[22]; + uint32_t reserved_2[22]; /* 0048h */ - uint32_t outbound_doorbell_clear; + uint32_t outbound_doorbell_clear; /* 00A0h */ - uint32_t reserved_3[3]; + uint32_t reserved_3[3]; /* 00A4h */ - uint32_t outbound_scratch_pad; + uint32_t outbound_scratch_pad; /* 00B0h */ - uint32_t reserved_4[3]; + uint32_t reserved_4[3]; /* 00B4h */ - uint32_t inbound_low_queue_port; + uint32_t inbound_low_queue_port; /* 00C0h */ - uint32_t inbound_high_queue_port; + uint32_t inbound_high_queue_port; /* 00C4h */ - uint32_t reserved_5; - uint32_t index_registers[820]; + uint32_t reserved_5; /* 00C8h */ + uint32_t index_registers[820]; /* 00CCh */ }; struct mrsas_sge32 { @@ -1037,24 +1303,24 @@ union mrsas_sgl { }; struct mrsas_header { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; - - uint8_t target_id; - uint8_t lun; - uint8_t cdb_len; - uint8_t sge_count; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t timeout; - uint32_t data_xferlen; + uint8_t cmd; /* 00h */ + uint8_t sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ + + uint8_t target_id; /* 04h */ + uint8_t lun; /* 05h */ + uint8_t cdb_len; /* 06h */ + uint8_t sge_count; /* 07h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t 
data_xferlen; /* 14h */ }; union mrsas_sgl_frame { @@ -1063,198 +1329,199 @@ union mrsas_sgl_frame { }; struct mrsas_init_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; - - uint8_t reserved_1; - uint32_t reserved_2; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t reserved_3; - uint32_t data_xfer_len; - - uint32_t queue_info_new_phys_addr_lo; - uint32_t queue_info_new_phys_addr_hi; - uint32_t queue_info_old_phys_addr_lo; - uint32_t queue_info_old_phys_addr_hi; - - uint32_t reserved_4[6]; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t queue_info_new_phys_addr_lo; /* 18h */ + uint32_t queue_info_new_phys_addr_hi; /* 1Ch */ + uint32_t queue_info_old_phys_addr_lo; /* 20h */ + uint32_t queue_info_old_phys_addr_hi; /* 24h */ + uint64_t driverversion; /* 28h */ + uint32_t reserved_4[4]; /* 30h */ }; struct mrsas_init_queue_info { - uint32_t init_flags; - uint32_t reply_queue_entries; - - uint32_t reply_queue_start_phys_addr_lo; - uint32_t reply_queue_start_phys_addr_hi; - uint32_t producer_index_phys_addr_lo; - uint32_t producer_index_phys_addr_hi; - uint32_t consumer_index_phys_addr_lo; - uint32_t consumer_index_phys_addr_hi; + uint32_t init_flags; /* 00h */ + uint32_t reply_queue_entries; /* 04h */ + + uint32_t reply_queue_start_phys_addr_lo; /* 08h */ + uint32_t reply_queue_start_phys_addr_hi; /* 0Ch */ + uint32_t producer_index_phys_addr_lo; /* 10h */ + uint32_t producer_index_phys_addr_hi; /* 14h */ + uint32_t consumer_index_phys_addr_lo; /* 18h */ + uint32_t consumer_index_phys_addr_hi; /* 1Ch */ }; struct mrsas_io_frame { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; + uint8_t cmd; /* 00h */ + uint8_t sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ - uint8_t target_id; - uint8_t access_byte; - uint8_t reserved_0; - uint8_t sge_count; + uint8_t target_id; /* 04h */ + uint8_t access_byte; /* 05h */ + uint8_t reserved_0; /* 06h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; - uint32_t lba_count; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t lba_count; /* 14h */ - uint32_t sense_buf_phys_addr_lo; - uint32_t sense_buf_phys_addr_hi; + uint32_t sense_buf_phys_addr_lo; /* 18h */ + uint32_t sense_buf_phys_addr_hi; /* 1Ch */ - uint32_t start_lba_lo; - uint32_t start_lba_hi; + uint32_t start_lba_lo; /* 20h */ + uint32_t start_lba_hi; /* 24h */ - union mrsas_sgl sgl; + union mrsas_sgl sgl; /* 28h */ }; struct mrsas_pthru_frame { - uint8_t cmd; - uint8_t sense_len; - uint8_t cmd_status; - uint8_t scsi_status; - - uint8_t target_id; - uint8_t lun; - uint8_t cdb_len; - uint8_t sge_count; - - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; - - uint16_t flags; - uint16_t timeout; - uint32_t data_xfer_len; - - uint32_t sense_buf_phys_addr_lo; - uint32_t sense_buf_phys_addr_hi; - - uint8_t cdb[16]; - union mrsas_sgl sgl; + uint8_t cmd; /* 00h */ + uint8_t 
sense_len; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t scsi_status; /* 03h */ + + uint8_t target_id; /* 04h */ + uint8_t lun; /* 05h */ + uint8_t cdb_len; /* 06h */ + uint8_t sge_count; /* 07h */ + + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ + + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ + uint32_t data_xfer_len; /* 14h */ + + uint32_t sense_buf_phys_addr_lo; /* 18h */ + uint32_t sense_buf_phys_addr_hi; /* 1Ch */ + + uint8_t cdb[16]; /* 20h */ + union mrsas_sgl sgl; /* 30h */ }; struct mrsas_dcmd_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; - uint8_t reserved_1[4]; - uint8_t sge_count; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t reserved_1[4]; /* 03h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; - uint32_t opcode; + uint32_t data_xfer_len; /* 14h */ + uint32_t opcode; /* 18h */ - union { + /* uint8_t mbox[DCMD_MBOX_SZ]; */ /* 1Ch */ + union { /* 1Ch */ uint8_t b[DCMD_MBOX_SZ]; uint16_t s[6]; uint32_t w[3]; } mbox; - union mrsas_sgl sgl; + union mrsas_sgl sgl; /* 28h */ }; struct mrsas_abort_frame { - uint8_t cmd; - uint8_t reserved_0; - uint8_t cmd_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_0; /* 01h */ + uint8_t cmd_status; /* 02h */ - uint8_t reserved_1; - uint32_t reserved_2; + uint8_t reserved_1; /* 03h */ + uint32_t reserved_2; /* 04h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t reserved_3; - uint32_t reserved_4; + uint16_t flags; /* 10h */ + uint16_t reserved_3; /* 12h */ + uint32_t reserved_4; /* 14h */ - uint32_t abort_context; - uint32_t pad_1; + uint32_t abort_context; /* 18h */ + uint32_t pad_1; /* 1Ch */ - uint32_t abort_mfi_phys_addr_lo; - uint32_t abort_mfi_phys_addr_hi; + uint32_t abort_mfi_phys_addr_lo; /* 20h */ + uint32_t abort_mfi_phys_addr_hi; /* 24h */ - uint32_t reserved_5[6]; + uint32_t reserved_5[6]; /* 28h */ }; struct mrsas_smp_frame { - uint8_t cmd; - uint8_t reserved_1; - uint8_t cmd_status; - uint8_t connection_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_1; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t connection_status; /* 03h */ - uint8_t reserved_2[3]; - uint8_t sge_count; + uint8_t reserved_2[3]; /* 04h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; + uint32_t data_xfer_len; /* 14h */ - uint64_t sas_addr; + uint64_t sas_addr; /* 20h */ - union mrsas_sgl sgl[2]; + union mrsas_sgl sgl[2]; /* 28h */ }; struct mrsas_stp_frame { - uint8_t cmd; - uint8_t reserved_1; - uint8_t cmd_status; - uint8_t connection_status; + uint8_t cmd; /* 00h */ + uint8_t reserved_1; /* 01h */ + uint8_t cmd_status; /* 02h */ + uint8_t connection_status; /* 03h */ - uint8_t target_id; - uint8_t reserved_2[2]; - uint8_t 
sge_count; + uint8_t target_id; /* 04h */ + uint8_t reserved_2[2]; /* 04h */ + uint8_t sge_count; /* 07h */ - uint32_t context; - uint8_t req_id; - uint8_t msgvector; - uint16_t pad_0; + uint32_t context; /* 08h */ + uint8_t req_id; /* 0Ch */ + uint8_t msgvector; /* 0Dh */ + uint16_t pad_0; /* 0Eh */ - uint16_t flags; - uint16_t timeout; + uint16_t flags; /* 10h */ + uint16_t timeout; /* 12h */ - uint32_t data_xfer_len; + uint32_t data_xfer_len; /* 14h */ - uint16_t fis[10]; - uint32_t stp_flags; - union mrsas_sgl sgl; + uint16_t fis[10]; /* 28h */ + uint32_t stp_flags; /* 3C */ + union mrsas_sgl sgl; /* 40 */ }; union mrsas_frame { @@ -1681,144 +1948,111 @@ struct mrsas_aen { uint32_t seq_num; uint32_t class_locale_word; }; + #pragma pack() #ifndef DDI_VENDOR_LSI #define DDI_VENDOR_LSI "LSI" #endif /* DDI_VENDOR_LSI */ -#ifndef KMDB_MODULE -static int mrsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); -static int mrsas_attach(dev_info_t *, ddi_attach_cmd_t); -#ifdef __sparc -static int mrsas_reset(dev_info_t *, ddi_reset_cmd_t); -#else /* __sparc */ -static int mrsas_quiesce(dev_info_t *); -#endif /* __sparc */ -static int mrsas_detach(dev_info_t *, ddi_detach_cmd_t); -static int mrsas_open(dev_t *, int, int, cred_t *); -static int mrsas_close(dev_t, int, int, cred_t *); -static int mrsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); - -static int mrsas_tran_tgt_init(dev_info_t *, dev_info_t *, - scsi_hba_tran_t *, struct scsi_device *); -static struct scsi_pkt *mrsas_tran_init_pkt(struct scsi_address *, register +int mrsas_config_scsi_device(struct mrsas_instance *, + struct scsi_device *, dev_info_t **); + +#ifdef PDSUPPORT +int mrsas_tbolt_config_pd(struct mrsas_instance *, uint16_t, + uint8_t, dev_info_t **); +#endif + +dev_info_t *mrsas_find_child(struct mrsas_instance *, uint16_t, + uint8_t); +int mrsas_service_evt(struct mrsas_instance *, int, int, int, + uint64_t); +void return_raid_msg_pkt(struct mrsas_instance *, struct mrsas_cmd *); +struct mrsas_cmd *get_raid_msg_mfi_pkt(struct mrsas_instance *); +void return_raid_msg_mfi_pkt(struct mrsas_instance *, struct mrsas_cmd *); + +int alloc_space_for_mpi2(struct mrsas_instance *); +void fill_up_drv_ver(struct mrsas_drv_ver *dv); + +int mrsas_issue_init_mpi2(struct mrsas_instance *); +struct scsi_pkt *mrsas_tbolt_tran_init_pkt(struct scsi_address *, register struct scsi_pkt *, struct buf *, int, int, int, int, int (*)(), caddr_t); -static int mrsas_tran_start(struct scsi_address *, +int mrsas_tbolt_tran_start(struct scsi_address *, register struct scsi_pkt *); -static int mrsas_tran_abort(struct scsi_address *, struct scsi_pkt *); -static int mrsas_tran_reset(struct scsi_address *, int); -static int mrsas_tran_getcap(struct scsi_address *, char *, int); -static int mrsas_tran_setcap(struct scsi_address *, char *, int, int); -static void mrsas_tran_destroy_pkt(struct scsi_address *, - struct scsi_pkt *); -static void mrsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); -static void mrsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); -static uint_t mrsas_isr(); -static uint_t mrsas_softintr(); - -static int init_mfi(struct mrsas_instance *); -static int mrsas_free_dma_obj(struct mrsas_instance *, dma_obj_t); -static int mrsas_alloc_dma_obj(struct mrsas_instance *, dma_obj_t *, - uchar_t); -static struct mrsas_cmd *get_mfi_pkt(struct mrsas_instance *); -static void return_mfi_pkt(struct mrsas_instance *, +uint32_t tbolt_read_fw_status_reg(struct mrsas_instance *); +void tbolt_issue_cmd(struct mrsas_cmd 
*, struct mrsas_instance *); +int tbolt_issue_cmd_in_poll_mode(struct mrsas_instance *, struct mrsas_cmd *); - -static void free_space_for_mfi(struct mrsas_instance *); -static void free_additional_dma_buffer(struct mrsas_instance *); -static int alloc_additional_dma_buffer(struct mrsas_instance *); -static int read_fw_status_reg_ppc(struct mrsas_instance *); -static void issue_cmd_ppc(struct mrsas_cmd *, struct mrsas_instance *); -static int issue_cmd_in_poll_mode_ppc(struct mrsas_instance *, - struct mrsas_cmd *); -static int issue_cmd_in_sync_mode_ppc(struct mrsas_instance *, +int tbolt_issue_cmd_in_sync_mode(struct mrsas_instance *, struct mrsas_cmd *); -static void enable_intr_ppc(struct mrsas_instance *); -static void disable_intr_ppc(struct mrsas_instance *); -static int intr_ack_ppc(struct mrsas_instance *); -static int mfi_state_transition_to_ready(struct mrsas_instance *); -static void destroy_mfi_frame_pool(struct mrsas_instance *); -static int create_mfi_frame_pool(struct mrsas_instance *); -static int mrsas_dma_alloc(struct mrsas_instance *, struct scsi_pkt *, +void tbolt_enable_intr(struct mrsas_instance *); +void tbolt_disable_intr(struct mrsas_instance *); +int tbolt_intr_ack(struct mrsas_instance *); +uint_t mr_sas_tbolt_process_outstanding_cmd(struct mrsas_instance *); + uint_t tbolt_softintr(); +int mrsas_tbolt_dma(struct mrsas_instance *, uint32_t, int, int (*)()); +int mrsas_check_dma_handle(ddi_dma_handle_t handle); +int mrsas_check_acc_handle(ddi_acc_handle_t handle); +int mrsas_dma_alloc(struct mrsas_instance *, struct scsi_pkt *, struct buf *, int, int (*)()); -static int mrsas_dma_move(struct mrsas_instance *, +int mrsas_dma_move(struct mrsas_instance *, struct scsi_pkt *, struct buf *); -static void flush_cache(struct mrsas_instance *instance); -static void display_scsi_inquiry(caddr_t); -static int start_mfi_aen(struct mrsas_instance *instance); -static int handle_drv_ioctl(struct mrsas_instance *instance, - struct mrsas_ioctl *ioctl, int mode); -static int handle_mfi_ioctl(struct mrsas_instance *instance, - struct mrsas_ioctl *ioctl, int mode); -static int handle_mfi_aen(struct mrsas_instance *instance, - struct mrsas_aen *aen); -static void fill_up_drv_ver(struct mrsas_drv_ver *dv); -static struct mrsas_cmd *build_cmd(struct mrsas_instance *instance, - struct scsi_address *ap, struct scsi_pkt *pkt, - uchar_t *cmd_done); -#ifndef __sparc -static int wait_for_outstanding(struct mrsas_instance *instance); -#endif /* __sparc */ -static int register_mfi_aen(struct mrsas_instance *instance, - uint32_t seq_num, uint32_t class_locale_word); -static int issue_mfi_pthru(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_dcmd(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_smp(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int issue_mfi_stp(struct mrsas_instance *instance, struct - mrsas_ioctl *ioctl, struct mrsas_cmd *cmd, int mode); -static int abort_aen_cmd(struct mrsas_instance *instance, - struct mrsas_cmd *cmd_to_abort); - -static int mrsas_common_check(struct mrsas_instance *instance, - struct mrsas_cmd *cmd); -static void mrsas_fm_init(struct mrsas_instance *instance); -static void mrsas_fm_fini(struct mrsas_instance *instance); -static int mrsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, - const void *); -static void mrsas_fm_ereport(struct mrsas_instance *instance, - 
char *detail); -static int mrsas_check_dma_handle(ddi_dma_handle_t handle); -static int mrsas_check_acc_handle(ddi_acc_handle_t handle); - -static void mrsas_rem_intrs(struct mrsas_instance *instance); -static int mrsas_add_intrs(struct mrsas_instance *instance, int intr_type); - -static void mrsas_tran_tgt_free(dev_info_t *, dev_info_t *, - scsi_hba_tran_t *, struct scsi_device *); -static int mrsas_tran_bus_config(dev_info_t *, uint_t, - ddi_bus_config_op_t, void *, dev_info_t **); -static int mrsas_parse_devname(char *, int *, int *); -static int mrsas_config_all_devices(struct mrsas_instance *); -static int mrsas_config_scsi_device(struct mrsas_instance *, - struct scsi_device *, dev_info_t **); -static int mrsas_config_ld(struct mrsas_instance *, uint16_t, - uint8_t, dev_info_t **); -static dev_info_t *mrsas_find_child(struct mrsas_instance *, uint16_t, - uint8_t); -static int mrsas_name_node(dev_info_t *, char *, int); -static void mrsas_issue_evt_taskq(struct mrsas_eventinfo *); -static int mrsas_service_evt(struct mrsas_instance *, int, int, int, - uint64_t); -static int mrsas_mode_sense_build(struct scsi_pkt *); -static void push_pending_mfi_pkt(struct mrsas_instance *, +int mrsas_alloc_dma_obj(struct mrsas_instance *, dma_obj_t *, + uchar_t); +void mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *, struct mrsas_cmd *); +int mrsas_dma_alloc_dmd(struct mrsas_instance *, dma_obj_t *); +void tbolt_complete_cmd_in_sync_mode(struct mrsas_instance *, + struct mrsas_cmd *); +int alloc_req_rep_desc(struct mrsas_instance *); +int mrsas_mode_sense_build(struct scsi_pkt *); +void push_pending_mfi_pkt(struct mrsas_instance *, struct mrsas_cmd *); -static int mrsas_issue_init_mfi(struct mrsas_instance *); -static int mrsas_issue_pending_cmds(struct mrsas_instance *); -static int mrsas_print_pending_cmds(struct mrsas_instance *); -static int mrsas_complete_pending_cmds(struct mrsas_instance *); -static int mrsas_reset_ppc(struct mrsas_instance *); -static uint32_t mrsas_initiate_ocr_if_fw_is_faulty(struct mrsas_instance *); -static int mrsas_kill_adapter(struct mrsas_instance *); -static void io_timeout_checker(void *instance); -static void complete_cmd_in_sync_mode(struct mrsas_instance *, - struct mrsas_cmd *); - -#endif /* KMDB_MODULE */ +int mrsas_issue_pending_cmds(struct mrsas_instance *); +int mrsas_print_pending_cmds(struct mrsas_instance *); +int mrsas_complete_pending_cmds(struct mrsas_instance *); + +int create_mfi_frame_pool(struct mrsas_instance *); +void destroy_mfi_frame_pool(struct mrsas_instance *); +int create_mfi_mpi_frame_pool(struct mrsas_instance *); +void destroy_mfi_mpi_frame_pool(struct mrsas_instance *); +int create_mpi2_frame_pool(struct mrsas_instance *); +void destroy_mpi2_frame_pool(struct mrsas_instance *); +int mrsas_free_dma_obj(struct mrsas_instance *, dma_obj_t); +void mrsas_tbolt_free_additional_dma_buffer(struct mrsas_instance *); +void free_req_desc_pool(struct mrsas_instance *); +void free_space_for_mpi2(struct mrsas_instance *); +void mrsas_dump_reply_desc(struct mrsas_instance *); +void tbolt_complete_cmd(struct mrsas_instance *, struct mrsas_cmd *); +void display_scsi_inquiry(caddr_t); +void service_mfi_aen(struct mrsas_instance *, struct mrsas_cmd *); +int mrsas_mode_sense_build(struct scsi_pkt *); +int mrsas_tbolt_get_ld_map_info(struct mrsas_instance *); +struct mrsas_cmd *mrsas_tbolt_build_poll_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +int mrsas_tbolt_reset_ppc(struct mrsas_instance *instance); +void 
mrsas_tbolt_kill_adapter(struct mrsas_instance *instance); +int abort_syncmap_cmd(struct mrsas_instance *, struct mrsas_cmd *); +void mrsas_tbolt_prepare_cdb(struct mrsas_instance *instance, U8 cdb[], + struct IO_REQUEST_INFO *, Mpi2RaidSCSIIORequest_t *, U32); + + +int mrsas_init_adapter_ppc(struct mrsas_instance *instance); +int mrsas_init_adapter_tbolt(struct mrsas_instance *instance); +int mrsas_init_adapter(struct mrsas_instance *instance); + +int mrsas_alloc_cmd_pool(struct mrsas_instance *instance); +void mrsas_free_cmd_pool(struct mrsas_instance *instance); + +void mrsas_print_cmd_details(struct mrsas_instance *, struct mrsas_cmd *, int); +struct mrsas_cmd *get_raid_msg_pkt(struct mrsas_instance *); + +int mfi_state_transition_to_ready(struct mrsas_instance *); + + +/* FMA functions. */ +int mrsas_common_check(struct mrsas_instance *, struct mrsas_cmd *); +void mrsas_fm_ereport(struct mrsas_instance *, char *); #ifdef __cplusplus diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_list.c b/usr/src/uts/common/io/mr_sas/mr_sas_list.c new file mode 100644 index 0000000000..62ae374b76 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/mr_sas_list.c @@ -0,0 +1,134 @@ +/* + * mr_sas_list.h: header for mr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-20012, LSI Logic Corporation. + * All rights reserved. + */ + +/* Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ + +/* + * Extract C functions from LSI-provided mr_sas_list.h such that we can both + * be lint-clean and provide a slightly better source organizational model + * beyond preprocessor abuse. + */ + +#include "mr_sas_list.h" + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void +__list_add(struct mlist_head *new, struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +void +mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +void +mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void +__list_del(struct mlist_head *prev, struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + +#if 0 +/* + * mlist_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry + * is in an undefined state. + */ + +void +mlist_del(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = entry->prev = 0; +} +#endif + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +void +mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +int +mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +void +mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_list.h b/usr/src/uts/common/io/mr_sas/mr_sas_list.h index 0c177712e0..9bd9947038 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas_list.h +++ b/usr/src/uts/common/io/mr_sas/mr_sas_list.h @@ -2,40 +2,8 @@ * mr_sas_list.h: header for mr_sas * * Solaris MegaRAID driver for SAS2.0 controllers - * Copyright (c) 2008-2009, LSI Logic Corporation. + * Copyright (c) 2008-2012, LSI Logic Corporation. * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the author nor the names of its contributors may be - * used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. */ #ifndef _MR_SAS_LIST_H_ @@ -70,110 +38,22 @@ typedef struct mlist_head mlist_t; (ptr)->next = (ptr); (ptr)->prev = (ptr); \ } -#ifndef KMDB_MODULE -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static void __list_add(struct mlist_head *new, - struct mlist_head *prev, - struct mlist_head *next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - - -/* - * mlist_add - add a new entry - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. 
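The mlist_* routines above implement a Linux-style circular doubly-linked list that the driver uses for its command pools. The following standalone sketch, plain userland C with invented names (node_init, node_add_tail) rather than driver code, shows the core idea: an empty list is a head whose next and prev point at itself, so insertion and the empty test need no NULL checks.

#include <stdio.h>

struct node {
	struct node *next, *prev;
};

static void
node_init(struct node *h)
{
	h->next = h;	/* empty list: head points at itself */
	h->prev = h;
}

static void
node_add_tail(struct node *ent, struct node *head)
{
	/* splice ent in just before head, i.e. at the tail */
	ent->prev = head->prev;
	ent->next = head;
	head->prev->next = ent;
	head->prev = ent;
}

static int
node_empty(struct node *head)
{
	return (head->next == head);
}

int
main(void)
{
	struct node pool, a, b;

	node_init(&pool);
	printf("empty: %d\n", node_empty(&pool));	/* 1 */
	node_add_tail(&a, &pool);
	node_add_tail(&b, &pool);
	printf("empty: %d\n", node_empty(&pool));	/* 0 */
	return (0);
}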
- */ -static void mlist_add(struct mlist_head *new, struct mlist_head *head) -{ - __list_add(new, head, head->next); -} - - -/* - * mlist_add_tail - add a new entry - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - */ -static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) -{ - __list_add(new, head->prev, head); -} - +void mlist_add(struct mlist_head *, struct mlist_head *); +void mlist_add_tail(struct mlist_head *, struct mlist_head *); +#if 0 +void mlist_del(struct mlist_head *); +#endif +void mlist_del_init(struct mlist_head *); +int mlist_empty(struct mlist_head *); +void mlist_splice(struct mlist_head *, struct mlist_head *); + +/* TODO: set this */ +#if 0 +#pragma inline(list_add, list_add_tail, __list_del, list_del, + list_del_init, list_empty, list_splice) +#endif -/* - * Delete a list entry by making the prev/next entries - * point to each other. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static void __list_del(struct mlist_head *prev, - struct mlist_head *next) -{ - next->prev = prev; - prev->next = next; -} - - -/* - * mlist_del_init - deletes entry from list and reinitialize it. - * @entry: the element to delete from the list. - */ -static void mlist_del_init(struct mlist_head *entry) -{ - __list_del(entry->prev, entry->next); - INIT_LIST_HEAD(entry); -} - - -/* - * mlist_empty - tests whether a list is empty - * @head: the list to test. - */ -static int mlist_empty(struct mlist_head *head) -{ - return (head->next == head); -} - - -/* - * mlist_splice - join two lists - * @list: the new list to add. - * @head: the place to add it in the first list. - */ -static void mlist_splice(struct mlist_head *list, struct mlist_head *head) -{ - struct mlist_head *first = list->next; - - if (first != list) { - struct mlist_head *last = list->prev; - struct mlist_head *at = head->next; - - first->prev = head; - head->next = first; - - last->next = at; - at->prev = last; - } -} -#endif /* KMDB_MODULE */ /* * mlist_entry - get the struct for this entry diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c new file mode 100644 index 0000000000..e4c89c4cf6 --- /dev/null +++ b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c @@ -0,0 +1,3793 @@ +/* + * mr_sas_tbolt.c: source for mr_sas driver for New Generation. + * i.e. Thunderbolt and Invader + * + * Solaris MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2012, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Swaminathan K S + * Arun Chandrashekhar + * Manju R + * Rasheed + * Shakeel Bukhari + */ + + +#include <sys/types.h> +#include <sys/file.h> +#include <sys/atomic.h> +#include <sys/scsi/scsi.h> +#include <sys/byteorder.h> +#include "ld_pd_map.h" +#include "mr_sas.h" +#include "fusion.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + + +/* Pre-TB command size and TB command size. 
*/ +#define MR_COMMAND_SIZE (64*20) /* 1280 bytes */ +MR_LD_RAID *MR_LdRaidGet(U32 ld, MR_FW_RAID_MAP_ALL *map); +U16 MR_TargetIdToLdGet(U32 ldTgtId, MR_FW_RAID_MAP_ALL *map); +U16 MR_GetLDTgtId(U32 ld, MR_FW_RAID_MAP_ALL *map); +U16 get_updated_dev_handle(PLD_LOAD_BALANCE_INFO, struct IO_REQUEST_INFO *); +extern ddi_dma_attr_t mrsas_generic_dma_attr; +extern uint32_t mrsas_tbolt_max_cap_maxxfer; +extern struct ddi_device_acc_attr endian_attr; +extern int debug_level_g; +extern unsigned int enable_fp; +volatile int dump_io_wait_time = 90; +extern void +io_timeout_checker(void *arg); +extern volatile int debug_timeout_g; +extern int mrsas_issue_pending_cmds(struct mrsas_instance *); +extern int mrsas_complete_pending_cmds(struct mrsas_instance *instance); +extern void push_pending_mfi_pkt(struct mrsas_instance *, + struct mrsas_cmd *); +extern U8 MR_BuildRaidContext(struct mrsas_instance *, struct IO_REQUEST_INFO *, + MPI2_SCSI_IO_VENDOR_UNIQUE *, MR_FW_RAID_MAP_ALL *); + +/* Local static prototypes. */ +static struct mrsas_cmd *mrsas_tbolt_build_cmd(struct mrsas_instance *, + struct scsi_address *, struct scsi_pkt *, uchar_t *); +static void mrsas_tbolt_set_pd_lba(U8 cdb[], uint8_t *cdb_len_ptr, + U64 start_blk, U32 num_blocks); +static int mrsas_tbolt_check_map_info(struct mrsas_instance *); +static int mrsas_tbolt_sync_map_info(struct mrsas_instance *); +static int mrsas_tbolt_prepare_pkt(struct scsa_cmd *); +static int mrsas_tbolt_ioc_init(struct mrsas_instance *, dma_obj_t *); +#ifdef PDSUPPORT +static void mrsas_tbolt_get_pd_info(struct mrsas_instance *, + struct mrsas_tbolt_pd_info *, int); +#endif /* PDSUPPORT */ + +static int debug_tbolt_fw_faults_after_ocr_g = 0; + +/* + * destroy_mfi_mpi_frame_pool + */ +void +destroy_mfi_mpi_frame_pool(struct mrsas_instance *instance) +{ + int i; + + struct mrsas_cmd *cmd; + + /* return all mfi frames to pool */ + for (i = 0; i < MRSAS_APP_RESERVED_CMDS; i++) { + cmd = instance->cmd_list[i]; + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + cmd->frame_dma_obj); + } + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } +} + +/* + * destroy_mpi2_frame_pool + */ +void +destroy_mpi2_frame_pool(struct mrsas_instance *instance) +{ + + if (instance->mpi2_frame_pool_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mpi2_frame_pool_dma_obj); + instance->mpi2_frame_pool_dma_obj.status |= DMA_OBJ_FREED; + } +} + + +/* + * mrsas_tbolt_free_additional_dma_buffer + */ +void +mrsas_tbolt_free_additional_dma_buffer(struct mrsas_instance *instance) +{ + int i; + + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } + + for (i = 0; i < 2; i++) { + if (instance->ld_map_obj[i].status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->ld_map_obj[i]); + instance->ld_map_obj[i].status = DMA_OBJ_FREED; + } + } +} + + +/* + * free_req_desc_pool + */ +void +free_req_rep_desc_pool(struct mrsas_instance *instance) +{ + if (instance->request_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->request_desc_dma_obj); + instance->request_desc_dma_obj.status = DMA_OBJ_FREED; + } + + if 
(instance->reply_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->reply_desc_dma_obj); + instance->reply_desc_dma_obj.status = DMA_OBJ_FREED; + } + + +} + + +/* + * ThunderBolt(TB) Request Message Frame Pool + */ +int +create_mpi2_frame_pool(struct mrsas_instance *instance) +{ + int i = 0; + uint16_t max_cmd; + uint32_t sgl_sz; + uint32_t raid_msg_size; + uint32_t total_size; + uint32_t offset; + uint32_t io_req_base_phys; + uint8_t *io_req_base; + struct mrsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sgl_sz = 1024; + raid_msg_size = MRSAS_THUNDERBOLT_MSG_SIZE; + + /* Allocating additional 256 bytes to accomodate SMID 0. */ + total_size = MRSAS_THUNDERBOLT_MSG_SIZE + (max_cmd * raid_msg_size) + + (max_cmd * sgl_sz) + (max_cmd * SENSE_LENGTH); + + con_log(CL_ANN1, (CE_NOTE, "create_mpi2_frame_pool: " + "max_cmd %x ", max_cmd)); + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mpi2_frame_pool: " + "request message frame pool size %x", total_size)); + + /* + * ThunderBolt(TB) We need to create a single chunk of DMA'ble memory + * and then split the memory to 1024 commands. Each command should be + * able to contain a RAID MESSAGE FRAME which will embed a MFI_FRAME + * within it. Further refer the "alloc_req_rep_desc" function where + * we allocate request/reply descriptors queues for a clue. + */ + + instance->mpi2_frame_pool_dma_obj.size = total_size; + instance->mpi2_frame_pool_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_addr_hi = + 0xFFFFFFFFU; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->mpi2_frame_pool_dma_obj.dma_attr.dma_attr_align = 256; + + if (mrsas_alloc_dma_obj(instance, &instance->mpi2_frame_pool_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc mpi2 frame pool"); + return (DDI_FAILURE); + } + + bzero(instance->mpi2_frame_pool_dma_obj.buffer, total_size); + instance->mpi2_frame_pool_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->io_request_frames = + (uint8_t *)instance->mpi2_frame_pool_dma_obj.buffer; + instance->io_request_frames_phy = + (uint32_t) + instance->mpi2_frame_pool_dma_obj.dma_cookie[0].dmac_address; + + con_log(CL_DLEVEL3, (CE_NOTE, "io_request_frames 0x%p", + (void *)instance->io_request_frames)); + + con_log(CL_DLEVEL3, (CE_NOTE, "io_request_frames_phy 0x%x", + instance->io_request_frames_phy)); + + io_req_base = (uint8_t *)instance->io_request_frames + + MRSAS_THUNDERBOLT_MSG_SIZE; + io_req_base_phys = instance->io_request_frames_phy + + MRSAS_THUNDERBOLT_MSG_SIZE; + + con_log(CL_DLEVEL3, (CE_NOTE, + "io req_base_phys 0x%x", io_req_base_phys)); + + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + + offset = i * MRSAS_THUNDERBOLT_MSG_SIZE; + + cmd->scsi_io_request = (Mpi2RaidSCSIIORequest_t *) + ((uint8_t *)io_req_base + offset); + cmd->scsi_io_request_phys_addr = io_req_base_phys + offset; + + cmd->sgl = (Mpi2SGEIOUnion_t *)((uint8_t *)io_req_base + + (max_cmd * raid_msg_size) + i * sgl_sz); + + cmd->sgl_phys_addr = (io_req_base_phys + + (max_cmd * raid_msg_size) + i * sgl_sz); + + cmd->sense1 = (uint8_t *)((uint8_t *)io_req_base + + (max_cmd * raid_msg_size) + (max_cmd * sgl_sz) + + (i * SENSE_LENGTH)); + + cmd->sense_phys_addr1 = (io_req_base_phys + + (max_cmd * raid_msg_size) + (max_cmd * sgl_sz) + + (i * SENSE_LENGTH)); + + + cmd->SMID = i + 1; + + con_log(CL_DLEVEL3, (CE_NOTE, 
"Frame Pool Addr [%x]0x%p", + cmd->index, (void *)cmd->scsi_io_request)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Frame Pool Phys Addr [%x]0x%x", + cmd->index, cmd->scsi_io_request_phys_addr)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sense Addr [%x]0x%p", + cmd->index, (void *)cmd->sense1)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sense Addr Phys [%x]0x%x", + cmd->index, cmd->sense_phys_addr1)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sgl bufffers [%x]0x%p", + cmd->index, (void *)cmd->sgl)); + + con_log(CL_DLEVEL3, (CE_NOTE, "Sgl bufffers phys [%x]0x%x", + cmd->index, cmd->sgl_phys_addr)); + } + + return (DDI_SUCCESS); + +} + + +/* + * alloc_additional_dma_buffer for AEN + */ +int +mrsas_tbolt_alloc_additional_dma_buffer(struct mrsas_instance *instance) +{ + uint32_t internal_buf_size = PAGESIZE*2; + int i; + + /* Initialize buffer status as free */ + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + instance->ld_map_obj[0].status = DMA_OBJ_FREED; + instance->ld_map_obj[1].status = DMA_OBJ_FREED; + + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + instance->internal_buf = + (caddr_t)(((unsigned long)instance->mfi_internal_dma_obj.buffer)); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address; + instance->internal_buf_size = internal_buf_size; + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct mrsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = mrsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 8; + + if (mrsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, "mrsas_tbolt_alloc_additional_dma_buffer: " + "could not allocate data transfer buffer."); + goto fail_tbolt_additional_buff; + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct mrsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + instance->size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * (MAX_LOGICAL_DRIVES - 1)); + + for (i = 0; i < 2; i++) { + /* allocate the data transfer buffer */ + instance->ld_map_obj[i].size = instance->size_map_info; + instance->ld_map_obj[i].dma_attr = mrsas_generic_dma_attr; + instance->ld_map_obj[i].dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->ld_map_obj[i].dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->ld_map_obj[i].dma_attr.dma_attr_sgllen = 1; + instance->ld_map_obj[i].dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->ld_map_obj[i], + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "could not allocate data transfer buffer."); + goto 
fail_tbolt_additional_buff; + } + + instance->ld_map_obj[i].status |= DMA_OBJ_ALLOCATED; + + (void) memset(instance->ld_map_obj[i].buffer, 0, + instance->size_map_info); + + instance->ld_map[i] = + (MR_FW_RAID_MAP_ALL *)instance->ld_map_obj[i].buffer; + instance->ld_map_phy[i] = (uint32_t)instance-> + ld_map_obj[i].dma_cookie[0].dmac_address; + + con_log(CL_DLEVEL3, (CE_NOTE, + "ld_map Addr Phys 0x%x", instance->ld_map_phy[i])); + + con_log(CL_DLEVEL3, (CE_NOTE, + "size_map_info 0x%x", instance->size_map_info)); + } + + return (DDI_SUCCESS); + +fail_tbolt_additional_buff: + mrsas_tbolt_free_additional_dma_buffer(instance); + + return (DDI_FAILURE); +} + +MRSAS_REQUEST_DESCRIPTOR_UNION * +mr_sas_get_request_descriptor(struct mrsas_instance *instance, uint16_t index) +{ + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc; + + if (index > instance->max_fw_cmds) { + con_log(CL_ANN1, (CE_NOTE, + "Invalid SMID 0x%x request for descriptor", index)); + con_log(CL_ANN1, (CE_NOTE, + "max_fw_cmds : 0x%x\n", instance->max_fw_cmds)); + return (NULL); + } + + req_desc = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + ((char *)instance->request_message_pool + + (sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION) * index)); + + con_log(CL_ANN1, (CE_NOTE, + "request descriptor : 0x%08lx\n", (unsigned long)req_desc)); + + con_log(CL_ANN1, (CE_NOTE, + "request descriptor base phy : 0x%08lx\n", + (unsigned long)instance->request_message_pool_phy)); + + return ((MRSAS_REQUEST_DESCRIPTOR_UNION *)req_desc); +} + + +/* + * Allocate Request and Reply Queue Descriptors. + */ +int +alloc_req_rep_desc(struct mrsas_instance *instance) +{ + uint32_t request_q_sz, reply_q_sz; + int i, max_reply_q_sz; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + + /* + * ThunderBolt(TB) There's no longer producer consumer mechanism. + * Once we have an interrupt we are supposed to scan through the list of + * reply descriptors and process them accordingly. 
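Both create_mpi2_frame_pool() above and mr_sas_get_request_descriptor() address per-command structures by offsetting into one contiguous DMA allocation rather than allocating per command. The fragment below is a simplified userland illustration of that carving, with made-up sizes (FRAME_SZ, SGL_SZ, NCMD) that only stand in for the driver's real constants.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define FRAME_SZ	256	/* per-command message frame (illustrative) */
#define SGL_SZ		1024	/* per-command chained-SGL area (illustrative) */
#define NCMD		8

int
main(void)
{
	/* one contiguous buffer stands in for the single DMA chunk */
	uint8_t *pool = calloc(NCMD, FRAME_SZ + SGL_SZ);
	uint8_t *frame_base = pool;			/* frames first ... */
	uint8_t *sgl_base = pool + NCMD * FRAME_SZ;	/* ... then SGL areas */

	for (int i = 0; i < NCMD; i++) {
		uint8_t *frame = frame_base + (size_t)i * FRAME_SZ;
		uint8_t *sgl = sgl_base + (size_t)i * SGL_SZ;

		/* SMID 0 is reserved, so command i gets SMID i + 1 */
		printf("cmd %d: frame %p sgl %p SMID %d\n", i,
		    (void *)frame, (void *)sgl, i + 1);
	}
	free(pool);
	return (0);
}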
We would be needing + * to allocate memory for 1024 reply descriptors + */ + + /* Allocate Reply Descriptors */ + con_log(CL_ANN1, (CE_NOTE, " reply q desc len = %x\n", + (uint_t)sizeof (MPI2_REPLY_DESCRIPTORS_UNION))); + + /* reply queue size should be multiple of 16 */ + max_reply_q_sz = ((instance->max_fw_cmds + 1 + 15)/16)*16; + + reply_q_sz = 8 * max_reply_q_sz; + + + con_log(CL_ANN1, (CE_NOTE, " reply q desc len = %x\n", + (uint_t)sizeof (MPI2_REPLY_DESCRIPTORS_UNION))); + + instance->reply_desc_dma_obj.size = reply_q_sz; + instance->reply_desc_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->reply_desc_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->reply_desc_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->reply_desc_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->reply_desc_dma_obj.dma_attr.dma_attr_align = 16; + + if (mrsas_alloc_dma_obj(instance, &instance->reply_desc_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc reply queue"); + return (DDI_FAILURE); + } + + bzero(instance->reply_desc_dma_obj.buffer, reply_q_sz); + instance->reply_desc_dma_obj.status |= DMA_OBJ_ALLOCATED; + + /* virtual address of reply queue */ + instance->reply_frame_pool = (MPI2_REPLY_DESCRIPTORS_UNION *)( + instance->reply_desc_dma_obj.buffer); + + instance->reply_q_depth = max_reply_q_sz; + + con_log(CL_ANN1, (CE_NOTE, "[reply queue depth]0x%x", + instance->reply_q_depth)); + + con_log(CL_ANN1, (CE_NOTE, "[reply queue virt addr]0x%p", + (void *)instance->reply_frame_pool)); + + /* initializing reply address to 0xFFFFFFFF */ + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++) { + reply_desc->Words = (uint64_t)~0; + reply_desc++; + } + + + instance->reply_frame_pool_phy = + (uint32_t)instance->reply_desc_dma_obj.dma_cookie[0].dmac_address; + + con_log(CL_ANN1, (CE_NOTE, + "[reply queue phys addr]0x%x", instance->reply_frame_pool_phy)); + + + instance->reply_pool_limit_phy = (instance->reply_frame_pool_phy + + reply_q_sz); + + con_log(CL_ANN1, (CE_NOTE, "[reply pool limit phys addr]0x%x", + instance->reply_pool_limit_phy)); + + + con_log(CL_ANN1, (CE_NOTE, " request q desc len = %x\n", + (int)sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION))); + + /* Allocate Request Descriptors */ + con_log(CL_ANN1, (CE_NOTE, " request q desc len = %x\n", + (int)sizeof (MRSAS_REQUEST_DESCRIPTOR_UNION))); + + request_q_sz = 8 * + (instance->max_fw_cmds); + + instance->request_desc_dma_obj.size = request_q_sz; + instance->request_desc_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->request_desc_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->request_desc_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->request_desc_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->request_desc_dma_obj.dma_attr.dma_attr_align = 16; + + if (mrsas_alloc_dma_obj(instance, &instance->request_desc_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "mr_sas: could not alloc request queue desc"); + goto fail_undo_reply_queue; + } + + bzero(instance->request_desc_dma_obj.buffer, request_q_sz); + instance->request_desc_dma_obj.status |= DMA_OBJ_ALLOCATED; + + /* virtual address of request queue desc */ + instance->request_message_pool = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + (instance->request_desc_dma_obj.buffer); + + instance->request_message_pool_phy = + (uint32_t)instance->request_desc_dma_obj.dma_cookie[0].dmac_address; + + return (DDI_SUCCESS); + +fail_undo_reply_queue: + if 
(instance->reply_desc_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) mrsas_free_dma_obj(instance, + instance->reply_desc_dma_obj); + instance->reply_desc_dma_obj.status = DMA_OBJ_FREED; + } + + return (DDI_FAILURE); +} + +/* + * mrsas_alloc_cmd_pool_tbolt + * + * TODO: merge tbolt-specific codee into mrsas_alloc_cmd_pool() to have single + * routine + */ +int +mrsas_alloc_cmd_pool_tbolt(struct mrsas_instance *instance) +{ + int i; + int count; + uint32_t max_cmd; + uint32_t reserve_cmd; + size_t sz; + + struct mrsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + con_log(CL_ANN1, (CE_NOTE, "mrsas_alloc_cmd_pool: " + "max_cmd %x", max_cmd)); + + + sz = sizeof (struct mrsas_cmd *) * max_cmd; + + /* + * instance->cmd_list is an array of struct mrsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + if (instance->cmd_list == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for cmd_list")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + for (count = 0; count < max_cmd; count++) { + instance->cmd_list[count] = + kmem_zalloc(sizeof (struct mrsas_cmd), KM_SLEEP); + if (instance->cmd_list[count] == NULL) { + con_log(CL_NONE, (CE_WARN, + "Failed to allocate memory for mrsas_cmd")); + goto mrsas_undo_cmds; + } + } + + /* add all the commands to command pool */ + + INIT_LIST_HEAD(&instance->cmd_pool_list); + INIT_LIST_HEAD(&instance->cmd_pend_list); + INIT_LIST_HEAD(&instance->cmd_app_pool_list); + + reserve_cmd = MRSAS_APP_RESERVED_CMDS; + + /* cmd index 0 reservered for IOC INIT */ + for (i = 1; i < reserve_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + mlist_add_tail(&cmd->list, &instance->cmd_app_pool_list); + } + + + for (i = reserve_cmd; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + return (DDI_SUCCESS); + +mrsas_undo_cmds: + if (count > 0) { + /* free each cmd */ + for (i = 0; i < count; i++) { + if (instance->cmd_list[i] != NULL) { + kmem_free(instance->cmd_list[i], + sizeof (struct mrsas_cmd)); + } + instance->cmd_list[i] = NULL; + } + } + +mrsas_undo_cmd_list: + if (instance->cmd_list != NULL) + kmem_free(instance->cmd_list, sz); + instance->cmd_list = NULL; + + return (DDI_FAILURE); +} + + +/* + * free_space_for_mpi2 + */ +void +free_space_for_mpi2(struct mrsas_instance *instance) +{ + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + /* First free the additional DMA buffer */ + mrsas_tbolt_free_additional_dma_buffer(instance); + + /* Free the request/reply descriptor pool */ + free_req_rep_desc_pool(instance); + + /* Free the MPI message pool */ + destroy_mpi2_frame_pool(instance); + + /* Free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* Free all the commands in the cmd_list */ + /* Free the cmd_list buffer itself */ + mrsas_free_cmd_pool(instance); +} + + +/* + * ThunderBolt(TB) memory allocations for commands/messages/frames. 
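The reply-queue sizing in alloc_req_rep_desc() above rounds the depth up to a multiple of 16 and then multiplies by 8 bytes per reply descriptor, matching the reply_q_sz computation in that function. A small worked example, using an assumed max_fw_cmds of 1007 purely for illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t max_fw_cmds = 1007;	/* example value, not from the driver */

	/* depth = (max commands + 1), rounded up to a multiple of 16 */
	uint32_t depth = ((max_fw_cmds + 1 + 15) / 16) * 16;
	uint32_t reply_q_sz = 8 * depth;

	printf("depth %u (multiple of 16), bytes %u\n", depth, reply_q_sz);
	return (0);
}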
+ */ +int +alloc_space_for_mpi2(struct mrsas_instance *instance) +{ + /* Allocate command pool (memory for cmd_list & individual commands) */ + if (mrsas_alloc_cmd_pool_tbolt(instance)) { + cmn_err(CE_WARN, "Error creating cmd pool"); + return (DDI_FAILURE); + } + + /* Initialize single reply size and Message size */ + instance->reply_size = MRSAS_THUNDERBOLT_REPLY_SIZE; + instance->raid_io_msg_size = MRSAS_THUNDERBOLT_MSG_SIZE; + + instance->max_sge_in_main_msg = (MRSAS_THUNDERBOLT_MSG_SIZE - + (sizeof (MPI2_RAID_SCSI_IO_REQUEST) - + sizeof (MPI2_SGE_IO_UNION)))/ sizeof (MPI2_SGE_IO_UNION); + instance->max_sge_in_chain = (MR_COMMAND_SIZE - + MRSAS_THUNDERBOLT_MSG_SIZE) / sizeof (MPI2_SGE_IO_UNION); + + /* Reduce SG count by 1 to take care of group cmds feature in FW */ + instance->max_num_sge = (instance->max_sge_in_main_msg + + instance->max_sge_in_chain - 2); + instance->chain_offset_mpt_msg = + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 16; + instance->chain_offset_io_req = (MRSAS_THUNDERBOLT_MSG_SIZE - + sizeof (MPI2_SGE_IO_UNION)) / 16; + instance->reply_read_index = 0; + + + /* Allocate Request and Reply descriptors Array */ + /* Make sure the buffer is aligned to 8 for req/rep descriptor Pool */ + if (alloc_req_rep_desc(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for descripter-pool"); + goto mpi2_undo_cmd_pool; + } + con_log(CL_ANN1, (CE_NOTE, "[request message pool phys addr]0x%x", + instance->request_message_pool_phy)); + + + /* Allocate MFI Frame pool - for MPI-MFI passthru commands */ + if (create_mfi_frame_pool(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for MFI frame-pool"); + goto mpi2_undo_descripter_pool; + } + + + /* Allocate MPI2 Message pool */ + /* + * Make sure the buffer is alligned to 256 for raid message packet + * create a io request pool and assign one frame to each cmd + */ + + if (create_mpi2_frame_pool(instance)) { + cmn_err(CE_WARN, + "Error, allocating memory for MPI2 Message-pool"); + goto mpi2_undo_mfi_frame_pool; + } + +#ifdef DEBUG + con_log(CL_ANN1, (CE_CONT, "[max_sge_in_main_msg]0x%x", + instance->max_sge_in_main_msg)); + con_log(CL_ANN1, (CE_CONT, "[max_sge_in_chain]0x%x", + instance->max_sge_in_chain)); + con_log(CL_ANN1, (CE_CONT, + "[max_sge]0x%x", instance->max_num_sge)); + con_log(CL_ANN1, (CE_CONT, "[chain_offset_mpt_msg]0x%x", + instance->chain_offset_mpt_msg)); + con_log(CL_ANN1, (CE_CONT, "[chain_offset_io_req]0x%x", + instance->chain_offset_io_req)); +#endif + + + /* Allocate additional dma buffer */ + if (mrsas_tbolt_alloc_additional_dma_buffer(instance)) { + cmn_err(CE_WARN, + "Error, allocating tbolt additional DMA buffer"); + goto mpi2_undo_message_pool; + } + + return (DDI_SUCCESS); + +mpi2_undo_message_pool: + destroy_mpi2_frame_pool(instance); + +mpi2_undo_mfi_frame_pool: + destroy_mfi_frame_pool(instance); + +mpi2_undo_descripter_pool: + free_req_rep_desc_pool(instance); + +mpi2_undo_cmd_pool: + mrsas_free_cmd_pool(instance); + + return (DDI_FAILURE); +} + + +/* + * mrsas_init_adapter_tbolt - Initialize fusion interface adapter. + */ +int +mrsas_init_adapter_tbolt(struct mrsas_instance *instance) +{ + + /* + * Reduce the max supported cmds by 1. 
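alloc_space_for_mpi2() above derives the scatter-gather budget from the message-frame size, the SGE size, and MR_COMMAND_SIZE. The arithmetic sketch below reruns that calculation with assumed values (a 256-byte message frame, 16-byte SGEs, and a 128-byte fixed request header); only MR_COMMAND_SIZE (64*20 = 1280) comes from the #define earlier in this file, the rest are illustrative guesses.

#include <stdio.h>

int
main(void)
{
	int msg_size = 256;		/* assumed MRSAS_THUNDERBOLT_MSG_SIZE */
	int sge_size = 16;		/* assumed sizeof (MPI2_SGE_IO_UNION) */
	int io_req_fixed = 128;		/* assumed non-SGL part of the request */
	int mr_command_size = 64 * 20;	/* 1280, as #defined above */

	/* SGEs that fit in the main frame after the fixed request header */
	int in_main = (msg_size - io_req_fixed) / sge_size;
	/* SGEs that fit in the chained area beyond the main frame */
	int in_chain = (mr_command_size - msg_size) / sge_size;
	/* minus 2, mirroring the "group cmds" adjustment in the driver */
	int max_sge = in_main + in_chain - 2;

	printf("main %d chain %d total %d\n", in_main, in_chain, max_sge);
	return (0);
}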
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + + if (instance->max_fw_cmds > 1008) { + instance->max_fw_cmds = 1008; + instance->max_fw_cmds = instance->max_fw_cmds-1; + } + + con_log(CL_ANN, (CE_NOTE, "mrsas_init_adapter_tbolt: " + " instance->max_fw_cmds 0x%X.", instance->max_fw_cmds)); + + + /* create a pool of commands */ + if (alloc_space_for_mpi2(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, + " alloc_space_for_mpi2() failed."); + + return (DDI_FAILURE); + } + + /* Send ioc init message */ + /* NOTE: the issue_init call does FMA checking already. */ + if (mrsas_issue_init_mpi2(instance) != DDI_SUCCESS) { + cmn_err(CE_WARN, + " mrsas_issue_init_mpi2() failed."); + + goto fail_init_fusion; + } + + instance->unroll.alloc_space_mpi2 = 1; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_init_adapter_tbolt: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + +fail_init_fusion: + free_space_for_mpi2(instance); + + return (DDI_FAILURE); +} + + + +/* + * init_mpi2 + */ +int +mrsas_issue_init_mpi2(struct mrsas_instance *instance) +{ + dma_obj_t init2_dma_obj; + int ret_val = DDI_SUCCESS; + + /* allocate DMA buffer for IOC INIT message */ + init2_dma_obj.size = sizeof (Mpi2IOCInitRequest_t); + init2_dma_obj.dma_attr = mrsas_generic_dma_attr; + init2_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + init2_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + init2_dma_obj.dma_attr.dma_attr_sgllen = 1; + init2_dma_obj.dma_attr.dma_attr_align = 256; + + if (mrsas_alloc_dma_obj(instance, &init2_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, "mr_sas_issue_init_mpi2 " + "could not allocate data transfer buffer."); + return (DDI_FAILURE); + } + (void) memset(init2_dma_obj.buffer, 2, + sizeof (Mpi2IOCInitRequest_t)); + + con_log(CL_ANN1, (CE_NOTE, + "mrsas_issue_init_mpi2 _phys adr: %x \n", + init2_dma_obj.dma_cookie[0].dmac_address)); + + + /* Initialize and send ioc init message */ + ret_val = mrsas_tbolt_ioc_init(instance, &init2_dma_obj); + if (ret_val == DDI_FAILURE) { + con_log(CL_ANN1, (CE_WARN, + "mrsas_issue_init_mpi2: Failed\n")); + goto fail_init_mpi2; + } + + /* free IOC init DMA buffer */ + if (mrsas_free_dma_obj(instance, init2_dma_obj) + != DDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, + "mrsas_issue_init_mpi2: Free Failed\n")); + return (DDI_FAILURE); + } + + /* Get/Check and sync ld_map info */ + instance->map_id = 0; + if (mrsas_tbolt_check_map_info(instance) == DDI_SUCCESS) + (void) mrsas_tbolt_sync_map_info(instance); + + + /* No mrsas_cmd to send, so send NULL. 
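mrsas_issue_init_mpi2() above starts with map_id = 0, and the command-build path later selects ld_map[(map_id & 1)], which points at a two-buffer (ping-pong) RAID map. The toy model below assumes, for illustration only, that map_id advances each time a new map is synced; the actual trigger lives elsewhere in the driver.

#include <stdio.h>

int
main(void)
{
	const char *ld_map[2] = { "map buffer A", "map buffer B" };
	unsigned int map_id = 0;

	for (int gen = 0; gen < 4; gen++) {
		/* (map_id & 1) always names the buffer holding the live map */
		printf("generation %d uses %s\n", gen, ld_map[map_id & 1]);
		map_id++;	/* a new map arrived; flip buffers */
	}
	return (0);
}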
*/ + if (mrsas_common_check(instance, NULL) != DDI_SUCCESS) + goto fail_init_mpi2; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_issue_init_mpi2: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + +fail_init_mpi2: + (void) mrsas_free_dma_obj(instance, init2_dma_obj); + + return (DDI_FAILURE); +} + +static int +mrsas_tbolt_ioc_init(struct mrsas_instance *instance, dma_obj_t *mpi2_dma_obj) +{ + int numbytes; + uint16_t flags; + struct mrsas_init_frame2 *mfiFrameInit2; + struct mrsas_header *frame_hdr; + Mpi2IOCInitRequest_t *init; + struct mrsas_cmd *cmd = NULL; + struct mrsas_drv_ver drv_ver_info; + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc; + + con_log(CL_ANN, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + +#ifdef DEBUG + con_log(CL_ANN1, (CE_CONT, " mfiFrameInit2 len = %x\n", + (int)sizeof (*mfiFrameInit2))); + con_log(CL_ANN1, (CE_CONT, " MPI len = %x\n", (int)sizeof (*init))); + con_log(CL_ANN1, (CE_CONT, " mfiFrameInit2 len = %x\n", + (int)sizeof (struct mrsas_init_frame2))); + con_log(CL_ANN1, (CE_CONT, " MPI len = %x\n", + (int)sizeof (Mpi2IOCInitRequest_t))); +#endif + + init = (Mpi2IOCInitRequest_t *)mpi2_dma_obj->buffer; + numbytes = sizeof (*init); + bzero(init, numbytes); + + ddi_put8(mpi2_dma_obj->acc_handle, &init->Function, + MPI2_FUNCTION_IOC_INIT); + + ddi_put8(mpi2_dma_obj->acc_handle, &init->WhoInit, + MPI2_WHOINIT_HOST_DRIVER); + + /* set MsgVersion and HeaderVersion host driver was built with */ + ddi_put16(mpi2_dma_obj->acc_handle, &init->MsgVersion, + MPI2_VERSION); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->HeaderVersion, + MPI2_HEADER_VERSION); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->SystemRequestFrameSize, + instance->raid_io_msg_size / 4); + + ddi_put16(mpi2_dma_obj->acc_handle, &init->ReplyFreeQueueDepth, + 0); + + ddi_put16(mpi2_dma_obj->acc_handle, + &init->ReplyDescriptorPostQueueDepth, + instance->reply_q_depth); + /* + * These addresses are set using the DMA cookie addresses from when the + * memory was allocated. Sense buffer hi address should be 0. 
+ * ddi_put32(accessp, &init->SenseBufferAddressHigh, 0); + */ + + ddi_put32(mpi2_dma_obj->acc_handle, + &init->SenseBufferAddressHigh, 0); + + ddi_put64(mpi2_dma_obj->acc_handle, + (uint64_t *)&init->SystemRequestFrameBaseAddress, + instance->io_request_frames_phy); + + ddi_put64(mpi2_dma_obj->acc_handle, + &init->ReplyDescriptorPostQueueAddress, + instance->reply_frame_pool_phy); + + ddi_put64(mpi2_dma_obj->acc_handle, + &init->ReplyFreeQueueAddress, 0); + + cmd = instance->cmd_list[0]; + if (cmd == NULL) { + return (DDI_FAILURE); + } + cmd->retry_count_for_ocr = 0; + cmd->pkt = NULL; + cmd->drv_pkt_time = 0; + + mfiFrameInit2 = (struct mrsas_init_frame2 *)cmd->scsi_io_request; + con_log(CL_ANN1, (CE_CONT, "[mfi vaddr]%p", (void *)mfiFrameInit2)); + + frame_hdr = &cmd->frame->hdr; + + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + + flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + con_log(CL_ANN, (CE_CONT, + "mrsas_tbolt_ioc_init: SMID:%x\n", cmd->SMID)); + + /* Init the MFI Header */ + ddi_put8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd, MFI_CMD_OP_INIT); + + con_log(CL_ANN1, (CE_CONT, "[CMD]%x", mfiFrameInit2->cmd)); + + ddi_put8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd_status, + MFI_STAT_INVALID_STATUS); + + con_log(CL_ANN1, (CE_CONT, "[Status]%x", mfiFrameInit2->cmd_status)); + + ddi_put32(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->queue_info_new_phys_addr_lo, + mpi2_dma_obj->dma_cookie[0].dmac_address); + + ddi_put32(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->data_xfer_len, + sizeof (Mpi2IOCInitRequest_t)); + + con_log(CL_ANN1, (CE_CONT, "[reply q desc addr]%x", + (int)init->ReplyDescriptorPostQueueAddress)); + + /* fill driver version information */ + fill_up_drv_ver(&drv_ver_info); + + /* allocate the driver version data transfer buffer */ + instance->drv_ver_dma_obj.size = sizeof (drv_ver_info.drv_ver); + instance->drv_ver_dma_obj.dma_attr = mrsas_generic_dma_attr; + instance->drv_ver_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->drv_ver_dma_obj.dma_attr.dma_attr_sgllen = 1; + instance->drv_ver_dma_obj.dma_attr.dma_attr_align = 1; + + if (mrsas_alloc_dma_obj(instance, &instance->drv_ver_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + cmn_err(CE_WARN, + "fusion init: Could not allocate driver version buffer."); + return (DDI_FAILURE); + } + /* copy driver version to dma buffer */ + (void) memset(instance->drv_ver_dma_obj.buffer, 0, + sizeof (drv_ver_info.drv_ver)); + ddi_rep_put8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)drv_ver_info.drv_ver, + (uint8_t *)instance->drv_ver_dma_obj.buffer, + sizeof (drv_ver_info.drv_ver), DDI_DEV_AUTOINCR); + + /* send driver version physical address to firmware */ + ddi_put64(cmd->frame_dma_obj.acc_handle, &mfiFrameInit2->driverversion, + instance->drv_ver_dma_obj.dma_cookie[0].dmac_address); + + con_log(CL_ANN1, (CE_CONT, "[MPIINIT2 frame Phys addr ]0x%x len = %x", + mfiFrameInit2->queue_info_new_phys_addr_lo, + (int)sizeof (Mpi2IOCInitRequest_t))); + + con_log(CL_ANN1, (CE_CONT, "[Length]%x", mfiFrameInit2->data_xfer_len)); + + con_log(CL_ANN1, (CE_CONT, "[MFI frame Phys Address]%x len = %x", + cmd->scsi_io_request_phys_addr, + (int)sizeof (struct mrsas_init_frame2))); + + 
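Every field of the IOC INIT request above is written through ddi_put8/16/32/64 against a DDI_STRUCTURE_LE_ACC handle, so the stores come out little-endian regardless of host byte order. The standalone sketch below, with a hypothetical put_le32() standing in for ddi_put32(), shows the byte-order guarantee being relied on.

#include <stdio.h>
#include <stdint.h>

/* store a 32-bit value little-endian, whatever the host endianness */
static void
put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v & 0xff;
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}

int
main(void)
{
	uint8_t field[4];

	put_le32(field, 0x12345678u);
	printf("%02x %02x %02x %02x\n",
	    field[0], field[1], field[2], field[3]);	/* 78 56 34 12 */
	return (0);
}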
/* disable interrupts before sending INIT2 frame */ + instance->func_ptr->disable_intr(instance); + + req_desc = (MRSAS_REQUEST_DESCRIPTOR_UNION *) + instance->request_message_pool; + req_desc->Words = cmd->scsi_io_request_phys_addr; + req_desc->MFAIo.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_MFA << MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + cmd->request_desc = req_desc; + + /* issue the init frame */ + instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd); + + con_log(CL_ANN1, (CE_CONT, "[cmd = %d] ", frame_hdr->cmd)); + con_log(CL_ANN1, (CE_CONT, "[cmd Status= %x] ", + frame_hdr->cmd_status)); + + if (ddi_get8(instance->mpi2_frame_pool_dma_obj.acc_handle, + &mfiFrameInit2->cmd_status) == 0) { + con_log(CL_ANN, (CE_NOTE, "INIT2 Success")); + } else { + con_log(CL_ANN, (CE_WARN, "INIT2 Fail")); + mrsas_dump_reply_desc(instance); + goto fail_ioc_init; + } + + mrsas_dump_reply_desc(instance); + + instance->unroll.verBuff = 1; + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_ioc_init: SUCCESSFULL\n")); + + return (DDI_SUCCESS); + + +fail_ioc_init: + + (void) mrsas_free_dma_obj(instance, instance->drv_ver_dma_obj); + + return (DDI_FAILURE); +} + +int +wait_for_outstanding_poll_io(struct mrsas_instance *instance) +{ + int i; + uint32_t wait_time = dump_io_wait_time; + for (i = 0; i < wait_time; i++) { + /* + * Check For Outstanding poll Commands + * except ldsync command and aen command + */ + if (instance->fw_outstanding <= 2) { + break; + } + drv_usecwait(10*MILLISEC); + /* complete commands from reply queue */ + (void) mr_sas_tbolt_process_outstanding_cmd(instance); + } + if (instance->fw_outstanding > 2) { + return (1); + } + return (0); +} +/* + * scsi_pkt handling + * + * Visible to the external world via the transport structure. + */ + +int +mrsas_tbolt_tran_start(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct mrsas_instance *instance = ADDR2MR(ap); + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct mrsas_cmd *cmd = NULL; + uchar_t cmd_done = 0; + + con_log(CL_DLEVEL1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + if (instance->deadadapter == 1) { + cmn_err(CE_WARN, + "mrsas_tran_start:TBOLT return TRAN_FATAL_ERROR " + "for IO, as the HBA doesnt take any more IOs"); + if (pkt) { + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + } + return (TRAN_FATAL_ERROR); + } + if (instance->adapterresetinprogress) { + con_log(CL_ANN, (CE_NOTE, "Reset flag set, " + "returning mfi_pkt and setting TRAN_BUSY\n")); + return (TRAN_BUSY); + } + (void) mrsas_tbolt_prepare_pkt(acmd); + + cmd = mrsas_tbolt_build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the mrsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + cmn_err(CE_WARN, + "Command Queue Full... 
Returning BUSY \n"); + return_raid_msg_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + con_log(CL_ANN, (CE_CONT, "tbolt_issue_cmd: SCSI CDB[0]=0x%x " + "cmd->index:0x%x SMID 0x%x\n", pkt->pkt_cdbp[0], + cmd->index, cmd->SMID)); + + instance->func_ptr->issue_cmd(cmd, instance); + } else { + instance->func_ptr->issue_cmd(cmd, instance); + (void) wait_for_outstanding_poll_io(instance); + (void) mrsas_common_check(instance, cmd); + } + + return (TRAN_ACCEPT); +} + +/* + * prepare the pkt: + * the pkt may have been resubmitted or just reused so + * initialize some fields and do some checks. + */ +static int +mrsas_tbolt_prepare_pkt(struct scsa_cmd *acmd) +{ + struct scsi_pkt *pkt = CMD2PKT(acmd); + + + /* + * Reinitialize some fields that need it; the packet may + * have been resubmitted + */ + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_resid = 0; + + /* + * zero status byte. + */ + *(pkt->pkt_scbp) = 0; + + return (0); +} + + +int +mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, + struct scsa_cmd *acmd, + struct mrsas_cmd *cmd, + Mpi2RaidSCSIIORequest_t *scsi_raid_io, + uint32_t *datalen) +{ + uint32_t MaxSGEs; + int sg_to_process; + uint32_t i, j; + uint32_t numElements, endElement; + Mpi25IeeeSgeChain64_t *ieeeChainElement = NULL; + Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee = NULL; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + con_log(CL_ANN1, (CE_NOTE, + "chkpnt: Building Chained SGL :%d", __LINE__)); + + /* Calulate SGE size in number of Words(32bit) */ + /* Clear the datalen before updating it. */ + *datalen = 0; + + MaxSGEs = instance->max_sge_in_main_msg; + + ddi_put16(acc_handle, &scsi_raid_io->SGLFlags, + MPI2_SGE_FLAGS_64_BIT_ADDRESSING); + + /* set data transfer flag. */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + ddi_put32(acc_handle, &scsi_raid_io->Control, + MPI2_SCSIIO_CONTROL_WRITE); + } else { + ddi_put32(acc_handle, &scsi_raid_io->Control, + MPI2_SCSIIO_CONTROL_READ); + } + + + numElements = acmd->cmd_cookiecnt; + + con_log(CL_DLEVEL1, (CE_NOTE, "[SGE Count]:%x", numElements)); + + if (numElements > instance->max_num_sge) { + con_log(CL_ANN, (CE_NOTE, + "[Max SGE Count Exceeded]:%x", numElements)); + return (numElements); + } + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.numSGE, + (uint8_t)numElements); + + /* set end element in main message frame */ + endElement = (numElements <= MaxSGEs) ? 
numElements : (MaxSGEs - 1); + + /* prepare the scatter-gather list for the firmware */ + scsi_raid_io_sgl_ieee = + (Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain; + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + Mpi25IeeeSgeChain64_t *sgl_ptr_end = scsi_raid_io_sgl_ieee; + sgl_ptr_end += instance->max_sge_in_main_msg - 1; + + ddi_put8(acc_handle, &sgl_ptr_end->Flags, 0); + } + + for (i = 0; i < endElement; i++, scsi_raid_io_sgl_ieee++) { + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + acmd->cmd_dmacookies[i].dmac_laddress); + + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, + acmd->cmd_dmacookies[i].dmac_size); + + ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + if (i == (numElements - 1)) { + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, + IEEE_SGE_FLAGS_END_OF_LIST); + } + } + + *datalen += acmd->cmd_dmacookies[i].dmac_size; + +#ifdef DEBUG + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Address]: %" PRIx64, + scsi_raid_io_sgl_ieee->Address)); + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Length]:%x", + scsi_raid_io_sgl_ieee->Length)); + con_log(CL_DLEVEL1, (CE_NOTE, "[SGL Flags]:%x", + scsi_raid_io_sgl_ieee->Flags)); +#endif + + } + + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, 0); + + /* check if chained SGL required */ + if (i < numElements) { + + con_log(CL_ANN1, (CE_NOTE, "[Chain Element index]:%x", i)); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint16_t ioFlags = + ddi_get16(acc_handle, &scsi_raid_io->IoFlags); + + if ((ioFlags & + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH) != + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH) { + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)instance->chain_offset_io_req); + } else { + ddi_put8(acc_handle, + &scsi_raid_io->ChainOffset, 0); + } + } else { + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)instance->chain_offset_io_req); + } + + /* prepare physical chain element */ + ieeeChainElement = scsi_raid_io_sgl_ieee; + + ddi_put8(acc_handle, &ieeeChainElement->NextChainOffset, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + ddi_put8(acc_handle, &ieeeChainElement->Flags, + IEEE_SGE_FLAGS_CHAIN_ELEMENT); + } else { + ddi_put8(acc_handle, &ieeeChainElement->Flags, + (IEEE_SGE_FLAGS_CHAIN_ELEMENT | + MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR)); + } + + ddi_put32(acc_handle, &ieeeChainElement->Length, + (sizeof (MPI2_SGE_IO_UNION) * (numElements - i))); + + ddi_put64(acc_handle, &ieeeChainElement->Address, + (U64)cmd->sgl_phys_addr); + + sg_to_process = numElements - i; + + con_log(CL_ANN1, (CE_NOTE, + "[Additional SGE Count]:%x", endElement)); + + /* point to the chained SGL buffer */ + scsi_raid_io_sgl_ieee = (Mpi25IeeeSgeChain64_t *)cmd->sgl; + + /* build rest of the SGL in chained buffer */ + for (j = 0; j < sg_to_process; j++, scsi_raid_io_sgl_ieee++) { + con_log(CL_DLEVEL3, (CE_NOTE, "[remaining SGL]:%x", i)); + + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + acmd->cmd_dmacookies[i].dmac_laddress); + + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, + acmd->cmd_dmacookies[i].dmac_size); + + ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + if (i == (numElements - 1)) { + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, + IEEE_SGE_FLAGS_END_OF_LIST); + } + } + + *datalen += acmd->cmd_dmacookies[i].dmac_size; + +#if DEBUG + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Address]: %" PRIx64, + 
scsi_raid_io_sgl_ieee->Address)); + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Length]:%x", scsi_raid_io_sgl_ieee->Length)); + con_log(CL_DLEVEL1, (CE_NOTE, + "[SGL Flags]:%x", scsi_raid_io_sgl_ieee->Flags)); +#endif + + i++; + } + } + + return (0); +} /*end of BuildScatterGather */ + + +/* + * build_cmd + */ +static struct mrsas_cmd * +mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint8_t fp_possible = 0; + uint32_t index; + uint32_t lba_count = 0; + uint32_t start_lba_hi = 0; + uint32_t start_lba_lo = 0; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + struct mrsas_cmd *cmd = NULL; + struct scsa_cmd *acmd = PKT2CMD(pkt); + MRSAS_REQUEST_DESCRIPTOR_UNION *ReqDescUnion; + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + uint32_t datalen; + struct IO_REQUEST_INFO io_info; + MR_FW_RAID_MAP_ALL *local_map_ptr; + uint16_t pd_cmd_cdblen; + + con_log(CL_DLEVEL1, (CE_NOTE, + "chkpnt: Entered mrsas_tbolt_build_cmd:%d", __LINE__)); + + /* find out if this is logical or physical drive command. */ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_raid_msg_pkt(instance))) { + return (NULL); + } + + index = cmd->index; + ReqDescUnion = mr_sas_get_request_descriptor(instance, index); + ReqDescUnion->Words = 0; + ReqDescUnion->SCSIIO.SMID = cmd->SMID; + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_LD_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + + cmd->request_desc = ReqDescUnion; + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + con_log(CL_ANN, (CE_NOTE, "NO DMA\n")); + } + + + /* get SCSI_IO raid message frame pointer */ + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + /* zero out SCSI_IO raid message frame */ + (void) memset(scsi_raid_io, 0, sizeof (Mpi2RaidSCSIIORequest_t)); + + /* Set the ldTargetId set by BuildRaidContext() */ + ddi_put16(acc_handle, &scsi_raid_io->RaidContext.ldTargetId, + acmd->device_id); + + /* Copy CDB to scsi_io_request message frame */ + ddi_rep_put8(acc_handle, + (uint8_t *)pkt->pkt_cdbp, (uint8_t *)scsi_raid_io->CDB.CDB32, + acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + /* + * Just the CDB length, rest of the Flags are zero + * This will be modified later. 
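mr_sas_tbolt_build_sgl() above places the first SGEs directly in the main message frame and, when the cookie count exceeds that space, turns the last main-frame slot into a chain element pointing at the per-command SGL buffer that holds the rest. The toy model below mirrors only that split decision; MAIN_SLOTS and the cookie count are invented numbers, not driver values.

#include <stdio.h>

#define MAIN_SLOTS	4	/* assumed max_sge_in_main_msg */

int
main(void)
{
	int ncookies = 9;	/* example DMA cookie count */
	int in_main, in_chain;

	if (ncookies <= MAIN_SLOTS) {
		in_main = ncookies;	/* everything fits in the frame */
		in_chain = 0;
	} else {
		in_main = MAIN_SLOTS - 1;	/* last slot becomes the chain */
		in_chain = ncookies - in_main;
	}
	printf("%d SGEs in frame, %d in chain buffer\n", in_main, in_chain);
	return (0);
}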
+ */ + ddi_put16(acc_handle, &scsi_raid_io->IoFlags, acmd->cmd_cdblen); + + pd_cmd_cdblen = acmd->cmd_cdblen; + + switch (pkt->pkt_cdbp[0]) { + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + case SCMD_READ_G4: + case SCMD_WRITE_G4: + case SCMD_READ_G5: + case SCMD_WRITE_G5: + + if (acmd->islogical) { + /* Initialize sense Information */ + if (cmd->sense1 == NULL) { + con_log(CL_ANN, (CE_NOTE, "tbolt_build_cmd: " + "Sense buffer ptr NULL \n")); + } + bzero(cmd->sense1, SENSE_LENGTH); + con_log(CL_DLEVEL2, (CE_NOTE, "tbolt_build_cmd " + "CDB[0] = %x\n", pkt->pkt_cdbp[0])); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + /* 6-byte cdb */ + lba_count = (uint16_t)(pkt->pkt_cdbp[4]); + start_lba_lo = ((uint32_t)(pkt->pkt_cdbp[3]) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16)); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + /* 10-byte cdb */ + lba_count = + (((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8)); + + start_lba_lo = + (((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + + } else if (acmd->cmd_cdblen == CDB_GROUP5) { + /* 12-byte cdb */ + lba_count = ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24)); + + start_lba_lo = + (((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + + } else if (acmd->cmd_cdblen == CDB_GROUP4) { + /* 16-byte cdb */ + lba_count = ( + ((uint32_t)(pkt->pkt_cdbp[13])) | + ((uint32_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[10]) << 24)); + + start_lba_lo = ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24)); + + start_lba_hi = ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24)); + } + + if (instance->tbolt && + ((lba_count * 512) > mrsas_tbolt_max_cap_maxxfer)) { + cmn_err(CE_WARN, " IO SECTOR COUNT exceeds " + "controller limit 0x%x sectors\n", + lba_count); + } + + (void) memset(&io_info, 0, + sizeof (struct IO_REQUEST_INFO)); + io_info.ldStartBlock = ((uint64_t)start_lba_hi << 32) | + start_lba_lo; + io_info.numBlocks = lba_count; + io_info.ldTgtId = acmd->device_id; + + if (acmd->cmd_flags & CFLAG_DMASEND) + io_info.isRead = 0; + else + io_info.isRead = 1; + + + /* Acquire SYNC MAP UPDATE lock */ + mutex_enter(&instance->sync_map_mtx); + + local_map_ptr = + instance->ld_map[(instance->map_id & 1)]; + + if ((MR_TargetIdToLdGet( + acmd->device_id, local_map_ptr) >= + MAX_LOGICAL_DRIVES) || !instance->fast_path_io) { + cmn_err(CE_NOTE, "Fast Path NOT Possible, " + "targetId >= MAX_LOGICAL_DRIVES || " + "!instance->fast_path_io\n"); + fp_possible = 0; + /* Set Regionlock flags to BYPASS */ + /* io_request->RaidContext.regLockFlags = 0; */ + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, 0); + } else { + if (MR_BuildRaidContext(instance, &io_info, + &scsi_raid_io->RaidContext, local_map_ptr)) + fp_possible = io_info.fpOkForIo; + } + + if (!enable_fp) + fp_possible = 0; + + con_log(CL_ANN1, (CE_NOTE, "enable_fp %d " + "instance->fast_path_io %d fp_possible %d \n", + enable_fp, 
instance->fast_path_io, fp_possible)); + + if (fp_possible) { + + /* Check for DIF enabled LD */ + if (MR_CheckDIF(acmd->device_id, local_map_ptr)) { + /* Prepare 32 Byte CDB for DIF capable Disk */ + mrsas_tbolt_prepare_cdb(instance, + scsi_raid_io->CDB.CDB32, + &io_info, scsi_raid_io, start_lba_lo); + } else { + mrsas_tbolt_set_pd_lba(scsi_raid_io->CDB.CDB32, + (uint8_t *)&pd_cmd_cdblen, + io_info.pdBlock, io_info.numBlocks); + ddi_put16(acc_handle, + &scsi_raid_io->IoFlags, pd_cmd_cdblen); + } + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_SCSI_IO_REQUEST); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint8_t regLockFlags = ddi_get8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags); + uint16_t IoFlags = ddi_get16(acc_handle, + &scsi_raid_io->IoFlags); + + if (regLockFlags == REGION_TYPE_UNUSED) + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + IoFlags |= + MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH; + regLockFlags |= + (MR_RL_FLAGS_GRANT_DESTINATION_CUDA | + MR_RL_FLAGS_SEQ_NUM_ENABLE); + + ddi_put8(acc_handle, + &scsi_raid_io->ChainOffset, 0); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.nsegType, + ((0x01 << MPI2_NSEG_FLAGS_SHIFT) | + MPI2_TYPE_CUDA)); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, + regLockFlags); + ddi_put16(acc_handle, + &scsi_raid_io->IoFlags, IoFlags); + } + + if ((instance->load_balance_info[ + acmd->device_id].loadBalanceFlag) && + (io_info.isRead)) { + io_info.devHandle = + get_updated_dev_handle(&instance-> + load_balance_info[acmd->device_id], + &io_info); + cmd->load_balance_flag |= + MEGASAS_LOAD_BALANCE_FLAG; + } else { + cmd->load_balance_flag &= + ~MEGASAS_LOAD_BALANCE_FLAG; + } + + ReqDescUnion->SCSIIO.DevHandle = io_info.devHandle; + ddi_put16(acc_handle, &scsi_raid_io->DevHandle, + io_info.devHandle); + + } else { + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_LD_IO_REQUEST); + + ddi_put16(acc_handle, + &scsi_raid_io->DevHandle, acmd->device_id); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_LD_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.timeoutValue, + local_map_ptr->raidMap.fpPdIoTimeoutSec); + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + uint8_t regLockFlags = ddi_get8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags); + + if (regLockFlags == REGION_TYPE_UNUSED) { + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_NO_LOCK << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + } + + regLockFlags |= + (MR_RL_FLAGS_GRANT_DESTINATION_CPU0 | + MR_RL_FLAGS_SEQ_NUM_ENABLE); + + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.nsegType, + ((0x01 << MPI2_NSEG_FLAGS_SHIFT) | + MPI2_TYPE_CUDA)); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, + regLockFlags); + } + } /* Not FP */ + + /* Release SYNC MAP UPDATE lock */ + mutex_exit(&instance->sync_map_mtx); + + + /* + * Set sense buffer physical address/length in scsi_io_request. 
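The READ/WRITE arm of mrsas_tbolt_build_cmd() above assembles the start LBA and block count from big-endian CDB bytes (bytes 2-5 and 7-8 for a 10-byte CDB). A worked example with an invented READ(10) CDB, not taken from the driver:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* READ(10) for LBA 0x12345678, 16 blocks (illustrative values) */
	uint8_t cdb[10] = { 0x28, 0, 0x12, 0x34, 0x56, 0x78, 0, 0, 0x10, 0 };
	uint32_t lba, nblocks;

	/* CDB fields are big-endian: byte 2 is the most significant */
	lba = ((uint32_t)cdb[2] << 24) | ((uint32_t)cdb[3] << 16) |
	    ((uint32_t)cdb[4] << 8) | (uint32_t)cdb[5];
	nblocks = ((uint32_t)cdb[7] << 8) | (uint32_t)cdb[8];

	printf("lba 0x%x, blocks %u\n", lba, nblocks);	/* 0x12345678, 16 */
	return (0);
}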
+ */ + ddi_put32(acc_handle, &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + ddi_put8(acc_handle, &scsi_raid_io->SenseBufferLength, + SENSE_LENGTH); + + /* Construct SGL */ + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + (void) mr_sas_tbolt_build_sgl(instance, acmd, cmd, + scsi_raid_io, &datalen); + + ddi_put32(acc_handle, &scsi_raid_io->DataLength, datalen); + + break; +#ifndef PDSUPPORT /* if PDSUPPORT, skip break and fall through */ + } else { + break; +#endif + } + /* fall through For all non-rd/wr cmds */ + default: + switch (pkt->pkt_cdbp[0]) { + case 0x35: { /* SCMD_SYNCHRONIZE_CACHE */ + return_raid_msg_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) mrsas_mode_sense_build(pkt); + return_raid_msg_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + + default: { + /* + * Here we need to handle PASSTHRU for + * Logical Devices. Like Inquiry etc. + */ + + if (!(acmd->islogical)) { + + /* Acquire SYNC MAP UPDATE lock */ + mutex_enter(&instance->sync_map_mtx); + + local_map_ptr = + instance->ld_map[(instance->map_id & 1)]; + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_SCSI_IO_REQUEST); + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ddi_put16(acc_handle, &scsi_raid_io->DevHandle, + local_map_ptr->raidMap. + devHndlInfo[acmd->device_id].curDevHdl); + + + /* Set regLockFlasgs to REGION_TYPE_BYPASS */ + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.regLockFlags, 0); + ddi_put64(acc_handle, + &scsi_raid_io->RaidContext.regLockRowLBA, + 0); + ddi_put32(acc_handle, + &scsi_raid_io->RaidContext.regLockLength, + 0); + ddi_put8(acc_handle, + &scsi_raid_io->RaidContext.RAIDFlags, + MR_RAID_FLAGS_IO_SUB_TYPE_SYSTEM_PD << + MR_RAID_CTX_RAID_FLAGS_IO_SUB_TYPE_SHIFT); + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.timeoutValue, + local_map_ptr->raidMap.fpPdIoTimeoutSec); + ddi_put16(acc_handle, + &scsi_raid_io->RaidContext.ldTargetId, + acmd->device_id); + ddi_put8(acc_handle, + &scsi_raid_io->LUN[1], acmd->lun); + + /* Release SYNC MAP UPDATE lock */ + mutex_exit(&instance->sync_map_mtx); + + } else { + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_LD_IO_REQUEST); + ddi_put8(acc_handle, + &scsi_raid_io->LUN[1], acmd->lun); + ddi_put16(acc_handle, + &scsi_raid_io->DevHandle, acmd->device_id); + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + } + + /* + * Set sense buffer physical address/length in + * scsi_io_request. 
+ */ + ddi_put32(acc_handle, + &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + ddi_put8(acc_handle, + &scsi_raid_io->SenseBufferLength, SENSE_LENGTH); + + /* Construct SGL */ + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + (void) mr_sas_tbolt_build_sgl(instance, acmd, cmd, + scsi_raid_io, &datalen); + + ddi_put32(acc_handle, + &scsi_raid_io->DataLength, datalen); + + + con_log(CL_ANN, (CE_CONT, + "tbolt_build_cmd CDB[0] =%x, TargetID =%x\n", + pkt->pkt_cdbp[0], acmd->device_id)); + con_log(CL_DLEVEL1, (CE_CONT, + "data length = %x\n", + scsi_raid_io->DataLength)); + con_log(CL_DLEVEL1, (CE_CONT, + "cdb length = %x\n", + acmd->cmd_cdblen)); + } + break; + } + + } + + return (cmd); +} + +/* + * mrsas_tbolt_tran_init_pkt - allocate & initialize a scsi_pkt structure + * @ap: + * @pkt: + * @bp: + * @cmdlen: + * @statuslen: + * @tgtlen: + * @flags: + * @callback: + * + * The tran_init_pkt() entry point allocates and initializes a scsi_pkt + * structure and DMA resources for a target driver request. The + * tran_init_pkt() entry point is called when the target driver calls the + * SCSA function scsi_init_pkt(). Each call of the tran_init_pkt() entry point + * is a request to perform one or more of three possible services: + * - allocation and initialization of a scsi_pkt structure + * - allocation of DMA resources for data transfer + * - reallocation of DMA resources for the next portion of the data transfer + */ +struct scsi_pkt * +mrsas_tbolt_tran_init_pkt(struct scsi_address *ap, + register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct mrsas_instance *instance; + struct scsi_pkt *new_pkt; + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (mrsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (mrsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + return (pkt); +} + + +uint32_t +tbolt_read_fw_status_reg(struct mrsas_instance *instance) +{ + return ((uint32_t)RD_OB_SCRATCH_PAD_0(instance)); +} + +void +tbolt_issue_cmd(struct mrsas_cmd *cmd, struct mrsas_instance *instance) +{ + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + atomic_add_16(&instance->fw_outstanding, 1); + + struct scsi_pkt *pkt; + + con_log(CL_ANN1, + (CE_NOTE, "tbolt_issue_cmd: cmd->[SMID]=0x%X", cmd->SMID)); + + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc Words] %" PRIx64 " \n", req_desc->Words)); + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc low part] %x \n", + (uint_t)(req_desc->Words & 0xffffffffff))); + con_log(CL_DLEVEL1, (CE_CONT, + " [req desc high part] %x \n", (uint_t)(req_desc->Words >> 32))); + pkt = cmd->pkt; + + if (pkt) { + con_log(CL_ANN1, (CE_CONT, "%llx :TBOLT issue_cmd_ppc:" + "ISSUED CMD TO FW : called : cmd:" + ": %p instance : %p pkt : %p pkt_time : %x\n", + gethrtime(), (void *)cmd, (void *)instance, + (void *)pkt, cmd->drv_pkt_time)); + if (instance->adapterresetinprogress) { + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN, (CE_NOTE, + "TBOLT Reset the scsi_pkt timer")); + } else { + push_pending_mfi_pkt(instance, cmd); + } + + } else { + con_log(CL_ANN1, (CE_CONT, "%llx :TBOLT issue_cmd_ppc:" + "ISSUED CMD TO FW : called : cmd : %p, instance: %p" + "(NO PKT)\n", gethrtime(), (void *)cmd, (void *)instance)); + } + + /* Issue the command to the FW */ + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); +} + +/* + * issue_cmd_in_sync_mode + */ +int +tbolt_issue_cmd_in_sync_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + + struct mrsas_header *hdr; + hdr = (struct mrsas_header *)&cmd->frame->hdr; + + con_log(CL_ANN, + (CE_NOTE, "tbolt_issue_cmd_in_sync_mode: cmd->[SMID]=0x%X", + cmd->SMID)); + + + if (instance->adapterresetinprogress) { + cmd->drv_pkt_time = ddi_get16 + (cmd->frame_dma_obj.acc_handle, &hdr->timeout); + if (cmd->drv_pkt_time < debug_timeout_g) + cmd->drv_pkt_time = (uint16_t)debug_timeout_g; + con_log(CL_ANN, (CE_NOTE, "tbolt_issue_cmd_in_sync_mode:" + "RESET-IN-PROGRESS, issue cmd & return.\n")); + + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + 
return (DDI_SUCCESS); + } else { + con_log(CL_ANN1, (CE_NOTE, + "tbolt_issue_cmd_in_sync_mode: pushing the pkt\n")); + push_pending_mfi_pkt(instance, cmd); + } + + con_log(CL_DLEVEL2, (CE_NOTE, + "HighQport offset :%p", + (void *)((uintptr_t)(instance)->regmap + IB_HIGH_QPORT))); + con_log(CL_DLEVEL2, (CE_NOTE, + "LowQport offset :%p", + (void *)((uintptr_t)(instance)->regmap + IB_LOW_QPORT))); + + cmd->sync_cmd = MRSAS_TRUE; + cmd->cmd_status = ENODATA; + + + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + con_log(CL_ANN1, (CE_NOTE, + " req desc high part %x \n", (uint_t)(req_desc->Words >> 32))); + con_log(CL_ANN1, (CE_NOTE, " req desc low part %x \n", + (uint_t)(req_desc->Words & 0xffffffff))); + + mutex_enter(&instance->int_cmd_mtx); + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + mutex_exit(&instance->int_cmd_mtx); + + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +int +tbolt_issue_cmd_in_poll_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct mrsas_header *frame_hdr; + + con_log(CL_ANN, + (CE_NOTE, "tbolt_issue_cmd_in_poll_mode: cmd->[SMID]=0x%X", + cmd->SMID)); + + MRSAS_REQUEST_DESCRIPTOR_UNION *req_desc = cmd->request_desc; + + frame_hdr = (struct mrsas_header *)&cmd->frame->hdr; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + con_log(CL_ANN1, (CE_NOTE, " req desc low part %x \n", + (uint_t)(req_desc->Words & 0xffffffff))); + con_log(CL_ANN1, (CE_NOTE, + " req desc high part %x \n", (uint_t)(req_desc->Words >> 32))); + + /* issue the frame using inbound queue port */ + mutex_enter(&instance->reg_write_mtx); + WR_IB_LOW_QPORT((uint32_t)(req_desc->Words), instance); + WR_IB_HIGH_QPORT((uint32_t)(req_desc->Words >> 32), instance); + mutex_exit(&instance->reg_write_mtx); + + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + /* wait for cmd_status to change from 0xFF */ + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, + &frame_hdr->cmd_status) == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN1, (CE_NOTE, + " cmd failed %" PRIx64 " \n", (req_desc->Words))); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +tbolt_enable_intr(struct mrsas_instance *instance) +{ + /* TODO: For Thunderbolt/Invader also clear intr on enable */ + /* writel(~0, ®s->outbound_intr_status); */ + /* readl(®s->outbound_intr_status); */ + + WR_OB_INTR_MASK(~(MFI_FUSION_ENABLE_INTERRUPT_MASK), instance); + + /* dummy read to force PCI flush */ + (void) RD_OB_INTR_MASK(instance); + +} + +void +tbolt_disable_intr(struct mrsas_instance *instance) +{ + uint32_t mask = 0xFFFFFFFF; + + WR_OB_INTR_MASK(mask, instance); + + /* Dummy readl to force pci flush */ + + (void) RD_OB_INTR_MASK(instance); +} + + +int +tbolt_intr_ack(struct mrsas_instance *instance) +{ + uint32_t status; + + /* check if it is our interrupt */ + 
status = RD_OB_INTR_STATUS(instance); + con_log(CL_ANN1, (CE_NOTE, + "chkpnt: Entered tbolt_intr_ack status = %d \n", status)); + + if (!(status & MFI_FUSION_ENABLE_INTERRUPT_MASK)) { + return (DDI_INTR_UNCLAIMED); + } + + if (mrsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + if ((status & 1) || (status & MFI_FUSION_ENABLE_INTERRUPT_MASK)) { + /* clear the interrupt by writing back the same value */ + WR_OB_INTR_STATUS(status, instance); + /* dummy READ */ + (void) RD_OB_INTR_STATUS(instance); + } + return (DDI_INTR_CLAIMED); +} + +/* + * get_raid_msg_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ + +struct mrsas_cmd * +get_raid_msg_pkt(struct mrsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct mrsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct mrsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) { + cmd->pkt = NULL; + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + } + mutex_exit(&instance->cmd_pool_mtx); + + if (cmd != NULL) + bzero(cmd->scsi_io_request, + sizeof (Mpi2RaidSCSIIORequest_t)); + return (cmd); +} + +struct mrsas_cmd * +get_raid_msg_mfi_pkt(struct mrsas_instance *instance) +{ + mlist_t *head = &instance->cmd_app_pool_list; + struct mrsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_app_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_app_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct mrsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) { + cmd->retry_count_for_ocr = 0; + cmd->drv_pkt_time = 0; + cmd->pkt = NULL; + cmd->request_desc = NULL; + + } + + mutex_exit(&instance->cmd_app_pool_mtx); + + if (cmd != NULL) { + bzero(cmd->scsi_io_request, + sizeof (Mpi2RaidSCSIIORequest_t)); + } + + return (cmd); +} + +/* + * return_raid_msg_pkt : Return a cmd to free command pool + */ +void +return_raid_msg_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +void +return_raid_msg_mfi_pkt(struct mrsas_instance *instance, struct mrsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_app_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_app_pool_mtx)); + + mlist_add_tail(&cmd->list, &instance->cmd_app_pool_list); + + mutex_exit(&instance->cmd_app_pool_mtx); +} + + +void +mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee; + MRSAS_REQUEST_DESCRIPTOR_UNION *ReqDescUnion; + uint32_t index; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + if (!instance->tbolt) { + con_log(CL_ANN, (CE_NOTE, "Not MFA enabled.\n")); + return; + } + + index = cmd->index; + + ReqDescUnion = mr_sas_get_request_descriptor(instance, index); + + if (!ReqDescUnion) { + con_log(CL_ANN1, (CE_NOTE, "[NULL REQDESC]")); + return; + } + + con_log(CL_ANN1, (CE_NOTE, 
"[SMID]%x", cmd->SMID)); + + ReqDescUnion->Words = 0; + + ReqDescUnion->SCSIIO.RequestFlags = + (MPI2_REQ_DESCRIPT_FLAGS_SCSI_IO << + MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); + + ReqDescUnion->SCSIIO.SMID = cmd->SMID; + + cmd->request_desc = ReqDescUnion; + + /* get raid message frame pointer */ + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + if (instance->device_id == PCI_DEVICE_ID_LSI_INVADER) { + Mpi25IeeeSgeChain64_t *sgl_ptr_end = (Mpi25IeeeSgeChain64_t *) + &scsi_raid_io->SGL.IeeeChain; + sgl_ptr_end += instance->max_sge_in_main_msg - 1; + ddi_put8(acc_handle, &sgl_ptr_end->Flags, 0); + } + + ddi_put8(acc_handle, &scsi_raid_io->Function, + MPI2_FUNCTION_PASSTHRU_IO_REQUEST); + + ddi_put8(acc_handle, &scsi_raid_io->SGLOffset0, + offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 4); + + ddi_put8(acc_handle, &scsi_raid_io->ChainOffset, + (U8)offsetof(MPI2_RAID_SCSI_IO_REQUEST, SGL) / 16); + + ddi_put32(acc_handle, &scsi_raid_io->SenseBufferLowAddress, + cmd->sense_phys_addr1); + + + scsi_raid_io_sgl_ieee = + (Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain; + + ddi_put64(acc_handle, &scsi_raid_io_sgl_ieee->Address, + (U64)cmd->frame_phys_addr); + + ddi_put8(acc_handle, + &scsi_raid_io_sgl_ieee->Flags, (IEEE_SGE_FLAGS_CHAIN_ELEMENT | + MPI2_IEEE_SGE_FLAGS_IOCPLBNTA_ADDR)); + /* LSI put hardcoded 1024 instead of MEGASAS_MAX_SZ_CHAIN_FRAME. */ + ddi_put32(acc_handle, &scsi_raid_io_sgl_ieee->Length, 1024); + + con_log(CL_ANN1, (CE_NOTE, + "[MFI CMD PHY ADDRESS]:%" PRIx64, + scsi_raid_io_sgl_ieee->Address)); + con_log(CL_ANN1, (CE_NOTE, + "[SGL Length]:%x", scsi_raid_io_sgl_ieee->Length)); + con_log(CL_ANN1, (CE_NOTE, "[SGL Flags]:%x", + scsi_raid_io_sgl_ieee->Flags)); +} + + +void +tbolt_complete_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + uint8_t status; + uint8_t extStatus; + uint8_t arm; + struct scsa_cmd *acmd; + struct scsi_pkt *pkt; + struct scsi_arq_status *arqstat; + Mpi2RaidSCSIIORequest_t *scsi_raid_io; + LD_LOAD_BALANCE_INFO *lbinfo; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; + + status = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.status); + extStatus = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.extStatus); + + con_log(CL_DLEVEL3, (CE_NOTE, "status %x", status)); + con_log(CL_DLEVEL3, (CE_NOTE, "extStatus %x", extStatus)); + + if (status != MFI_STAT_OK) { + con_log(CL_ANN, (CE_WARN, + "IO Cmd Failed SMID %x", cmd->SMID)); + } else { + con_log(CL_ANN, (CE_NOTE, + "IO Cmd Success SMID %x", cmd->SMID)); + } + + /* regular commands */ + + switch (ddi_get8(acc_handle, &scsi_raid_io->Function)) { + + case MPI2_FUNCTION_SCSI_IO_REQUEST : /* Fast Path IO. */ + acmd = (struct scsa_cmd *)cmd->cmd; + lbinfo = &instance->load_balance_info[acmd->device_id]; + + if (cmd->load_balance_flag & MEGASAS_LOAD_BALANCE_FLAG) { + arm = lbinfo->raid1DevHandle[0] == + scsi_raid_io->DevHandle ? 0 : 1; + + lbinfo->scsi_pending_cmds[arm]--; + cmd->load_balance_flag &= ~MEGASAS_LOAD_BALANCE_FLAG; + } + con_log(CL_DLEVEL3, (CE_NOTE, + "FastPath IO Completion Success ")); + /* FALLTHRU */ + + case MPI2_FUNCTION_LD_IO_REQUEST : { /* Regular Path IO. 
*/ + acmd = (struct scsa_cmd *)cmd->cmd; + pkt = (struct scsi_pkt *)CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN, (CE_CONT, " CDB[0] = %x completed for %s: " + "size %lx SMID %x cmd_status %x", pkt->pkt_cdbp[0], + ((acmd->islogical) ? "LD" : "PD"), + acmd->cmd_dmacount, cmd->SMID, status)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (status == MFI_STAT_OK)) { + display_scsi_inquiry((caddr_t)inq); +#ifdef PDSUPPORT + } else if ((status == MFI_STAT_OK) && + inq->inq_dtype == DTYPE_DIRECT) { + display_scsi_inquiry((caddr_t)inq); +#endif + } else { + /* for physical disk */ + status = MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + pkt->pkt_reason = CMD_TRAN_ERR; + break; + case MFI_STAT_SCSI_IO_FAILED: + cmn_err(CE_WARN, "tbolt_complete_cmd: scsi_io failed"); + pkt->pkt_reason = CMD_TRAN_ERR; + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN, (CE_WARN, + "tbolt_complete_cmd: scsi_done with error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + con_log(CL_ANN1, + (CE_NOTE, "Copying Sense data %x", + cmd->SMID)); + + ddi_rep_get8(acc_handle, + (uint8_t *)&(arqstat->sts_sensedata), + cmd->sense1, + sizeof (struct scsi_extended_sense), + DDI_DEV_AUTOINCR); + + } + break; + case MFI_STAT_LD_OFFLINE: + cmn_err(CE_WARN, + "tbolt_complete_cmd: ld offline " + "CDB[0]=0x%x targetId=0x%x devhandle=0x%x\n", + /* UNDO: */ + ddi_get8(acc_handle, &scsi_raid_io->CDB.CDB32[0]), + + ddi_get16(acc_handle, + &scsi_raid_io->RaidContext.ldTargetId), + + ddi_get16(acc_handle, &scsi_raid_io->DevHandle)); + + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN, (CE_CONT, + "tbolt_complete_cmd: device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = STATUS_GOOD; + + 
arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + break; + case MFI_STAT_INVALID_CMD: + case MFI_STAT_INVALID_DCMD: + case MFI_STAT_INVALID_PARAMETER: + case MFI_STAT_INVALID_SEQUENCE_NUMBER: + default: + cmn_err(CE_WARN, "tbolt_complete_cmd: Unknown status!"); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + (void) mrsas_common_check(instance, cmd); + if (acmd->cmd_dmahandle) { + if (mrsas_check_dma_handle(acmd->cmd_dmahandle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) + (*pkt->pkt_comp)(pkt); + + con_log(CL_ANN1, (CE_NOTE, "Free smid %x", cmd->SMID)); + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.status, 0); + + ddi_put8(acc_handle, &scsi_raid_io->RaidContext.extStatus, 0); + + return_raid_msg_pkt(instance, cmd); + break; + } + case MPI2_FUNCTION_PASSTHRU_IO_REQUEST: /* MFA command. */ + + if (cmd->frame->dcmd.opcode == MR_DCMD_LD_MAP_GET_INFO && + cmd->frame->dcmd.mbox.b[1] == 1) { + + mutex_enter(&instance->sync_map_mtx); + + con_log(CL_ANN, (CE_NOTE, + "LDMAP sync command SMID RECEIVED 0x%X", + cmd->SMID)); + if (cmd->frame->hdr.cmd_status != 0) { + cmn_err(CE_WARN, + "map sync failed, status = 0x%x.\n", + cmd->frame->hdr.cmd_status); + } else { + instance->map_id++; + cmn_err(CE_NOTE, + "map sync received, switched map_id to %" + PRIu64 " \n", instance->map_id); + } + + if (MR_ValidateMapInfo(instance->ld_map[ + (instance->map_id & 1)], + instance->load_balance_info)) { + instance->fast_path_io = 1; + } else { + instance->fast_path_io = 0; + } + + con_log(CL_ANN, (CE_NOTE, + "instance->fast_path_io %d \n", + instance->fast_path_io)); + + instance->unroll.syncCmd = 0; + + if (instance->map_update_cmd == cmd) { + return_raid_msg_pkt(instance, cmd); + atomic_add_16(&instance->fw_outstanding, (-1)); + (void) mrsas_tbolt_sync_map_info(instance); + } + + cmn_err(CE_NOTE, "LDMAP sync completed.\n"); + mutex_exit(&instance->sync_map_mtx); + break; + } + + if (cmd->frame->dcmd.opcode == MR_DCMD_CTRL_EVENT_WAIT) { + con_log(CL_ANN1, (CE_CONT, + "AEN command SMID RECEIVED 0x%X", + cmd->SMID)); + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, "mrsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, (-1)); + service_mfi_aen(instance, cmd); + } + } + + if (cmd->sync_cmd == MRSAS_TRUE) { + con_log(CL_ANN1, (CE_CONT, + "Sync-mode Command Response SMID RECEIVED 0x%X", + cmd->SMID)); + + tbolt_complete_cmd_in_sync_mode(instance, cmd); + } else { + con_log(CL_ANN, (CE_CONT, + "tbolt_complete_cmd: Wrong SMID RECEIVED 0x%X", + cmd->SMID)); + } + break; + default: + mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + /* free message */ + con_log(CL_ANN, + (CE_NOTE, "tbolt_complete_cmd: Unknown Type!!!!!!!!")); + break; + } +} + +uint_t +mr_sas_tbolt_process_outstanding_cmd(struct mrsas_instance *instance) +{ + uint8_t replyType; + Mpi2SCSIIOSuccessReplyDescriptor_t *replyDesc; + Mpi2ReplyDescriptorsUnion_t *desc; 
+ uint16_t smid; + union desc_value d_val; + struct mrsas_cmd *cmd; + + struct mrsas_header *hdr; + struct scsi_pkt *pkt; + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + desc = instance->reply_frame_pool; + desc += instance->reply_read_index; + + replyDesc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *)desc; + replyType = replyDesc->ReplyFlags & + MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + + if (replyType == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) + return (DDI_INTR_UNCLAIMED); + + if (mrsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + mrsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + con_log(CL_ANN1, + (CE_WARN, "mr_sas_tbolt_process_outstanding_cmd(): " + "FMA check, returning DDI_INTR_UNCLAIMED")); + return (DDI_INTR_CLAIMED); + } + + con_log(CL_ANN1, (CE_NOTE, "Reply Desc = %p Words = %" PRIx64 " \n", + (void *)desc, desc->Words)); + + d_val.word = desc->Words; + + + /* Read Reply descriptor */ + while ((d_val.u1.low != 0xffffffff) && + (d_val.u1.high != 0xffffffff)) { + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + smid = replyDesc->SMID; + + if (!smid || smid > instance->max_fw_cmds + 1) { + con_log(CL_ANN1, (CE_NOTE, + "Reply Desc at Break = %p Words = %" PRIx64 " \n", + (void *)desc, desc->Words)); + break; + } + + cmd = instance->cmd_list[smid - 1]; + if (!cmd) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_tbolt_process_" + "outstanding_cmd: Invalid command " + " or Poll commad Received in completion path\n")); + } else { + mutex_enter(&instance->cmd_pend_mtx); + if (cmd->sync_cmd == MRSAS_TRUE) { + hdr = (struct mrsas_header *)&cmd->frame->hdr; + if (hdr) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_" + "tbolt_process_outstanding_cmd:" + " mlist_del_init(&cmd->list).\n")); + mlist_del_init(&cmd->list); + } + } else { + pkt = cmd->pkt; + if (pkt) { + con_log(CL_ANN1, (CE_NOTE, "mr_sas_" + "tbolt_process_outstanding_cmd:" + "mlist_del_init(&cmd->list).\n")); + mlist_del_init(&cmd->list); + } + } + + mutex_exit(&instance->cmd_pend_mtx); + + tbolt_complete_cmd(instance, cmd); + } + /* set it back to all 0xfffffffff. */ + desc->Words = (uint64_t)~0; + + instance->reply_read_index++; + + if (instance->reply_read_index >= (instance->reply_q_depth)) { + con_log(CL_ANN1, (CE_NOTE, "wrap around")); + instance->reply_read_index = 0; + } + + /* Get the next reply descriptor */ + if (!instance->reply_read_index) + desc = instance->reply_frame_pool; + else + desc++; + + replyDesc = (MPI2_SCSI_IO_SUCCESS_REPLY_DESCRIPTOR *)desc; + + d_val.word = desc->Words; + + con_log(CL_ANN1, (CE_NOTE, + "Next Reply Desc = %p Words = %" PRIx64 "\n", + (void *)desc, desc->Words)); + + replyType = replyDesc->ReplyFlags & + MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK; + + if (replyType == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) + break; + + } /* End of while loop. 
*/ + + /* update replyIndex to FW */ + WR_MPI2_REPLY_POST_INDEX(instance->reply_read_index, instance); + + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + (void) ddi_dma_sync(instance->reply_desc_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + return (DDI_INTR_CLAIMED); +} + + + + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. + */ +void +tbolt_complete_cmd_in_sync_mode(struct mrsas_instance *instance, + struct mrsas_cmd *cmd) +{ + + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = MRSAS_FALSE; + + mutex_enter(&instance->int_cmd_mtx); + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + cv_broadcast(&instance->int_cmd_cv); + mutex_exit(&instance->int_cmd_mtx); + +} + +/* + * mrsas_tbolt_get_ld_map_info - Returns ld_map structure + * instance: Adapter soft state + * + * Issues an internal command (DCMD) to get the FW's controller PD + * list structure. This information is mainly used to find out SYSTEM + * supported by the FW. + */ +int +mrsas_tbolt_get_ld_map_info(struct mrsas_instance *instance) +{ + int ret = 0; + struct mrsas_cmd *cmd = NULL; + struct mrsas_dcmd_frame *dcmd; + MR_FW_RAID_MAP_ALL *ci; + uint32_t ci_h = 0; + U32 size_map_info; + + cmd = get_raid_msg_pkt(instance); + + if (cmd == NULL) { + cmn_err(CE_WARN, + "Failed to get a cmd from free-pool in get_ld_map_info()"); + return (DDI_FAILURE); + } + + dcmd = &cmd->frame->dcmd; + + size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * + (MAX_LOGICAL_DRIVES - 1)); + + con_log(CL_ANN, (CE_NOTE, + "size_map_info : 0x%x", size_map_info)); + + ci = instance->ld_map[(instance->map_id & 1)]; + ci_h = instance->ld_map_phy[(instance->map_id & 1)]; + + if (!ci) { + cmn_err(CE_WARN, "Failed to alloc mem for ld_map_info"); + return_raid_msg_pkt(instance, cmd); + return (-1); + } + + (void) memset(ci, 0, sizeof (*ci)); + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + dcmd->cmd = MFI_CMD_OP_DCMD; + dcmd->cmd_status = 0xFF; + dcmd->sge_count = 1; + dcmd->flags = MFI_FRAME_DIR_READ; + dcmd->timeout = 0; + dcmd->pad_0 = 0; + dcmd->data_xfer_len = size_map_info; + dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO; + dcmd->sgl.sge32[0].phys_addr = ci_h; + dcmd->sgl.sge32[0].length = size_map_info; + + + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + con_log(CL_ANN1, (CE_NOTE, + "Get LD Map Info success\n")); + } else { + cmn_err(CE_WARN, + "Get LD Map Info failed\n"); + ret = -1; + } + + return_raid_msg_pkt(instance, cmd); + + return (ret); +} + +void +mrsas_dump_reply_desc(struct mrsas_instance *instance) +{ + uint32_t i; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + union desc_value d_val; + + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++, reply_desc++) { + d_val.word = reply_desc->Words; + con_log(CL_DLEVEL3, (CE_NOTE, + "i=%d, %x:%x", + i, d_val.u1.high, d_val.u1.low)); + } +} + +/* + * mrsas_tbolt_command_create - Create command for fast path. + * @io_info: MegaRAID IO request packet pointer. + * @ref_tag: Reference tag for RD/WRPROTECT + * + * Create the command for fast path. 
+ */ +void +mrsas_tbolt_prepare_cdb(struct mrsas_instance *instance, U8 cdb[], + struct IO_REQUEST_INFO *io_info, Mpi2RaidSCSIIORequest_t *scsi_io_request, + U32 ref_tag) +{ + uint16_t EEDPFlags; + uint32_t Control; + ddi_acc_handle_t acc_handle = + instance->mpi2_frame_pool_dma_obj.acc_handle; + + /* Prepare 32-byte CDB if DIF is supported on this device */ + con_log(CL_ANN, (CE_NOTE, "Prepare DIF CDB\n")); + + (void) memset(cdb, 0, 32); + + cdb[0] = MRSAS_SCSI_VARIABLE_LENGTH_CMD; + + + cdb[7] = MRSAS_SCSI_ADDL_CDB_LEN; + + if (io_info->isRead) + cdb[9] = MRSAS_SCSI_SERVICE_ACTION_READ32; + else + cdb[9] = MRSAS_SCSI_SERVICE_ACTION_WRITE32; + + /* Verify within linux driver, set to MEGASAS_RD_WR_PROTECT_CHECK_ALL */ + cdb[10] = MRSAS_RD_WR_PROTECT; + + /* LOGICAL BLOCK ADDRESS */ + cdb[12] = (U8)(((io_info->pdBlock) >> 56) & 0xff); + cdb[13] = (U8)(((io_info->pdBlock) >> 48) & 0xff); + cdb[14] = (U8)(((io_info->pdBlock) >> 40) & 0xff); + cdb[15] = (U8)(((io_info->pdBlock) >> 32) & 0xff); + cdb[16] = (U8)(((io_info->pdBlock) >> 24) & 0xff); + cdb[17] = (U8)(((io_info->pdBlock) >> 16) & 0xff); + cdb[18] = (U8)(((io_info->pdBlock) >> 8) & 0xff); + cdb[19] = (U8)((io_info->pdBlock) & 0xff); + + /* Logical block reference tag */ + ddi_put32(acc_handle, &scsi_io_request->CDB.EEDP32.PrimaryReferenceTag, + BIG_ENDIAN(ref_tag)); + + ddi_put16(acc_handle, + &scsi_io_request->CDB.EEDP32.PrimaryApplicationTagMask, 0xffff); + + ddi_put32(acc_handle, &scsi_io_request->DataLength, + ((io_info->numBlocks)*512)); + /* Specify 32-byte cdb */ + ddi_put16(acc_handle, &scsi_io_request->IoFlags, 32); + + /* Transfer length */ + cdb[28] = (U8)(((io_info->numBlocks) >> 24) & 0xff); + cdb[29] = (U8)(((io_info->numBlocks) >> 16) & 0xff); + cdb[30] = (U8)(((io_info->numBlocks) >> 8) & 0xff); + cdb[31] = (U8)((io_info->numBlocks) & 0xff); + + /* set SCSI IO EEDPFlags */ + EEDPFlags = ddi_get16(acc_handle, &scsi_io_request->EEDPFlags); + Control = ddi_get32(acc_handle, &scsi_io_request->Control); + + /* set SCSI IO EEDPFlags bits */ + if (io_info->isRead) { + /* + * For READ commands, the EEDPFlags shall be set to specify to + * Increment the Primary Reference Tag, to Check the Reference + * Tag, and to Check and Remove the Protection Information + * fields. + */ + EEDPFlags = MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_REMOVE_OP | + MPI2_SCSIIO_EEDPFLAGS_CHECK_APPTAG | + MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; + } else { + /* + * For WRITE commands, the EEDPFlags shall be set to specify to + * Increment the Primary Reference Tag, and to Insert + * Protection Information fields. 
+ */ + EEDPFlags = MPI2_SCSIIO_EEDPFLAGS_INC_PRI_REFTAG | + MPI2_SCSIIO_EEDPFLAGS_INSERT_OP; + } + Control |= (0x4 << 26); + + ddi_put16(acc_handle, &scsi_io_request->EEDPFlags, EEDPFlags); + ddi_put32(acc_handle, &scsi_io_request->Control, Control); + ddi_put32(acc_handle, + &scsi_io_request->EEDPBlockSize, MRSAS_EEDPBLOCKSIZE); +} + + +/* + * mrsas_tbolt_set_pd_lba - Sets PD LBA + * @cdb: CDB + * @cdb_len: cdb length + * @start_blk: Start block of IO + * + * Used to set the PD LBA in CDB for FP IOs + */ +static void +mrsas_tbolt_set_pd_lba(U8 cdb[], uint8_t *cdb_len_ptr, U64 start_blk, + U32 num_blocks) +{ + U8 cdb_len = *cdb_len_ptr; + U8 flagvals = 0, opcode = 0, groupnum = 0, control = 0; + + /* Some drives don't support 16/12 byte CDB's, convert to 10 */ + if (((cdb_len == 12) || (cdb_len == 16)) && + (start_blk <= 0xffffffff)) { + if (cdb_len == 16) { + con_log(CL_ANN, + (CE_NOTE, "Converting READ/WRITE(16) to READ10\n")); + opcode = cdb[0] == READ_16 ? READ_10 : WRITE_10; + flagvals = cdb[1]; + groupnum = cdb[14]; + control = cdb[15]; + } else { + con_log(CL_ANN, + (CE_NOTE, "Converting READ/WRITE(12) to READ10\n")); + opcode = cdb[0] == READ_12 ? READ_10 : WRITE_10; + flagvals = cdb[1]; + groupnum = cdb[10]; + control = cdb[11]; + } + + (void) memset(cdb, 0, sizeof (cdb)); + + cdb[0] = opcode; + cdb[1] = flagvals; + cdb[6] = groupnum; + cdb[9] = control; + /* Set transfer length */ + cdb[8] = (U8)(num_blocks & 0xff); + cdb[7] = (U8)((num_blocks >> 8) & 0xff); + cdb_len = 10; + } else if ((cdb_len < 16) && (start_blk > 0xffffffff)) { + /* Convert to 16 byte CDB for large LBA's */ + con_log(CL_ANN, + (CE_NOTE, "Converting 6/10/12 CDB to 16 byte CDB\n")); + switch (cdb_len) { + case 6: + opcode = cdb[0] == READ_6 ? READ_16 : WRITE_16; + control = cdb[5]; + break; + case 10: + opcode = cdb[0] == READ_10 ? READ_16 : WRITE_16; + flagvals = cdb[1]; + groupnum = cdb[6]; + control = cdb[9]; + break; + case 12: + opcode = cdb[0] == READ_12 ? READ_16 : WRITE_16; + flagvals = cdb[1]; + groupnum = cdb[10]; + control = cdb[11]; + break; + } + + (void) memset(cdb, 0, sizeof (cdb)); + + cdb[0] = opcode; + cdb[1] = flagvals; + cdb[14] = groupnum; + cdb[15] = control; + + /* Transfer length */ + cdb[13] = (U8)(num_blocks & 0xff); + cdb[12] = (U8)((num_blocks >> 8) & 0xff); + cdb[11] = (U8)((num_blocks >> 16) & 0xff); + cdb[10] = (U8)((num_blocks >> 24) & 0xff); + + /* Specify 16-byte cdb */ + cdb_len = 16; + } else if ((cdb_len == 6) && (start_blk > 0x1fffff)) { + /* convert to 10 byte CDB */ + opcode = cdb[0] == READ_6 ? 
READ_10 : WRITE_10; + control = cdb[5]; + + (void) memset(cdb, 0, sizeof (cdb)); + cdb[0] = opcode; + cdb[9] = control; + + /* Set transfer length */ + cdb[8] = (U8)(num_blocks & 0xff); + cdb[7] = (U8)((num_blocks >> 8) & 0xff); + + /* Specify 10-byte cdb */ + cdb_len = 10; + } + + + /* Fall through Normal case, just load LBA here */ + switch (cdb_len) { + case 6: + { + U8 val = cdb[1] & 0xE0; + cdb[3] = (U8)(start_blk & 0xff); + cdb[2] = (U8)((start_blk >> 8) & 0xff); + cdb[1] = val | ((U8)(start_blk >> 16) & 0x1f); + break; + } + case 10: + cdb[5] = (U8)(start_blk & 0xff); + cdb[4] = (U8)((start_blk >> 8) & 0xff); + cdb[3] = (U8)((start_blk >> 16) & 0xff); + cdb[2] = (U8)((start_blk >> 24) & 0xff); + break; + case 12: + cdb[5] = (U8)(start_blk & 0xff); + cdb[4] = (U8)((start_blk >> 8) & 0xff); + cdb[3] = (U8)((start_blk >> 16) & 0xff); + cdb[2] = (U8)((start_blk >> 24) & 0xff); + break; + + case 16: + cdb[9] = (U8)(start_blk & 0xff); + cdb[8] = (U8)((start_blk >> 8) & 0xff); + cdb[7] = (U8)((start_blk >> 16) & 0xff); + cdb[6] = (U8)((start_blk >> 24) & 0xff); + cdb[5] = (U8)((start_blk >> 32) & 0xff); + cdb[4] = (U8)((start_blk >> 40) & 0xff); + cdb[3] = (U8)((start_blk >> 48) & 0xff); + cdb[2] = (U8)((start_blk >> 56) & 0xff); + break; + } + + *cdb_len_ptr = cdb_len; +} + + +static int +mrsas_tbolt_check_map_info(struct mrsas_instance *instance) +{ + MR_FW_RAID_MAP_ALL *ld_map; + + if (!mrsas_tbolt_get_ld_map_info(instance)) { + + ld_map = instance->ld_map[(instance->map_id & 1)]; + + con_log(CL_ANN1, (CE_NOTE, "ldCount=%d, map size=%d\n", + ld_map->raidMap.ldCount, ld_map->raidMap.totalSize)); + + if (MR_ValidateMapInfo(instance->ld_map[ + (instance->map_id & 1)], instance->load_balance_info)) { + con_log(CL_ANN, + (CE_CONT, "MR_ValidateMapInfo success")); + + instance->fast_path_io = 1; + con_log(CL_ANN, + (CE_NOTE, "instance->fast_path_io %d \n", + instance->fast_path_io)); + + return (DDI_SUCCESS); + } + + } + + instance->fast_path_io = 0; + cmn_err(CE_WARN, "MR_ValidateMapInfo failed"); + con_log(CL_ANN, (CE_NOTE, + "instance->fast_path_io %d \n", instance->fast_path_io)); + + return (DDI_FAILURE); +} + +/* + * Marks HBA as bad. This will be called either when an + * IO packet times out even after 3 FW resets + * or FW is found to be fault even after 3 continuous resets. 
+ */ + +void +mrsas_tbolt_kill_adapter(struct mrsas_instance *instance) +{ + cmn_err(CE_WARN, "TBOLT Kill adapter called\n"); + + if (instance->deadadapter == 1) + return; + + con_log(CL_ANN1, (CE_NOTE, "tbolt_kill_adapter: " + "Writing to doorbell with MFI_STOP_ADP ")); + mutex_enter(&instance->ocr_flags_mtx); + instance->deadadapter = 1; + mutex_exit(&instance->ocr_flags_mtx); + instance->func_ptr->disable_intr(instance); + WR_RESERVED0_REGISTER(MFI_STOP_ADP, instance); + /* Flush */ + (void) RD_RESERVED0_REGISTER(instance); + + (void) mrsas_print_pending_cmds(instance); + (void) mrsas_complete_pending_cmds(instance); +} + +void +mrsas_reset_reply_desc(struct mrsas_instance *instance) +{ + int i; + MPI2_REPLY_DESCRIPTORS_UNION *reply_desc; + instance->reply_read_index = 0; + + /* initializing reply address to 0xFFFFFFFF */ + reply_desc = instance->reply_frame_pool; + + for (i = 0; i < instance->reply_q_depth; i++) { + reply_desc->Words = (uint64_t)~0; + reply_desc++; + } +} + +int +mrsas_tbolt_reset_ppc(struct mrsas_instance *instance) +{ + uint32_t status = 0x00; + uint32_t retry = 0; + uint32_t cur_abs_reg_val; + uint32_t fw_state; + uint32_t abs_state; + uint32_t i; + + con_log(CL_ANN, (CE_NOTE, + "mrsas_tbolt_reset_ppc entered\n ")); + + if (instance->deadadapter == 1) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "no more resets as HBA has been marked dead "); + return (DDI_FAILURE); + } + + mutex_enter(&instance->ocr_flags_mtx); + instance->adapterresetinprogress = 1; + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc:" + "adpterresetinprogress flag set, time %llx", gethrtime())); + mutex_exit(&instance->ocr_flags_mtx); + + instance->func_ptr->disable_intr(instance); + + /* Add delay inorder to complete the ioctl & io cmds in-flight */ + for (i = 0; i < 3000; i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + instance->reply_read_index = 0; + +retry_reset: + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + ":Resetting TBOLT ")); + + WR_TBOLT_IB_WRITE_SEQ(0xF, instance); + WR_TBOLT_IB_WRITE_SEQ(4, instance); + WR_TBOLT_IB_WRITE_SEQ(0xb, instance); + WR_TBOLT_IB_WRITE_SEQ(2, instance); + WR_TBOLT_IB_WRITE_SEQ(7, instance); + WR_TBOLT_IB_WRITE_SEQ(0xd, instance); + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc: magic number written " + "to write sequence register\n")); + delay(100 * drv_usectohz(MILLISEC)); + status = RD_TBOLT_HOST_DIAG(instance); + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc: READ HOSTDIAG SUCCESS " + "to write sequence register\n")); + + while (status & DIAG_TBOLT_RESET_ADAPTER) { + delay(100 * drv_usectohz(MILLISEC)); + status = RD_TBOLT_HOST_DIAG(instance); + if (retry++ == 100) { + cmn_err(CE_WARN, + "mrsas_tbolt_reset_ppc:" + "resetadapter bit is set already " + "check retry count %d\n", retry); + return (DDI_FAILURE); + } + } + + WR_TBOLT_HOST_DIAG(status | DIAG_TBOLT_RESET_ADAPTER, instance); + delay(100 * drv_usectohz(MILLISEC)); + + ddi_rep_get8((instance)->regmap_handle, (uint8_t *)&status, + (uint8_t *)((uintptr_t)(instance)->regmap + + RESET_TBOLT_STATUS_OFF), 4, DDI_DEV_AUTOINCR); + + while ((status & DIAG_TBOLT_RESET_ADAPTER)) { + delay(100 * drv_usectohz(MILLISEC)); + ddi_rep_get8((instance)->regmap_handle, (uint8_t *)&status, + (uint8_t *)((uintptr_t)(instance)->regmap + + RESET_TBOLT_STATUS_OFF), 4, DDI_DEV_AUTOINCR); + if (retry++ == 100) { + /* Dont call kill adapter here */ + /* RESET BIT ADAPTER is cleared by firmare */ + /* mrsas_tbolt_kill_adapter(instance); */ + cmn_err(CE_WARN, + "mr_sas %d: %s(): RESET 
FAILED; return failure!!!", + instance->instance, __func__); + return (DDI_FAILURE); + } + } + + con_log(CL_ANN, + (CE_NOTE, "mrsas_tbolt_reset_ppc: Adapter reset complete")); + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mfi_state_transition_to_ready")); + + abs_state = instance->func_ptr->read_fw_status_reg(instance); + retry = 0; + while ((abs_state <= MFI_STATE_FW_INIT) && (retry++ < 1000)) { + delay(100 * drv_usectohz(MILLISEC)); + abs_state = instance->func_ptr->read_fw_status_reg(instance); + } + if (abs_state <= MFI_STATE_FW_INIT) { + cmn_err(CE_WARN, + "mrsas_tbolt_reset_ppc: firmware state < MFI_STATE_FW_INIT" + "state = 0x%x, RETRY RESET.\n", abs_state); + goto retry_reset; + } + + /* Mark HBA as bad, if FW is fault after 3 continuous resets */ + if (mfi_state_transition_to_ready(instance) || + debug_tbolt_fw_faults_after_ocr_g == 1) { + cur_abs_reg_val = + instance->func_ptr->read_fw_status_reg(instance); + fw_state = cur_abs_reg_val & MFI_STATE_MASK; + + con_log(CL_ANN1, (CE_NOTE, + "mrsas_tbolt_reset_ppc :before fake: FW is not ready " + "FW state = 0x%x", fw_state)); + if (debug_tbolt_fw_faults_after_ocr_g == 1) + fw_state = MFI_STATE_FAULT; + + con_log(CL_ANN, + (CE_NOTE, "mrsas_tbolt_reset_ppc : FW is not ready " + "FW state = 0x%x", fw_state)); + + if (fw_state == MFI_STATE_FAULT) { + /* increment the count */ + instance->fw_fault_count_after_ocr++; + if (instance->fw_fault_count_after_ocr + < MAX_FW_RESET_COUNT) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "FW is in fault after OCR count %d " + "Retry Reset", + instance->fw_fault_count_after_ocr); + goto retry_reset; + + } else { + cmn_err(CE_WARN, "mrsas %d: %s:" + "Max Reset Count exceeded >%d" + "Mark HBA as bad, KILL adapter", + instance->instance, __func__, + MAX_FW_RESET_COUNT); + + mrsas_tbolt_kill_adapter(instance); + return (DDI_FAILURE); + } + } + } + + /* reset the counter as FW is up after OCR */ + instance->fw_fault_count_after_ocr = 0; + + mrsas_reset_reply_desc(instance); + + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_issue_init_mpi2")); + abs_state = mrsas_issue_init_mpi2(instance); + if (abs_state == (uint32_t)DDI_FAILURE) { + cmn_err(CE_WARN, "mrsas_tbolt_reset_ppc: " + "INIT failed Retrying Reset"); + goto retry_reset; + } + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "mrsas_issue_init_mpi2 Done")); + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_print_pending_cmd\n")); + (void) mrsas_print_pending_cmds(instance); + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "mrsas_print_pending_cmd done\n")); + + instance->func_ptr->enable_intr(instance); + instance->fw_outstanding = 0; + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling mrsas_issue_pending_cmds")); + (void) mrsas_issue_pending_cmds(instance); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "issue_pending_cmds done.\n")); + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "Calling aen registration")); + + instance->aen_cmd->retry_count_for_ocr = 0; + instance->aen_cmd->drv_pkt_time = 0; + + instance->func_ptr->issue_cmd(instance->aen_cmd, instance); + + con_log(CL_ANN1, (CE_NOTE, "Unsetting adpresetinprogress flag.\n")); + mutex_enter(&instance->ocr_flags_mtx); + instance->adapterresetinprogress = 0; + mutex_exit(&instance->ocr_flags_mtx); + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_reset_ppc: " + "adpterresetinprogress flag unset")); + + con_log(CL_ANN, (CE_NOTE, "mrsas_tbolt_reset_ppc done\n")); + return (DDI_SUCCESS); + 
+} + + +/* + * mrsas_sync_map_info - Returns FW's ld_map structure + * @instance: Adapter soft state + * + * Issues an internal command (DCMD) to get the FW's controller PD + * list structure. This information is mainly used to find out SYSTEM + * supported by the FW. + */ + +static int +mrsas_tbolt_sync_map_info(struct mrsas_instance *instance) +{ + int ret = 0, i; + struct mrsas_cmd *cmd = NULL; + struct mrsas_dcmd_frame *dcmd; + uint32_t size_sync_info, num_lds; + LD_TARGET_SYNC *ci = NULL; + MR_FW_RAID_MAP_ALL *map; + MR_LD_RAID *raid; + LD_TARGET_SYNC *ld_sync; + uint32_t ci_h = 0; + uint32_t size_map_info; + + cmd = get_raid_msg_pkt(instance); + + if (cmd == NULL) { + cmn_err(CE_WARN, "Failed to get a cmd from free-pool in " + "mrsas_tbolt_sync_map_info(). "); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + bzero(cmd->scsi_io_request, sizeof (Mpi2RaidSCSIIORequest_t)); + + + map = instance->ld_map[instance->map_id & 1]; + + num_lds = map->raidMap.ldCount; + + dcmd = &cmd->frame->dcmd; + + size_sync_info = sizeof (LD_TARGET_SYNC) * num_lds; + + con_log(CL_ANN, (CE_NOTE, "size_sync_info =0x%x ; ld count = 0x%x \n ", + size_sync_info, num_lds)); + + ci = (LD_TARGET_SYNC *)instance->ld_map[(instance->map_id - 1) & 1]; + + (void) memset(ci, 0, sizeof (MR_FW_RAID_MAP_ALL)); + ci_h = instance->ld_map_phy[(instance->map_id - 1) & 1]; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ld_sync = (LD_TARGET_SYNC *)ci; + + for (i = 0; i < num_lds; i++, ld_sync++) { + raid = MR_LdRaidGet(i, map); + + con_log(CL_ANN1, + (CE_NOTE, "i : 0x%x, Seq Num : 0x%x, Sync Reqd : 0x%x\n", + i, raid->seqNum, raid->flags.ldSyncRequired)); + + ld_sync->ldTargetId = MR_GetLDTgtId(i, map); + + con_log(CL_ANN1, (CE_NOTE, "i : 0x%x, tgt : 0x%x \n", + i, ld_sync->ldTargetId)); + + ld_sync->seqNum = raid->seqNum; + } + + + size_map_info = sizeof (MR_FW_RAID_MAP) + + (sizeof (MR_LD_SPAN_MAP) * (MAX_LOGICAL_DRIVES - 1)); + + dcmd->cmd = MFI_CMD_OP_DCMD; + dcmd->cmd_status = 0xFF; + dcmd->sge_count = 1; + dcmd->flags = MFI_FRAME_DIR_WRITE; + dcmd->timeout = 0; + dcmd->pad_0 = 0; + dcmd->data_xfer_len = size_map_info; + ASSERT(num_lds <= 255); + dcmd->mbox.b[0] = (U8)num_lds; + dcmd->mbox.b[1] = 1; /* Pend */ + dcmd->opcode = MR_DCMD_LD_MAP_GET_INFO; + dcmd->sgl.sge32[0].phys_addr = ci_h; + dcmd->sgl.sge32[0].length = size_map_info; + + + instance->map_update_cmd = cmd; + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + instance->func_ptr->issue_cmd(cmd, instance); + + instance->unroll.syncCmd = 1; + con_log(CL_ANN1, (CE_NOTE, "sync cmd issued. 
[SMID]:%x", cmd->SMID)); + + return (ret); +} + +/* + * abort_syncmap_cmd + */ +int +abort_syncmap_cmd(struct mrsas_instance *instance, + struct mrsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct mrsas_cmd *cmd; + struct mrsas_abort_frame *abort_fr; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt: abort_ldsync:%d", __LINE__)); + + cmd = get_raid_msg_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, + "Failed to get a cmd from free-pool abort_syncmap_cmd()."); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + cmd->frame_count = 1; + + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "abort_ldsync_cmd: issue_cmd_in_poll_mode failed")); + ret = -1; + } else { + ret = 0; + } + + return_raid_msg_mfi_pkt(instance, cmd); + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return (ret); +} + + +#ifdef PDSUPPORT +int +mrsas_tbolt_config_pd(struct mrsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval, dtype; + struct mrsas_tbolt_pd_info *pds = NULL; + + con_log(CL_ANN1, (CE_NOTE, "mrsas_tbolt_config_pd: t = %d l = %d", + tgt, lun)); + + if ((child = mrsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + if (instance->mr_tbolt_pd_list[tgt].flag != MRDRV_TGT_VALID) { + rval = mrsas_service_evt(instance, tgt, 1, + MRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "mr_sas:DELETING STALE ENTRY rval = %d " + "tgt id = %d ", rval, tgt)); + return (NDI_FAILURE); + } + return (NDI_SUCCESS); + } + + pds = (struct mrsas_tbolt_pd_info *) + kmem_zalloc(sizeof (struct mrsas_tbolt_pd_info), KM_SLEEP); + mrsas_tbolt_get_pd_info(instance, pds, tgt); + dtype = pds->scsiDevType; + + /* Check for Disk */ + if ((dtype == DTYPE_DIRECT)) { + if ((dtype == DTYPE_DIRECT) && + (LE_16(pds->fwState) != PD_SYSTEM)) { + kmem_free(pds, sizeof (struct mrsas_tbolt_pd_info)); + return (NDI_FAILURE); + } + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) { + rval = mrsas_config_scsi_device(instance, sd, ldip); + con_log(CL_DLEVEL1, (CE_NOTE, + "Phys. device found: tgt %d dtype %d: %s", + tgt, dtype, sd->sd_inq->inq_vid)); + } else { + rval = NDI_FAILURE; + con_log(CL_DLEVEL1, (CE_NOTE, "Phys. device Not found " + "scsi_hba_probe Failed: tgt %d dtype %d: %s", + tgt, dtype, sd->sd_inq->inq_vid)); + } + + /* sd_unprobe is blank now. 
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + kmem_free(sd, sizeof (struct scsi_device)); + rval = NDI_SUCCESS; + } else { + con_log(CL_ANN1, (CE_NOTE, + "Device not supported: tgt %d lun %d dtype %d", + tgt, lun, dtype)); + rval = NDI_FAILURE; + } + + kmem_free(pds, sizeof (struct mrsas_tbolt_pd_info)); + con_log(CL_ANN1, (CE_NOTE, "mrsas_config_pd: return rval = %d", + rval)); + return (rval); +} + +static void +mrsas_tbolt_get_pd_info(struct mrsas_instance *instance, + struct mrsas_tbolt_pd_info *pds, int tgt) +{ + struct mrsas_cmd *cmd; + struct mrsas_dcmd_frame *dcmd; + dma_obj_t dcmd_dma_obj; + + cmd = get_raid_msg_pkt(instance); + + if (!cmd) { + con_log(CL_ANN1, + (CE_WARN, "Failed to get a cmd for get pd info")); + return; + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union mrsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + + dcmd = &cmd->frame->dcmd; + dcmd_dma_obj.size = sizeof (struct mrsas_tbolt_pd_info); + dcmd_dma_obj.dma_attr = mrsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xffffffff; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xffffffff; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + (void) mrsas_alloc_dma_obj(instance, &dcmd_dma_obj, + DDI_STRUCTURE_LE_ACC); + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct mrsas_tbolt_pd_info)); + (void) memset(dcmd->mbox.b, 0, 12); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct mrsas_tbolt_pd_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + MR_DCMD_PD_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], tgt); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct mrsas_tbolt_pd_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = MRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->tbolt) { + mr_sas_tbolt_build_mfi_cmd(instance, cmd); + } + + instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd); + + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)pds, + (uint8_t *)dcmd_dma_obj.buffer, sizeof (struct mrsas_tbolt_pd_info), + DDI_DEV_AUTOINCR); + (void) mrsas_free_dma_obj(instance, dcmd_dma_obj); + return_raid_msg_pkt(instance, cmd); +} +#endif diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index 501bca39c8..a1edcc664c 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -26,6 +26,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ /* * Copyright 2011 cyril.galibern@opensvc.com @@ -3502,9 +3503,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. 
*/ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4441,18 +4446,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. + */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -22318,6 +22382,7 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p) case DKIOCINFO: case DKIOCGMEDIAINFO: case DKIOCGMEDIAINFOEXT: + case DKIOCSOLIDSTATE: case MHIOCENFAILFAST: case MHIOCSTATUS: case MHIOCTKOWN: @@ -22510,6 +22575,16 @@ skip_ready_valid: } break; + case DKIOCSOLIDSTATE: + SD_TRACE(SD_LOG_IOCTL, un, "DKIOCSOLIDSTATE\n"); + i = un->un_f_is_solid_state ? 1 : 0; + if (ddi_copyout(&i, (void *)arg, sizeof (int), flag) != 0) { + err = EFAULT; + } else { + err = 0; + } + break; + case DKIOCHOTPLUGGABLE: SD_TRACE(SD_LOG_IOCTL, un, "DKIOCHOTPLUGGABLE\n"); i = un->un_f_is_hotpluggable ? 
1 : 0; diff --git a/usr/src/uts/common/io/sdcard/impl/sda_mem.c b/usr/src/uts/common/io/sdcard/impl/sda_mem.c index 752a3b8a32..1b485cac24 100644 --- a/usr/src/uts/common/io/sdcard/impl/sda_mem.c +++ b/usr/src/uts/common/io/sdcard/impl/sda_mem.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ /* @@ -207,6 +208,7 @@ sda_mem_bd_mediainfo(void *arg, bd_media_t *media) media->m_nblks = slot->s_nblks; media->m_blksize = slot->s_blksz; media->m_readonly = slot->s_flags & SLOTF_WRITABLE ? B_FALSE : B_TRUE; + media->m_solidstate = B_TRUE; sda_slot_exit(slot); return (0); } diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index c70ff2b22b..065d7f2cbc 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -51,6 +52,7 @@ #include <sys/vlan.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> +#include <sys/mac_impl.h> #include <sys/mac_flow_impl.h> #include <inet/ip_impl.h> @@ -81,6 +83,7 @@ static int vnic_m_stat(void *, uint_t, uint64_t *); static void vnic_m_ioctl(void *, queue_t *, mblk_t *); static int vnic_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); +static int vnic_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); static void vnic_m_propinfo(void *, const char *, mac_prop_id_t, mac_prop_info_handle_t); static mblk_t *vnic_m_tx(void *, mblk_t *); @@ -100,7 +103,7 @@ static mod_hash_t *vnic_hash; #define VNIC_HASH_KEY(vnic_id) ((mod_hash_key_t)(uintptr_t)vnic_id) #define VNIC_M_CALLBACK_FLAGS \ - (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) static mac_callbacks_t vnic_m_callbacks = { VNIC_M_CALLBACK_FLAGS, @@ -117,7 +120,7 @@ static mac_callbacks_t vnic_m_callbacks = { NULL, NULL, vnic_m_setprop, - NULL, + vnic_m_getprop, vnic_m_propinfo }; @@ -849,17 +852,19 @@ static int vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, uint_t pr_valsize, const void *pr_val) { - int err = ENOTSUP; + int err = 0; vnic_t *vn = m_driver; - /* allow setting MTU only on an etherstub */ - if (vn->vn_link_id != DATALINK_INVALID_LINKID) - return (err); - switch (pr_num) { case MAC_PROP_MTU: { uint32_t mtu; + /* allow setting MTU only on an etherstub */ + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + err = ENOTSUP; + break; + } + if (pr_valsize < sizeof (mtu)) { err = EINVAL; break; @@ -872,12 +877,46 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = mac_maxsdu_update(vn->vn_mh, mtu); break; } + case MAC_PROP_VN_PROMISC_FILTERED: { + boolean_t filtered; + + if (pr_valsize < sizeof (filtered)) { + err = EINVAL; + break; + } + + bcopy(pr_val, &filtered, sizeof (filtered)); + mac_set_promisc_filtered(vn->vn_mch, filtered); + } default: + err = ENOTSUP; break; } return (err); } +static int +vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + vnic_t *vn = arg; + int ret = 0; + boolean_t out; + + switch (pr_num) { + case MAC_PROP_VN_PROMISC_FILTERED: + out = mac_get_promisc_filtered(vn->vn_mch); + ASSERT(pr_valsize >= sizeof (boolean_t)); + bcopy(&out, 
pr_val, sizeof (boolean_t)); + break; + default: + ret = EINVAL; + break; + } + + return (ret); +} + /* ARGSUSED */ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 0db01f80d7..c3d04e5508 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 451c9db48c..3f4dd63c82 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -23,6 +23,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -66,6 +67,7 @@ #include <sys/ddi_timer.h> #include <sys/random.h> #include <sys/modctl.h> +#include <sys/zone.h> /* * for NTP support @@ -1158,6 +1160,10 @@ loadavg_update() } while ((cpupart = cpupart->cp_next) != cp_list_head); + /* + * Third pass totals up per-zone statistics. + */ + zone_loadavg_update(); } /* diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index e097f355ec..7870617a26 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ #include <sys/timer.h> #include <sys/systm.h> @@ -112,6 +114,25 @@ clock_highres_timer_settime(itimer_t *it, int flags, cyctime.cyt_when = ts2hrt(&when->it_value); cyctime.cyt_interval = ts2hrt(&when->it_interval); + if (cyctime.cyt_when != 0 && cyctime.cyt_interval == 0 && + it->it_itime.it_interval.tv_sec == 0 && + it->it_itime.it_interval.tv_nsec == 0 && + (cyc = *cycp) != CYCLIC_NONE) { + /* + * If our existing timer is a one-shot and our new timer is a + * one-shot, we'll save ourselves a world of grief and just + * reprogram the cyclic. 
+ */ + it->it_itime = *when; + + if (!(flags & TIMER_ABSTIME)) + cyctime.cyt_when += gethrtime(); + + hrt2ts(cyctime.cyt_when, &it->it_itime.it_value); + (void) cyclic_reprogram(cyc, cyctime.cyt_when); + return (0); + } + mutex_enter(&cpu_lock); if ((cyc = *cycp) != CYCLIC_NONE) { cyclic_remove(cyc); @@ -162,17 +183,14 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_interval == 0) { /* - * If this is a one-shot, then we set the interval to assure - * that the cyclic will next fire INT64_MAX nanoseconds after - * boot (which corresponds to over 292 years -- yes, Buck Rogers - * may have his 292-year-uptime-Solaris box malfunction). If - * this timer is never touched, this cyclic will simply - * consume space in the cyclic subsystem. As soon as + * If this is a one-shot, then we set the interval to be + * infinite. If this timer is never touched, this cyclic will + * simply consume space in the cyclic subsystem. As soon as * timer_settime() or timer_delete() is called, the cyclic is * removed (so it's not possible to run the machine out * of resources by creating one-shots). */ - cyctime.cyt_interval = INT64_MAX - cyctime.cyt_when; + cyctime.cyt_interval = CY_INFINITY; } it->it_itime = *when; @@ -185,8 +203,6 @@ clock_highres_timer_settime(itimer_t *it, int flags, if (cyctime.cyt_when != 0) *cycp = cyc = cyclic_add(&hdlr, &cyctime); - else - *cycp = cyc = CYCLIC_NONE; /* * Now that we have the cyclic created, we need to bind it to our diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index a292f4e14f..ebaa6bfe41 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -497,7 +497,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) contract_t *parent = &p->p_ct_process->conp_contract; int inherit = 0; - ASSERT(p == curproc); + VERIFY(p == curproc); mutex_enter(&ct->ct_lock); @@ -547,7 +547,7 @@ contract_abandon(contract_t *ct, proc_t *p, int explicit) if (inherit) { ct->ct_state = CTS_INHERITED; - ASSERT(ct->ct_regent == parent); + VERIFY(ct->ct_regent == parent); contract_process_take(parent, ct); /* @@ -2063,8 +2063,8 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) { ct_kevent_t *e, *first = NULL; - ASSERT(q->ctq_listno == CTEL_CONTRACT); - ASSERT(newq->ctq_listno == CTEL_PBUNDLE); + VERIFY(q->ctq_listno == CTEL_CONTRACT); + VERIFY(newq->ctq_listno == CTEL_PBUNDLE); mutex_enter(&q->ctq_lock); mutex_enter(&newq->ctq_lock); @@ -2077,8 +2077,16 @@ cte_copy(ct_equeue_t *q, ct_equeue_t *newq) if ((e->cte_flags & (CTE_INFO | CTE_ACK)) == 0) { if (first == NULL) first = e; - list_insert_tail(&newq->ctq_events, e); - cte_hold(e); + /* + * It is possible for adoption to race with an owner's + * cte_publish_all(); we must only enqueue events that + * have not already been enqueued. + */ + if (!list_link_active((list_node_t *) + ((uintptr_t)e + newq->ctq_events.list_offset))) { + list_insert_tail(&newq->ctq_events, e); + cte_hold(e); + } } } @@ -2117,7 +2125,7 @@ cte_trim(ct_equeue_t *q, contract_t *ct) int flags, stopper; int start = 1; - ASSERT(MUTEX_HELD(&q->ctq_lock)); + VERIFY(MUTEX_HELD(&q->ctq_lock)); for (e = list_head(&q->ctq_events); e != NULL; e = next) { next = list_next(&q->ctq_events, e); @@ -2227,13 +2235,24 @@ cte_queue_drain(ct_equeue_t *q, int ack) * cte_publish_all.
*/ static void -cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) +cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp, boolean_t mayexist) { ASSERT(MUTEX_HELD(&q->ctq_lock)); q->ctq_atime = *tsp; /* + * If this event may already exist on this queue, check to see if it + * is already there and return if so. + */ + if (mayexist && list_link_active((list_node_t *)((uintptr_t)e + + q->ctq_events.list_offset))) { + mutex_exit(&q->ctq_lock); + cte_rele(e); + return; + } + + /* * Don't publish if the event is informative and there aren't * any listeners, or if the queue has been shut down. */ @@ -2247,6 +2266,8 @@ cte_publish(ct_equeue_t *q, ct_kevent_t *e, timespec_t *tsp) /* * Enqueue event */ + VERIFY(!list_link_active((list_node_t *) + ((uintptr_t)e + q->ctq_events.list_offset))); list_insert_tail(&q->ctq_events, e); /* @@ -2318,14 +2339,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) ct->ct_evcnt++; } mutex_exit(&ct->ct_lock); - cte_publish(&ct->ct_events, e, &ts); + cte_publish(&ct->ct_events, e, &ts, B_FALSE); /* * CTEL_BUNDLE - Next deliver to the contract type's bundle * queue. */ mutex_enter(&ct->ct_type->ct_type_events.ctq_lock); - cte_publish(&ct->ct_type->ct_type_events, e, &ts); + cte_publish(&ct->ct_type->ct_type_events, e, &ts, B_FALSE); /* * CTEL_PBUNDLE - Finally, if the contract has an owner, @@ -2342,7 +2363,14 @@ cte_publish_all(contract_t *ct, ct_kevent_t *e, nvlist_t *data, nvlist_t *gdata) q = ct->ct_owner->p_ct_equeue[ct->ct_type->ct_type_index]; mutex_enter(&q->ctq_lock); mutex_exit(&ct->ct_lock); - cte_publish(q, e, &ts); + + /* + * It is possible for this code to race with adoption; we + * publish the event indicating that the event may already + * be enqueued because adoption beat us to it (in which case + * cte_publish() does nothing). + */ + cte_publish(q, e, &ts, B_TRUE); } else { mutex_exit(&ct->ct_lock); cte_rele(e); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index 9e04f631a9..3b3935a772 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -534,6 +535,10 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) case 'z': s = p->p_zone->zone_name; break; + case 'Z': + /* This is zonepath + "/root/", except for GZ */ + s = p->p_zone->zone_rootpath; + break; case '%': (void) strcpy((s = buf), "%"); break; @@ -548,6 +553,9 @@ expand_string(const char *pat, char *fp, int size, cred_t *cr) if ((size -= len) <= 0) return (ENAMETOOLONG); (void) strcpy(fp, s); + /* strip trailing "/root/" from non-GZ zonepath string */ + if (c == 'Z' && len > 6) + len -= 6; fp += len; } diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 1ec63249ab..20e57efaad 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -724,6 +724,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ?
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/cyclic.c b/usr/src/uts/common/os/cyclic.c index 1bb6baf445..93a318d260 100644 --- a/usr/src/uts/common/os/cyclic.c +++ b/usr/src/uts/common/os/cyclic.c @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2012, Joyent Inc. All rights reserved. + */ + +/* * The Cyclic Subsystem * -------------------- * @@ -1139,7 +1143,7 @@ top: CYC_TRACE(cpu, level, "softint-top", cyclics, pc); while (consndx != pc->cypc_prodndx) { - int pend, npend, opend; + uint32_t pend, npend, opend; int consmasked = consndx & sizemask; cyclic_t *cyclic = &cyclics[buf[consmasked]]; cyc_func_t handler = cyclic->cy_handler; diff --git a/usr/src/uts/common/os/dtrace_subr.c b/usr/src/uts/common/os/dtrace_subr.c index f2a9ac1b7d..d2ce3361c1 100644 --- a/usr/src/uts/common/os/dtrace_subr.c +++ b/usr/src/uts/common/os/dtrace_subr.c @@ -44,6 +44,7 @@ void (*dtrace_helpers_fork)(proc_t *, proc_t *); void (*dtrace_cpustart_init)(void); void (*dtrace_cpustart_fini)(void); void (*dtrace_cpc_fire)(uint64_t); +void (*dtrace_closef)(void); void (*dtrace_debugger_init)(void); void (*dtrace_debugger_fini)(void); diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index b97a09454b..7c5b8323e3 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -388,10 +389,16 @@ proc_exit(int why, int what) if (p->p_pid == z->zone_proc_initpid) { if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && - zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN && - z->zone_restart_init == B_TRUE && - restart_init(what, why) == 0) - return (0); + zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } else { + (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, + CRED()); + } + } + /* * Since we didn't or couldn't restart init, we clear * the zone's init state and proceed with exit diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index a014d25c0f..3b47e05ef2 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -54,6 +55,7 @@ #include <sys/poll.h> #include <sys/rctl.h> #include <sys/port_impl.h> +#include <sys/dtrace.h> #include <c2/audit.h> #include <sys/nbmlock.h> @@ -952,6 +954,18 @@ closef(file_t *fp) ASSERT(fp->f_count == 0); mutex_exit(&fp->f_tlock); + /* + * If DTrace has getf() subroutines active, it will set dtrace_closef + * to point to code that implements a barrier with respect to probe + * context. This must be called before the file_t is freed (and the + * vnode that it refers to is released) -- but it must be after the + * file_t has been removed from the uf_entry_t. That is, there must + * be no way for a racing getf() in probe context to yield the fp that + * we're operating upon. 
+ */ + if (dtrace_closef != NULL) + (*dtrace_closef)(); + VN_RELE(vp); /* * deallocate resources to audit_data diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 83b817e866..a5f5a6f3c2 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* @@ -160,6 +161,7 @@ struct { kstat_named_t avenrun_5min; kstat_named_t avenrun_15min; kstat_named_t boot_time; + kstat_named_t nsec_per_tick; } system_misc_kstat = { { "ncpus", KSTAT_DATA_UINT32 }, { "lbolt", KSTAT_DATA_UINT32 }, @@ -171,6 +173,7 @@ struct { { "avenrun_5min", KSTAT_DATA_UINT32 }, { "avenrun_15min", KSTAT_DATA_UINT32 }, { "boot_time", KSTAT_DATA_UINT32 }, + { "nsec_per_tick", KSTAT_DATA_UINT32 }, }; struct { @@ -803,7 +806,6 @@ system_misc_kstat_update(kstat_t *ksp, int rw) { int myncpus = ncpus; int *loadavgp = &avenrun[0]; - int loadavg[LOADAVG_NSTATS]; time_t zone_boot_time; clock_t zone_lbolt; hrtime_t zone_hrtime; @@ -820,17 +822,11 @@ system_misc_kstat_update(kstat_t *ksp, int rw) */ mutex_enter(&cpu_lock); if (pool_pset_enabled()) { - psetid_t mypsid = zone_pset_get(curproc->p_zone); - int error; - myncpus = zone_ncpus_get(curproc->p_zone); ASSERT(myncpus > 0); - error = cpupart_get_loadavg(mypsid, &loadavg[0], - LOADAVG_NSTATS); - ASSERT(error == 0); - loadavgp = &loadavg[0]; } mutex_exit(&cpu_lock); + loadavgp = &curproc->p_zone->zone_avenrun[0]; } if (INGLOBALZONE(curproc)) { @@ -838,9 +834,7 @@ system_misc_kstat_update(kstat_t *ksp, int rw) zone_lbolt = ddi_get_lbolt(); zone_nproc = nproc; } else { - struct timeval tvp; - hrt2tv(curproc->p_zone->zone_zsched->p_mstart, &tvp); - zone_boot_time = tvp.tv_sec; + zone_boot_time = curproc->p_zone->zone_boot_time; zone_hrtime = gethrtime(); zone_lbolt = (clock_t)(NSEC_TO_TICK(zone_hrtime) - @@ -861,6 +855,8 @@ system_misc_kstat_update(kstat_t *ksp, int rw) system_misc_kstat.avenrun_15min.value.ui32 = (uint32_t)loadavgp[2]; system_misc_kstat.boot_time.value.ui32 = (uint32_t) zone_boot_time; + system_misc_kstat.nsec_per_tick.value.ui32 = (uint32_t) + nsec_per_tick; return (0); } diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index f5cebbf82e..63a89a2ce8 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -248,8 +248,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2012, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index df975eb7ee..30e50cce72 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -33,6 +34,7 @@ #include <sys/debug.h> #include <sys/msacct.h> #include <sys/time.h> +#include <sys/zone.h> /* * Mega-theory block comment: @@ -390,6 +392,7 @@ void syscall_mstate(int fromms, int toms) { kthread_t *t = curthread; + zone_t *z = ttozone(t); struct mstate *ms; hrtime_t *mstimep; hrtime_t curtime; @@ -413,6 +416,10 @@ syscall_mstate(int fromms, int toms) newtime = curtime - ms->ms_state_start; } *mstimep += newtime; + if (fromms == LMS_USER) + atomic_add_64(&z->zone_utime, newtime); + else if (fromms == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, newtime); t->t_mstate = toms; ms->ms_state_start = curtime; ms->ms_prev = fromms; @@ -560,27 +567,18 @@ cpu_update_pct(kthread_t *t, hrtime_t newtime) */ do { - if (T_ONPROC(t) && t->t_waitrq == 0) { - hrlb = t->t_hrtime; + pctcpu = t->t_pctcpu; + hrlb = t->t_hrtime; + delta = newtime - hrlb; + if (delta < 0) { + newtime = gethrtime_unscaled(); delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; + } + t->t_hrtime = newtime; + scalehrtime(&delta); + if (T_ONPROC(t) && t->t_waitrq == 0) { npctcpu = cpu_grow(pctcpu, delta); } else { - hrlb = t->t_hrtime; - delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; npctcpu = cpu_decay(pctcpu, delta); } } while (cas32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu); @@ -602,7 +600,10 @@ new_mstate(kthread_t *t, int new_state) hrtime_t curtime; hrtime_t newtime; hrtime_t oldtime; + hrtime_t ztime; + hrtime_t origstart; klwp_t *lwp; + zone_t *z; ASSERT(new_state != LMS_WAIT_CPU); ASSERT((unsigned)new_state < NMSTATES); @@ -625,6 +626,7 @@ new_mstate(kthread_t *t, int new_state) ms = &lwp->lwp_mstate; state = t->t_mstate; + origstart = ms->ms_state_start; do { switch (state) { case LMS_TFAULT: @@ -637,7 +639,7 @@ new_mstate(kthread_t *t, int new_state) mstimep = &ms->ms_acct[state]; break; } - newtime = curtime - ms->ms_state_start; + ztime = newtime = curtime - ms->ms_state_start; if (newtime < 0) { curtime = gethrtime_unscaled(); oldtime = *mstimep - 1; /* force CAS to fail */ @@ -648,6 +650,20 @@ new_mstate(kthread_t *t, int new_state) t->t_mstate = new_state; ms->ms_state_start = curtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + + /* + * When the system boots the initial startup thread will have a + * ms_state_start of 0 which would add a huge system time to the global + * zone. We want to skip aggregating that initial bit of work. + */ + if (origstart != 0) { + z = ttozone(t); + if (state == LMS_USER) + atomic_add_64(&z->zone_utime, ztime); + else if (state == LMS_SYSTEM) + atomic_add_64(&z->zone_stime, ztime); + } + /* * Remember the previous running microstate. */ @@ -686,6 +702,8 @@ restore_mstate(kthread_t *t) hrtime_t waitrq; hrtime_t newtime; hrtime_t oldtime; + hrtime_t waittime; + zone_t *z; /* * Don't call restore mstate of threads without lwps. (Kernel threads) @@ -756,11 +774,15 @@ restore_mstate(kthread_t *t) oldtime = *mstimep; newtime += oldtime; } while (cas64((uint64_t *)mstimep, oldtime, newtime) != oldtime); + /* * Update the WAIT_CPU timer and per-cpu waitrq total. 
*/ - ms->ms_acct[LMS_WAIT_CPU] += (curtime - waitrq); - CPU->cpu_waitrq += (curtime - waitrq); + z = ttozone(t); + waittime = curtime - waitrq; + ms->ms_acct[LMS_WAIT_CPU] += waittime; + atomic_add_64(&z->zone_wtime, waittime); + CPU->cpu_waitrq += waittime; ms->ms_state_start = curtime; } diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 573ebbc367..d8f7882723 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -2563,3 +2564,12 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} + diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index a5a918b326..53617bd0fe 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -176,6 +176,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 6946a35a38..1b222538b3 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1619,7 +1619,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 79ccd94ae4..f308b45260 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2012, Joyent Inc. All rights reserved. */ /* @@ -369,21 +370,18 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; rctl_hndl_t rc_zone_shmmni; rctl_hndl_t rc_zone_semmni; rctl_hndl_t rc_zone_msgmni; -/* - * Synchronization primitives used to synchronize between mounts and zone - * creation/destruction. - */ -static int mounts_in_progress; -static kcondvar_t mount_cv; -static kmutex_t mount_lock; const char * const zone_default_initname = "/sbin/init"; static char * const zone_prefix = "/zone/"; @@ -423,23 +421,27 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. 
*/ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone * the mount is being placed in. Because of this, we need to be able to - * ensure that a zone isn't in the process of being created such that - * nfs_mount() thinks it is in the global zone, while by the time it - * gets added the list of mounted zones, it ends up on zoneA's mount - * list. + * ensure that a zone isn't in the process of being created/destroyed such + * that nfs_mount() thinks it is in the global/NGZ zone, while by the time + * it gets added the list of mounted zones, it ends up on the wrong zone's + * mount list. Since a zone can't reside on an NFS file system, we don't + * have to worry about the zonepath itself. * * The following functions: block_mounts()/resume_mounts() and * mount_in_progress()/mount_completed() are used by zones and the VFS - * layer (respectively) to synchronize zone creation and new mounts. + * layer (respectively) to synchronize zone state transitions and new + * mounts within a zone. This synchronization is on a per-zone basis, so + * activity for one zone will not interfere with activity for another zone. * * The semantics are like a reader-reader lock such that there may - * either be multiple mounts (or zone creations, if that weren't + * either be multiple mounts (or zone state transitions, if that weren't * serialized by zonehash_lock) in progress at the same time, but not * both. * @@ -447,10 +449,8 @@ static const int ZONE_SYSCALL_API_VERSION = 6; * taking too long. * * The semantics are such that there is unfair bias towards the - * "current" operation. This means that zone creations may starve if - * there is a rapid succession of new mounts coming in to the system, or - * there is a remote possibility that zones will be created at such a - * rate that new mounts will not be able to proceed. + * "current" operation. This means that zone halt may starve if + * there is a rapid succession of new mounts coming in to the zone. */ /* * Prevent new mounts from progressing to the point of calling @@ -458,7 +458,7 @@ static const int ZONE_SYSCALL_API_VERSION = 6; * them to complete. */ static int -block_mounts(void) +block_mounts(zone_t *zp) { int retval = 0; @@ -467,19 +467,21 @@ block_mounts(void) * called with zonehash_lock held. */ ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); - mutex_enter(&mount_lock); - while (mounts_in_progress > 0) { - if (cv_wait_sig(&mount_cv, &mount_lock) == 0) + mutex_enter(&zp->zone_mount_lock); + while (zp->zone_mounts_in_progress > 0) { + if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0) goto signaled; } /* * A negative value of mounts_in_progress indicates that mounts - * have been blocked by (-mounts_in_progress) different callers. + * have been blocked by (-mounts_in_progress) different callers + * (remotely possible if two threads enter zone_shutdown at the same + * time). */ - mounts_in_progress--; + zp->zone_mounts_in_progress--; retval = 1; signaled: - mutex_exit(&mount_lock); + mutex_exit(&zp->zone_mount_lock); return (retval); } @@ -488,26 +490,26 @@ signaled: * Allow them to progress if we were the last obstacle.
*/ static void -resume_mounts(void) +resume_mounts(zone_t *zp) { - mutex_enter(&mount_lock); - if (++mounts_in_progress == 0) - cv_broadcast(&mount_cv); - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + if (++zp->zone_mounts_in_progress == 0) + cv_broadcast(&zp->zone_mount_cv); + mutex_exit(&zp->zone_mount_lock); } /* - * The VFS layer is busy with a mount; zones should wait until all - * mounts are completed to progress. + * The VFS layer is busy with a mount; this zone should wait until all + * of its mounts are completed to progress. */ void -mount_in_progress(void) +mount_in_progress(zone_t *zp) { - mutex_enter(&mount_lock); - while (mounts_in_progress < 0) - cv_wait(&mount_cv, &mount_lock); - mounts_in_progress++; - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + while (zp->zone_mounts_in_progress < 0) + cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock); + zp->zone_mounts_in_progress++; + mutex_exit(&zp->zone_mount_lock); } /* @@ -515,12 +517,12 @@ mount_in_progress(void) * callers if this is the last mount. */ void -mount_completed(void) +mount_completed(zone_t *zp) { - mutex_enter(&mount_lock); - if (--mounts_in_progress == 0) - cv_broadcast(&mount_cv); - mutex_exit(&mount_lock); + mutex_enter(&zp->zone_mount_lock); + if (--zp->zone_mounts_in_progress == 0) + cv_broadcast(&zp->zone_mount_cv); + mutex_exit(&zp->zone_mount_lock); } /* @@ -1380,6 +1382,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1674,6 +1784,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1767,6 +1910,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1795,7 +1952,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1820,26 +1977,337 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + 
kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_misc_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_misc_kstat_t *zmp = ksp->ks_data; + hrtime_t tmp; + + if (rw == KSTAT_WRITE) + return (EACCES); + + tmp = zone->zone_utime; + scalehrtime(&tmp); + zmp->zm_utime.value.ui64 = tmp; + tmp = zone->zone_stime; + scalehrtime(&tmp); + zmp->zm_stime.value.ui64 = tmp; + tmp = zone->zone_wtime; + scalehrtime(&tmp); + zmp->zm_wtime.value.ui64 = tmp; + + zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0]; + zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1]; + zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2]; + + return (0); +} + +static kstat_t * +zone_misc_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_misc_kstat_t *zmp; + + if ((ksp = kstat_create_zone("zones", zone->zone_id, + zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED, + sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_misc_lock; + zone->zone_misc_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min", + KSTAT_DATA_UINT32); + + ksp->ks_update = zone_misc_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", 
zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { + zone->zone_misc_stats = kmem_zalloc( + sizeof (zone_misc_kstat_t), KM_SLEEP); + } } static void -zone_kstat_delete_common(kstat_t **pkstat) +zone_kstat_delete_common(kstat_t **pkstat, size_t datasz) { void *data; if (*pkstat != NULL) { data = (*pkstat)->ks_data; kstat_delete(*pkstat); - kmem_free(data, sizeof (zone_kstat_t)); + kmem_free(data, datasz); *pkstat = NULL; } } @@ -1847,9 +2315,23 @@ zone_kstat_delete_common(kstat_t **pkstat) static void zone_kstat_delete(zone_t *zone) { - zone_kstat_delete_common(&zone->zone_lockedmem_kstat); - zone_kstat_delete_common(&zone->zone_swapresv_kstat); - zone_kstat_delete_common(&zone->zone_nprocs_kstat); + zone_kstat_delete_common(&zone->zone_lockedmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_swapresv_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_nprocs_kstat, + sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); + zone_kstat_delete_common(&zone->zone_misc_ksp, + sizeof (zone_misc_kstat_t)); } /* @@ -1883,6 +2365,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1906,7 +2390,13 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone0.zone_zfs_io_pri = 1; + zone0.zone_stime = 0; + zone0.zone_utime = 0; + zone0.zone_wtime = 0; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2013,6 +2503,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 1024, 1024, 
&zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2054,6 +2559,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2064,6 +2583,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2375,14 +2899,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/ static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land.
+ */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; + + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2794,6 +3369,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -2977,6 +3558,92 @@ zone_find_by_path(const char *path) } /* + * Public interface for updating per-zone load averages. Called once per + * second. + * + * Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c. + */ +void +zone_loadavg_update() +{ + zone_t *zp; + zone_status_t status; + struct loadavg_s *lavg; + hrtime_t zone_total; + int i; + hrtime_t hr_avg; + int nrun; + static int64_t f[3] = { 135, 27, 9 }; + int64_t q, r; + + mutex_enter(&zonehash_lock); + for (zp = list_head(&zone_active); zp != NULL; + zp = list_next(&zone_active, zp)) { + mutex_enter(&zp->zone_lock); + + /* Skip zones that are on the way down or not yet up */ + status = zone_status_get(zp); + if (status < ZONE_IS_READY || status >= ZONE_IS_DOWN) { + /* For all practical purposes the zone doesn't exist. */ + mutex_exit(&zp->zone_lock); + continue; + } + + /* + * Update the 10 second moving average data in zone_loadavg. + */ + lavg = &zp->zone_loadavg; + + zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime; + scalehrtime(&zone_total); + + /* The zone_total should always be increasing. */ + lavg->lg_loads[lavg->lg_cur] = (zone_total > lavg->lg_total) ? + zone_total - lavg->lg_total : 0; + lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ; + /* lg_total holds the prev. 1 sec. total */ + lavg->lg_total = zone_total; + + /* + * To simplify the calculation, we don't calculate the load avg. + * until the zone has been up for at least 10 seconds and our + * moving average is thus full. + */ + if ((lavg->lg_len + 1) < S_LOADAVG_SZ) { + lavg->lg_len++; + mutex_exit(&zp->zone_lock); + continue; + } + + /* Now calculate the 1min, 5min, 15 min load avg. */ + hr_avg = 0; + for (i = 0; i < S_LOADAVG_SZ; i++) + hr_avg += lavg->lg_loads[i]; + hr_avg = hr_avg / S_LOADAVG_SZ; + nrun = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX); + + /* Compute load avg. See comment in calcloadavg() */ + for (i = 0; i < 3; i++) { + q = (zp->zone_hp_avenrun[i] >> 16) << 7; + r = (zp->zone_hp_avenrun[i] & 0xffff) << 7; + zp->zone_hp_avenrun[i] += + ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4; + + /* avenrun[] can only hold 31 bits of load avg. */ + if (zp->zone_hp_avenrun[i] < + ((uint64_t)1<<(31+16-FSHIFT))) + zp->zone_avenrun[i] = (int32_t) + (zp->zone_hp_avenrun[i] >> (16 - FSHIFT)); + else + zp->zone_avenrun[i] = 0x7fffffff; + } + + mutex_exit(&zp->zone_lock); + } + mutex_exit(&zonehash_lock); +} + +/* * Get the number of cpus visible to this zone. The system-wide global * 'ncpus' is returned if pools are disabled, the caller is in the * global zone, or a NULL zone argument is passed in. 
@@ -3789,7 +4456,10 @@ zsched(void *arg) mutex_enter(&zone_status_lock); zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); mutex_exit(&zone_status_lock); + } else { + zone->zone_boot_time = gethrestime_sec(); } + pool_unlock(); } @@ -4081,7 +4751,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4104,6 +4774,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4172,10 +4843,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4229,7 +4904,7 @@ zone_create(const char *zone_name, const char *zone_root, return (zone_create_error(error, 0, extended_error)); } - if (block_mounts() == 0) { + if (block_mounts(zone) == 0) { mutex_enter(&pp->p_lock); if (curthread != pp->p_agenttp) continuelwps(pp); @@ -4380,7 +5055,7 @@ zone_create(const char *zone_name, const char *zone_root, /* * The zone is fully visible, so we can let mounts progress. */ - resume_mounts(); + resume_mounts(zone); if (rctls) nvlist_free(rctls); @@ -4396,7 +5071,7 @@ errout: continuelwps(pp); mutex_exit(&pp->p_lock); - resume_mounts(); + resume_mounts(zone); if (rctls) nvlist_free(rctls); /* @@ -4474,6 +5149,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4484,7 +5160,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -4551,15 +5236,6 @@ zone_shutdown(zoneid_t zoneid) if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) return (set_errno(EINVAL)); - /* - * Block mounts so that VFS_MOUNT() can get an accurate view of - * the zone's status with regards to ZONE_IS_SHUTTING down. - * - * e.g. NFS can fail the mount if it determines that the zone - * has already begun the shutdown sequence. - */ - if (block_mounts() == 0) - return (set_errno(EINTR)); mutex_enter(&zonehash_lock); /* * Look for zone under hash lock to prevent races with other @@ -4567,9 +5243,30 @@ zone_shutdown(zoneid_t zoneid) */ if ((zone = zone_find_all_by_id(zoneid)) == NULL) { mutex_exit(&zonehash_lock); - resume_mounts(); return (set_errno(EINVAL)); } + + /* + * We have to drop zonehash_lock before calling block_mounts. + * Hold the zone so we can continue to use the zone_t. 
+ */ + zone_hold(zone); + mutex_exit(&zonehash_lock); + + /* + * Block mounts so that VFS_MOUNT() can get an accurate view of + * the zone's status with regards to ZONE_IS_SHUTTING down. + * + * e.g. NFS can fail the mount if it determines that the zone + * has already begun the shutdown sequence. + * + */ + if (block_mounts(zone) == 0) { + zone_rele(zone); + return (set_errno(EINTR)); + } + + mutex_enter(&zonehash_lock); mutex_enter(&zone_status_lock); status = zone_status_get(zone); /* @@ -4578,7 +5275,8 @@ zone_shutdown(zoneid_t zoneid) if (status < ZONE_IS_READY) { mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); + zone_rele(zone); return (set_errno(EINVAL)); } /* @@ -4588,7 +5286,8 @@ zone_shutdown(zoneid_t zoneid) if (status >= ZONE_IS_DOWN) { mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); + zone_rele(zone); return (0); } /* @@ -4623,10 +5322,9 @@ zone_shutdown(zoneid_t zoneid) } } } - zone_hold(zone); /* so we can use the zone_t later */ mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); - resume_mounts(); + resume_mounts(zone); if (error = zone_empty(zone)) { zone_rele(zone); @@ -5222,14 +5920,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5284,6 +5974,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5315,10 +6013,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5335,7 +6034,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5344,6 +6045,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_INITNAME: err = zone_set_initname(zone, (const char *)buf); break; + case ZONE_ATTR_INITNORESTART: + zone->zone_restart_init = B_FALSE; + err = 0; + break; case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; @@ -5353,8 +6058,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -6075,6 +6789,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6085,7 +6800,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6363,7 +7078,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 271682bc67..50846d0cb3 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -20,6 +20,7 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2012, Joyent, Inc. All rights reserved. # include $(SRC)/uts/Makefile.uts @@ -277,6 +278,7 @@ CHKHDRS= \ ipc.h \ ipc_impl.h \ ipc_rctl.h \ + ipd.h \ ipmi.h \ isa_defs.h \ iscsi_authclient.h \ @@ -856,6 +858,8 @@ FSHDRS= \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 8363d231cf..e0cfd6f778 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. 
*/ #ifndef _SYS_AGGR_IMPL_H @@ -307,6 +308,8 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); extern void aggr_tx_ring_update(void *, uintptr_t); extern void aggr_tx_notify_thread(void *); diff --git a/usr/src/uts/common/sys/blkdev.h b/usr/src/uts/common/sys/blkdev.h index 2307610bae..4ec50fbf3b 100644 --- a/usr/src/uts/common/sys/blkdev.h +++ b/usr/src/uts/common/sys/blkdev.h @@ -19,6 +19,7 @@ * CDDL HEADER END */ /* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -116,6 +117,7 @@ struct bd_media { uint64_t m_nblks; uint32_t m_blksize; boolean_t m_readonly; + boolean_t m_solidstate; }; #define BD_INFO_FLAG_REMOVABLE (1U << 0) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index a9191aed7c..cb8a6012fc 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,6 +187,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -198,6 +200,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. 
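
[Editor's note] The cpucaps.h hunk above only declares the new zone-level entry points for the burstable cap (cpucaps_zone_set_base(), cpucaps_zone_set_burst_time() and their get counterparts); the resource-control plumbing that would call them is not part of this header change. Purely as an illustrative sketch, assuming a hypothetical zone.cpu-baseline resource control and a made-up callback name, a "set" handler modeled on the existing zone_cpu_cap_set()/cpucaps_zone_set() pairing in zone.c could hand the value through as follows.

/*
 * Illustrative sketch only -- not part of this patch.  A zone rctl "set"
 * callback, modeled on zone_cpu_cap_set() in zone.c, that would pass a
 * hypothetical "zone.cpu-baseline" value to the cpucaps_zone_set_base()
 * entry point declared above.  The callback name and the rctl it serves
 * are assumptions made for the example.
 */
/*ARGSUSED*/
static int
zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);

	if (e->rcep_p.zone == NULL)
		return (0);

	/* Record the baseline (non-burst) CPU allowance for this zone. */
	return (cpucaps_zone_set_base(e->rcep_p.zone, nv));
}

A burst-time control would be wired to cpucaps_zone_set_burst_time() in the same way.
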
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index 5056f9a511..914f132dc0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/dkio.h b/usr/src/uts/common/sys/dkio.h index eb4ddf34fe..a5b0c312f9 100644 --- a/usr/src/uts/common/sys/dkio.h +++ b/usr/src/uts/common/sys/dkio.h @@ -23,6 +23,7 @@ * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. */ #ifndef _SYS_DKIO_H @@ -237,6 +238,9 @@ struct dk_callback { #define DKIOCSETEXTPART (DKIOC|46) #endif +/* ioctl to report whether the disk is solid state or not - used for ZFS */ +#define DKIOCSOLIDSTATE (DKIOC|38) + /* * Ioctl to force driver to re-read the alternate partition and rebuild * the internal defect map. diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index fb2a0749d3..303a9c7e45 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_DLD_H @@ -191,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -203,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..adcfe76c08 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLS_H @@ -110,7 +111,7 @@ extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -127,7 +128,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..8f7af6856c 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLS_IMPL_H @@ -96,7 +97,8 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..4f73d92118 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #ifndef _DLS_MGMT_H @@ -165,6 +166,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { diff --git a/usr/src/uts/common/sys/dtrace.h b/usr/src/uts/common/sys/dtrace.h index fd7612f88a..e6d9e0e675 100644 --- a/usr/src/uts/common/sys/dtrace.h +++ b/usr/src/uts/common/sys/dtrace.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ @@ -36,16 +36,16 @@ extern "C" { #endif -/* - * DTrace Dynamic Tracing Software: Kernel Interfaces - * - * Note: The contents of this file are private to the implementation of the - * Solaris system and DTrace subsystem and are subject to change at any time - * without notice. Applications and drivers using these interfaces will fail - * to run on future releases. These interfaces should not be used for any - * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). - * Please refer to the "Solaris Dynamic Tracing Guide" for more information. - */ + /* + * DTrace Dynamic Tracing Software: Kernel Interfaces + * + * Note: The contents of this file are private to the implementation of the + * Solaris system and DTrace subsystem and are subject to change at any time + * without notice. Applications and drivers using these interfaces will fail + * to run on future releases. These interfaces should not be used for any + * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). + * Please refer to the "Solaris Dynamic Tracing Guide" for more information. + */ #ifndef _ASM @@ -57,9 +57,9 @@ extern "C" { #include <sys/cyclic.h> #include <sys/int_limits.h> -/* - * DTrace Universal Constants and Typedefs - */ + /* + * DTrace Universal Constants and Typedefs + */ #define DTRACE_CPUALL -1 /* all CPUs */ #define DTRACE_IDNONE 0 /* invalid probe identifier */ #define DTRACE_EPIDNONE 0 /* invalid enabled probe identifier */ @@ -75,35 +75,35 @@ extern "C" { #define DTRACE_FUNCNAMELEN 128 #define DTRACE_NAMELEN 64 #define DTRACE_FULLNAMELEN (DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \ - DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4) + DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4) #define DTRACE_ARGTYPELEN 128 -typedef uint32_t dtrace_id_t; /* probe identifier */ -typedef uint32_t dtrace_epid_t; /* enabled probe identifier */ -typedef uint32_t dtrace_aggid_t; /* aggregation identifier */ -typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */ -typedef uint16_t dtrace_actkind_t; /* action kind */ -typedef int64_t dtrace_optval_t; /* option value */ -typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */ - -typedef enum dtrace_probespec { - DTRACE_PROBESPEC_NONE = -1, - DTRACE_PROBESPEC_PROVIDER = 0, - DTRACE_PROBESPEC_MOD, - DTRACE_PROBESPEC_FUNC, - DTRACE_PROBESPEC_NAME -} dtrace_probespec_t; - -/* - * DTrace Intermediate Format (DIF) - * - * The following definitions describe the DTrace Intermediate Format (DIF), a - * a RISC-like instruction set and program encoding used to represent - * predicates and actions that can be bound to DTrace probes. The constants - * below defining the number of available registers are suggested minimums; the - * compiler should use DTRACEIOC_CONF to dynamically obtain the number of - * registers provided by the current DTrace implementation. 
- */ + typedef uint32_t dtrace_id_t; /* probe identifier */ + typedef uint32_t dtrace_epid_t; /* enabled probe identifier */ + typedef uint32_t dtrace_aggid_t; /* aggregation identifier */ + typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */ + typedef uint16_t dtrace_actkind_t; /* action kind */ + typedef int64_t dtrace_optval_t; /* option value */ + typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */ + + typedef enum dtrace_probespec { + DTRACE_PROBESPEC_NONE = -1, + DTRACE_PROBESPEC_PROVIDER = 0, + DTRACE_PROBESPEC_MOD, + DTRACE_PROBESPEC_FUNC, + DTRACE_PROBESPEC_NAME + } dtrace_probespec_t; + + /* + * DTrace Intermediate Format (DIF) + * + * The following definitions describe the DTrace Intermediate Format (DIF), a + * a RISC-like instruction set and program encoding used to represent + * predicates and actions that can be bound to DTrace probes. The constants + * below defining the number of available registers are suggested minimums; the + * compiler should use DTRACEIOC_CONF to dynamically obtain the number of + * registers provided by the current DTrace implementation. + */ #define DIF_VERSION_1 1 /* DIF version 1: Solaris 10 Beta */ #define DIF_VERSION_2 2 /* DIF version 2: Solaris 10 FCS */ #define DIF_VERSION DIF_VERSION_2 /* latest DIF instruction set version */ @@ -288,10 +288,11 @@ typedef enum dtrace_probespec { #define DIF_SUBR_INET_NTOA6 43 #define DIF_SUBR_TOUPPER 44 #define DIF_SUBR_TOLOWER 45 +#define DIF_SUBR_GETF 46 -#define DIF_SUBR_MAX 45 /* max subroutine value */ +#define DIF_SUBR_MAX 46 /* max subroutine value */ -typedef uint32_t dif_instr_t; + typedef uint32_t dif_instr_t; #define DIF_INSTR_OP(i) (((i) >> 24) & 0xff) #define DIF_INSTR_R1(i) (((i) >> 16) & 0xff) @@ -333,39 +334,39 @@ typedef uint32_t dif_instr_t; #define DIF_REG_R0 0 /* %r0 is always set to zero */ -/* - * A DTrace Intermediate Format Type (DIF Type) is used to represent the types - * of variables, function and associative array arguments, and the return type - * for each DIF object (shown below). It contains a description of the type, - * its size in bytes, and a module identifier. - */ -typedef struct dtrace_diftype { - uint8_t dtdt_kind; /* type kind (see below) */ - uint8_t dtdt_ckind; /* type kind in CTF */ - uint8_t dtdt_flags; /* type flags (see below) */ - uint8_t dtdt_pad; /* reserved for future use */ - uint32_t dtdt_size; /* type size in bytes (unless string) */ -} dtrace_diftype_t; + /* + * A DTrace Intermediate Format Type (DIF Type) is used to represent the types + * of variables, function and associative array arguments, and the return type + * for each DIF object (shown below). It contains a description of the type, + * its size in bytes, and a module identifier. + */ + typedef struct dtrace_diftype { + uint8_t dtdt_kind; /* type kind (see below) */ + uint8_t dtdt_ckind; /* type kind in CTF */ + uint8_t dtdt_flags; /* type flags (see below) */ + uint8_t dtdt_pad; /* reserved for future use */ + uint32_t dtdt_size; /* type size in bytes (unless string) */ + } dtrace_diftype_t; #define DIF_TYPE_CTF 0 /* type is a CTF type */ #define DIF_TYPE_STRING 1 /* type is a D string */ #define DIF_TF_BYREF 0x1 /* type is passed by reference */ -/* - * A DTrace Intermediate Format variable record is used to describe each of the - * variables referenced by a given DIF object. It contains an integer variable - * identifier along with variable scope and properties, as shown below. The - * size of this structure must be sizeof (int) aligned. 
- */ -typedef struct dtrace_difv { - uint32_t dtdv_name; /* variable name index in dtdo_strtab */ - uint32_t dtdv_id; /* variable reference identifier */ - uint8_t dtdv_kind; /* variable kind (see below) */ - uint8_t dtdv_scope; /* variable scope (see below) */ - uint16_t dtdv_flags; /* variable flags (see below) */ - dtrace_diftype_t dtdv_type; /* variable type (see above) */ -} dtrace_difv_t; + /* + * A DTrace Intermediate Format variable record is used to describe each of the + * variables referenced by a given DIF object. It contains an integer variable + * identifier along with variable scope and properties, as shown below. The + * size of this structure must be sizeof (int) aligned. + */ + typedef struct dtrace_difv { + uint32_t dtdv_name; /* variable name index in dtdo_strtab */ + uint32_t dtdv_id; /* variable reference identifier */ + uint8_t dtdv_kind; /* variable kind (see below) */ + uint8_t dtdv_scope; /* variable scope (see below) */ + uint16_t dtdv_flags; /* variable flags (see below) */ + dtrace_diftype_t dtdv_type; /* variable type (see above) */ + } dtrace_difv_t; #define DIFV_KIND_ARRAY 0 /* variable is an array of quantities */ #define DIFV_KIND_SCALAR 1 /* variable is a scalar quantity */ @@ -377,21 +378,21 @@ typedef struct dtrace_difv { #define DIFV_F_REF 0x1 /* variable is referenced by DIFO */ #define DIFV_F_MOD 0x2 /* variable is written by DIFO */ -/* - * DTrace Actions - * - * The upper byte determines the class of the action; the low bytes determines - * the specific action within that class. The classes of actions are as - * follows: - * - * [ no class ] <= May record process- or kernel-related data - * DTRACEACT_PROC <= Only records process-related data - * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes - * DTRACEACT_KERNEL <= Only records kernel-related data - * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel - * DTRACEACT_SPECULATIVE <= Speculation-related action - * DTRACEACT_AGGREGATION <= Aggregating action - */ + /* + * DTrace Actions + * + * The upper byte determines the class of the action; the low bytes determines + * the specific action within that class. The classes of actions are as + * follows: + * + * [ no class ] <= May record process- or kernel-related data + * DTRACEACT_PROC <= Only records process-related data + * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes + * DTRACEACT_KERNEL <= Only records kernel-related data + * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel + * DTRACEACT_SPECULATIVE <= Speculation-related action + * DTRACEACT_AGGREGATION <= Aggregating action + */ #define DTRACEACT_NONE 0 /* no action */ #define DTRACEACT_DIFEXPR 1 /* action is DIF expression */ #define DTRACEACT_EXIT 2 /* exit() action */ @@ -435,27 +436,27 @@ typedef struct dtrace_difv { #define DTRACEACT_ISDESTRUCTIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_PROC_DESTRUCTIVE || \ - DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) + DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) #define DTRACEACT_ISSPECULATIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_SPECULATIVE) #define DTRACEACT_ISPRINTFLIKE(x) \ ((x) == DTRACEACT_PRINTF || (x) == DTRACEACT_PRINTA || \ - (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) - -/* - * DTrace Aggregating Actions - * - * These are functions f(x) for which the following is true: - * - * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) - * - * where x_n is a set of arbitrary data. 
Aggregating actions are in their own - * DTrace action class, DTTRACEACT_AGGREGATION. The macros provided here allow - * for easier processing of the aggregation argument and data payload for a few - * aggregating actions (notably: quantize(), lquantize(), and ustack()). - */ + (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) + + /* + * DTrace Aggregating Actions + * + * These are functions f(x) for which the following is true: + * + * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) + * + * where x_n is a set of arbitrary data. Aggregating actions are in their own + * DTrace action class, DTTRACEACT_AGGREGATION. The macros provided here allow + * for easier processing of the aggregation argument and data payload for a few + * aggregating actions (notably: quantize(), lquantize(), and ustack()). + */ #define DTRACEACT_AGGREGATION 0x0700 #define DTRACEAGG_COUNT (DTRACEACT_AGGREGATION + 1) #define DTRACEAGG_MIN (DTRACEACT_AGGREGATION + 2) @@ -477,9 +478,9 @@ typedef struct dtrace_difv { #define DTRACE_QUANTIZE_BUCKETVAL(buck) \ (int64_t)((buck) < DTRACE_QUANTIZE_ZEROBUCKET ? \ - -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ - (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \ - 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) + -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ + (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 0 : \ + 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) #define DTRACE_LQUANTIZE_STEPSHIFT 48 #define DTRACE_LQUANTIZE_STEPMASK ((uint64_t)UINT16_MAX << 48) @@ -490,15 +491,15 @@ typedef struct dtrace_difv { #define DTRACE_LQUANTIZE_STEP(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_STEPMASK) >> \ - DTRACE_LQUANTIZE_STEPSHIFT) + DTRACE_LQUANTIZE_STEPSHIFT) #define DTRACE_LQUANTIZE_LEVELS(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_LEVELMASK) >> \ - DTRACE_LQUANTIZE_LEVELSHIFT) + DTRACE_LQUANTIZE_LEVELSHIFT) #define DTRACE_LQUANTIZE_BASE(x) \ (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \ - DTRACE_LQUANTIZE_BASESHIFT) + DTRACE_LQUANTIZE_BASESHIFT) #define DTRACE_LLQUANTIZE_FACTORSHIFT 48 #define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48) @@ -511,19 +512,19 @@ typedef struct dtrace_difv { #define DTRACE_LLQUANTIZE_FACTOR(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \ - DTRACE_LLQUANTIZE_FACTORSHIFT) + DTRACE_LLQUANTIZE_FACTORSHIFT) #define DTRACE_LLQUANTIZE_LOW(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \ - DTRACE_LLQUANTIZE_LOWSHIFT) + DTRACE_LLQUANTIZE_LOWSHIFT) #define DTRACE_LLQUANTIZE_HIGH(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \ - DTRACE_LLQUANTIZE_HIGHSHIFT) + DTRACE_LLQUANTIZE_HIGHSHIFT) #define DTRACE_LLQUANTIZE_NSTEP(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \ - DTRACE_LLQUANTIZE_NSTEPSHIFT) + DTRACE_LLQUANTIZE_NSTEPSHIFT) #define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX) #define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32) @@ -540,72 +541,72 @@ typedef struct dtrace_difv { #define DTRACE_PTR(type, name) type *name #endif -/* - * DTrace Object Format (DOF) - * - * DTrace programs can be persistently encoded in the DOF format so that they - * may be embedded in other programs (for example, in an ELF file) or in the - * dtrace driver configuration file for use in anonymous tracing. The DOF - * format is versioned and extensible so that it can be revised and so that - * internal data structures can be modified or extended compatibly. 
All DOF - * structures use fixed-size types, so the 32-bit and 64-bit representations - * are identical and consumers can use either data model transparently. - * - * The file layout is structured as follows: - * - * +---------------+-------------------+----- ... ----+---- ... ------+ - * | dof_hdr_t | dof_sec_t[ ... ] | loadable | non-loadable | - * | (file header) | (section headers) | section data | section data | - * +---------------+-------------------+----- ... ----+---- ... ------+ - * |<------------ dof_hdr.dofh_loadsz --------------->| | - * |<------------ dof_hdr.dofh_filesz ------------------------------->| - * - * The file header stores meta-data including a magic number, data model for - * the instrumentation, data encoding, and properties of the DIF code within. - * The header describes its own size and the size of the section headers. By - * convention, an array of section headers follows the file header, and then - * the data for all loadable sections and unloadable sections. This permits - * consumer code to easily download the headers and all loadable data into the - * DTrace driver in one contiguous chunk, omitting other extraneous sections. - * - * The section headers describe the size, offset, alignment, and section type - * for each section. Sections are described using a set of #defines that tell - * the consumer what kind of data is expected. Sections can contain links to - * other sections by storing a dof_secidx_t, an index into the section header - * array, inside of the section data structures. The section header includes - * an entry size so that sections with data arrays can grow their structures. - * - * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which - * are represented themselves as a collection of related DOF sections. This - * permits us to change the set of sections associated with a DIFO over time, - * and also permits us to encode DIFOs that contain different sets of sections. - * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a - * section of type DOF_SECT_DIFOHDR. This section's data is then an array of - * dof_secidx_t's which in turn denote the sections associated with this DIFO. - * - * This loose coupling of the file structure (header and sections) to the - * structure of the DTrace program itself (ECB descriptions, action - * descriptions, and DIFOs) permits activities such as relocation processing - * to occur in a single pass without having to understand D program structure. - * - * Finally, strings are always stored in ELF-style string tables along with a - * string table section index and string table offset. Therefore strings in - * DOF are always arbitrary-length and not bound to the current implementation. - */ + /* + * DTrace Object Format (DOF) + * + * DTrace programs can be persistently encoded in the DOF format so that they + * may be embedded in other programs (for example, in an ELF file) or in the + * dtrace driver configuration file for use in anonymous tracing. The DOF + * format is versioned and extensible so that it can be revised and so that + * internal data structures can be modified or extended compatibly. All DOF + * structures use fixed-size types, so the 32-bit and 64-bit representations + * are identical and consumers can use either data model transparently. + * + * The file layout is structured as follows: + * + * +---------------+-------------------+----- ... ----+---- ... ------+ + * | dof_hdr_t | dof_sec_t[ ... 
] | loadable | non-loadable | + * | (file header) | (section headers) | section data | section data | + * +---------------+-------------------+----- ... ----+---- ... ------+ + * |<------------ dof_hdr.dofh_loadsz --------------->| | + * |<------------ dof_hdr.dofh_filesz ------------------------------->| + * + * The file header stores meta-data including a magic number, data model for + * the instrumentation, data encoding, and properties of the DIF code within. + * The header describes its own size and the size of the section headers. By + * convention, an array of section headers follows the file header, and then + * the data for all loadable sections and unloadable sections. This permits + * consumer code to easily download the headers and all loadable data into the + * DTrace driver in one contiguous chunk, omitting other extraneous sections. + * + * The section headers describe the size, offset, alignment, and section type + * for each section. Sections are described using a set of #defines that tell + * the consumer what kind of data is expected. Sections can contain links to + * other sections by storing a dof_secidx_t, an index into the section header + * array, inside of the section data structures. The section header includes + * an entry size so that sections with data arrays can grow their structures. + * + * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which + * are represented themselves as a collection of related DOF sections. This + * permits us to change the set of sections associated with a DIFO over time, + * and also permits us to encode DIFOs that contain different sets of sections. + * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a + * section of type DOF_SECT_DIFOHDR. This section's data is then an array of + * dof_secidx_t's which in turn denote the sections associated with this DIFO. + * + * This loose coupling of the file structure (header and sections) to the + * structure of the DTrace program itself (ECB descriptions, action + * descriptions, and DIFOs) permits activities such as relocation processing + * to occur in a single pass without having to understand D program structure. + * + * Finally, strings are always stored in ELF-style string tables along with a + * string table section index and string table offset. Therefore strings in + * DOF are always arbitrary-length and not bound to the current implementation. 
+ */ #define DOF_ID_SIZE 16 /* total size of dofh_ident[] in bytes */ -typedef struct dof_hdr { - uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ - uint32_t dofh_flags; /* file attribute flags (if any) */ - uint32_t dofh_hdrsize; /* size of file header in bytes */ - uint32_t dofh_secsize; /* size of section header in bytes */ - uint32_t dofh_secnum; /* number of section headers */ - uint64_t dofh_secoff; /* file offset of section headers */ - uint64_t dofh_loadsz; /* file size of loadable portion */ - uint64_t dofh_filesz; /* file size of entire DOF file */ - uint64_t dofh_pad; /* reserved for future use */ -} dof_hdr_t; + typedef struct dof_hdr { + uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ + uint32_t dofh_flags; /* file attribute flags (if any) */ + uint32_t dofh_hdrsize; /* size of file header in bytes */ + uint32_t dofh_secsize; /* size of section header in bytes */ + uint32_t dofh_secnum; /* number of section headers */ + uint64_t dofh_secoff; /* file offset of section headers */ + uint64_t dofh_loadsz; /* file size of loadable portion */ + uint64_t dofh_filesz; /* file size of entire DOF file */ + uint64_t dofh_pad; /* reserved for future use */ + } dof_hdr_t; #define DOF_ID_MAG0 0 /* first byte of magic number */ #define DOF_ID_MAG1 1 /* second byte of magic number */ @@ -653,20 +654,20 @@ typedef struct dof_hdr { #define DOF_FL_VALID 0 /* mask of all valid dofh_flags bits */ -typedef uint32_t dof_secidx_t; /* section header table index type */ -typedef uint32_t dof_stridx_t; /* string table index type */ + typedef uint32_t dof_secidx_t; /* section header table index type */ + typedef uint32_t dof_stridx_t; /* string table index type */ #define DOF_SECIDX_NONE (-1U) /* null value for section indices */ #define DOF_STRIDX_NONE (-1U) /* null value for string indices */ -typedef struct dof_sec { - uint32_t dofs_type; /* section type (see below) */ - uint32_t dofs_align; /* section data memory alignment */ - uint32_t dofs_flags; /* section flags (if any) */ - uint32_t dofs_entsize; /* size of section entry (if table) */ - uint64_t dofs_offset; /* offset of section data within file */ - uint64_t dofs_size; /* size of section data in bytes */ -} dof_sec_t; + typedef struct dof_sec { + uint32_t dofs_type; /* section type (see below) */ + uint32_t dofs_align; /* section data memory alignment */ + uint32_t dofs_flags; /* section flags (if any) */ + uint32_t dofs_entsize; /* size of section entry (if table) */ + uint64_t dofs_offset; /* offset of section data within file */ + uint64_t dofs_size; /* size of section data in bytes */ + } dof_sec_t; #define DOF_SECT_NONE 0 /* null section */ #define DOF_SECT_COMMENTS 1 /* compiler comments */ @@ -700,297 +701,297 @@ typedef struct dof_sec { #define DOF_SEC_ISLOADABLE(x) \ (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ - ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ - ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ - ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ - ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ - ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ - ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ - ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ - ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ - ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ - ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ - ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) - 
-typedef struct dof_ecbdesc { - dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ - dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ - dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ - uint32_t dofe_pad; /* reserved for future use */ - uint64_t dofe_uarg; /* user-supplied library argument */ -} dof_ecbdesc_t; - -typedef struct dof_probedesc { - dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ - dof_stridx_t dofp_provider; /* provider string */ - dof_stridx_t dofp_mod; /* module string */ - dof_stridx_t dofp_func; /* function string */ - dof_stridx_t dofp_name; /* name string */ - uint32_t dofp_id; /* probe identifier (or zero) */ -} dof_probedesc_t; - -typedef struct dof_actdesc { - dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ - dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ - uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ - uint32_t dofa_ntuple; /* number of subsequent tuple actions */ - uint64_t dofa_arg; /* kind-specific argument */ - uint64_t dofa_uarg; /* user-supplied argument */ -} dof_actdesc_t; - -typedef struct dof_difohdr { - dtrace_diftype_t dofd_rtype; /* return type for this fragment */ - dof_secidx_t dofd_links[1]; /* variable length array of indices */ -} dof_difohdr_t; - -typedef struct dof_relohdr { - dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ - dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ - dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ -} dof_relohdr_t; - -typedef struct dof_relodesc { - dof_stridx_t dofr_name; /* string name of relocation symbol */ - uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ - uint64_t dofr_offset; /* byte offset for relocation */ - uint64_t dofr_data; /* additional type-specific data */ -} dof_relodesc_t; + ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ + ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ + ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ + ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ + ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ + ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ + ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ + ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ + ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ + ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ + ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) + + typedef struct dof_ecbdesc { + dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ + dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ + uint32_t dofe_pad; /* reserved for future use */ + uint64_t dofe_uarg; /* user-supplied library argument */ + } dof_ecbdesc_t; + + typedef struct dof_probedesc { + dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofp_provider; /* provider string */ + dof_stridx_t dofp_mod; /* module string */ + dof_stridx_t dofp_func; /* function string */ + dof_stridx_t dofp_name; /* name string */ + uint32_t dofp_id; /* probe identifier (or zero) */ + } dof_probedesc_t; + + typedef struct dof_actdesc { + dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ + dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ + uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ + uint32_t dofa_ntuple; /* number of subsequent tuple actions */ + uint64_t dofa_arg; /* kind-specific argument */ + uint64_t 
dofa_uarg; /* user-supplied argument */ + } dof_actdesc_t; + + typedef struct dof_difohdr { + dtrace_diftype_t dofd_rtype; /* return type for this fragment */ + dof_secidx_t dofd_links[1]; /* variable length array of indices */ + } dof_difohdr_t; + + typedef struct dof_relohdr { + dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ + dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ + dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ + } dof_relohdr_t; + + typedef struct dof_relodesc { + dof_stridx_t dofr_name; /* string name of relocation symbol */ + uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ + uint64_t dofr_offset; /* byte offset for relocation */ + uint64_t dofr_data; /* additional type-specific data */ + } dof_relodesc_t; #define DOF_RELO_NONE 0 /* empty relocation entry */ #define DOF_RELO_SETX 1 /* relocate setx value */ -typedef struct dof_optdesc { - uint32_t dofo_option; /* option identifier */ - dof_secidx_t dofo_strtab; /* string table, if string option */ - uint64_t dofo_value; /* option value or string index */ -} dof_optdesc_t; + typedef struct dof_optdesc { + uint32_t dofo_option; /* option identifier */ + dof_secidx_t dofo_strtab; /* string table, if string option */ + uint64_t dofo_value; /* option value or string index */ + } dof_optdesc_t; -typedef uint32_t dof_attr_t; /* encoded stability attributes */ + typedef uint32_t dof_attr_t; /* encoded stability attributes */ #define DOF_ATTR(n, d, c) (((n) << 24) | ((d) << 16) | ((c) << 8)) #define DOF_ATTR_NAME(a) (((a) >> 24) & 0xff) #define DOF_ATTR_DATA(a) (((a) >> 16) & 0xff) #define DOF_ATTR_CLASS(a) (((a) >> 8) & 0xff) -typedef struct dof_provider { - dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ - dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ - dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ - dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ - dof_stridx_t dofpv_name; /* provider name string */ - dof_attr_t dofpv_provattr; /* provider attributes */ - dof_attr_t dofpv_modattr; /* module attributes */ - dof_attr_t dofpv_funcattr; /* function attributes */ - dof_attr_t dofpv_nameattr; /* name attributes */ - dof_attr_t dofpv_argsattr; /* args attributes */ - dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ -} dof_provider_t; - -typedef struct dof_probe { - uint64_t dofpr_addr; /* probe base address or offset */ - dof_stridx_t dofpr_func; /* probe function string */ - dof_stridx_t dofpr_name; /* probe name string */ - dof_stridx_t dofpr_nargv; /* native argument type strings */ - dof_stridx_t dofpr_xargv; /* translated argument type strings */ - uint32_t dofpr_argidx; /* index of first argument mapping */ - uint32_t dofpr_offidx; /* index of first offset entry */ - uint8_t dofpr_nargc; /* native argument count */ - uint8_t dofpr_xargc; /* translated argument count */ - uint16_t dofpr_noffs; /* number of offset entries for probe */ - uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ - uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ - uint16_t dofpr_pad1; /* reserved for future use */ - uint32_t dofpr_pad2; /* reserved for future use */ -} dof_probe_t; - -typedef struct dof_xlator { - dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ - dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ - dof_stridx_t dofxl_argv; /* input parameter type strings */ - uint32_t dofxl_argc; /* input parameter list length */ - dof_stridx_t 
dofxl_type; /* output type string name */ - dof_attr_t dofxl_attr; /* output stability attributes */ -} dof_xlator_t; - -typedef struct dof_xlmember { - dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ - dof_stridx_t dofxm_name; /* member name */ - dtrace_diftype_t dofxm_type; /* member type */ -} dof_xlmember_t; - -typedef struct dof_xlref { - dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ - uint32_t dofxr_member; /* index of referenced dof_xlmember */ - uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ -} dof_xlref_t; - -/* - * DTrace Intermediate Format Object (DIFO) - * - * A DIFO is used to store the compiled DIF for a D expression, its return - * type, and its string and variable tables. The string table is a single - * buffer of character data into which sets instructions and variable - * references can reference strings using a byte offset. The variable table - * is an array of dtrace_difv_t structures that describe the name and type of - * each variable and the id used in the DIF code. This structure is described - * above in the DIF section of this header file. The DIFO is used at both - * user-level (in the library) and in the kernel, but the structure is never - * passed between the two: the DOF structures form the only interface. As a - * result, the definition can change depending on the presence of _KERNEL. - */ -typedef struct dtrace_difo { - dif_instr_t *dtdo_buf; /* instruction buffer */ - uint64_t *dtdo_inttab; /* integer table (optional) */ - char *dtdo_strtab; /* string table (optional) */ - dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ - uint_t dtdo_len; /* length of instruction buffer */ - uint_t dtdo_intlen; /* length of integer table */ - uint_t dtdo_strlen; /* length of string table */ - uint_t dtdo_varlen; /* length of variable table */ - dtrace_diftype_t dtdo_rtype; /* return type */ - uint_t dtdo_refcnt; /* owner reference count */ - uint_t dtdo_destructive; /* invokes destructive subroutines */ + typedef struct dof_provider { + dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ + dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ + dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ + dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ + dof_stridx_t dofpv_name; /* provider name string */ + dof_attr_t dofpv_provattr; /* provider attributes */ + dof_attr_t dofpv_modattr; /* module attributes */ + dof_attr_t dofpv_funcattr; /* function attributes */ + dof_attr_t dofpv_nameattr; /* name attributes */ + dof_attr_t dofpv_argsattr; /* args attributes */ + dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ + } dof_provider_t; + + typedef struct dof_probe { + uint64_t dofpr_addr; /* probe base address or offset */ + dof_stridx_t dofpr_func; /* probe function string */ + dof_stridx_t dofpr_name; /* probe name string */ + dof_stridx_t dofpr_nargv; /* native argument type strings */ + dof_stridx_t dofpr_xargv; /* translated argument type strings */ + uint32_t dofpr_argidx; /* index of first argument mapping */ + uint32_t dofpr_offidx; /* index of first offset entry */ + uint8_t dofpr_nargc; /* native argument count */ + uint8_t dofpr_xargc; /* translated argument count */ + uint16_t dofpr_noffs; /* number of offset entries for probe */ + uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ + uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ + uint16_t dofpr_pad1; /* reserved for future use */ + uint32_t dofpr_pad2; /* 
reserved for future use */ + } dof_probe_t; + + typedef struct dof_xlator { + dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ + dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ + dof_stridx_t dofxl_argv; /* input parameter type strings */ + uint32_t dofxl_argc; /* input parameter list length */ + dof_stridx_t dofxl_type; /* output type string name */ + dof_attr_t dofxl_attr; /* output stability attributes */ + } dof_xlator_t; + + typedef struct dof_xlmember { + dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ + dof_stridx_t dofxm_name; /* member name */ + dtrace_diftype_t dofxm_type; /* member type */ + } dof_xlmember_t; + + typedef struct dof_xlref { + dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ + uint32_t dofxr_member; /* index of referenced dof_xlmember */ + uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ + } dof_xlref_t; + + /* + * DTrace Intermediate Format Object (DIFO) + * + * A DIFO is used to store the compiled DIF for a D expression, its return + * type, and its string and variable tables. The string table is a single + * buffer of character data into which sets instructions and variable + * references can reference strings using a byte offset. The variable table + * is an array of dtrace_difv_t structures that describe the name and type of + * each variable and the id used in the DIF code. This structure is described + * above in the DIF section of this header file. The DIFO is used at both + * user-level (in the library) and in the kernel, but the structure is never + * passed between the two: the DOF structures form the only interface. As a + * result, the definition can change depending on the presence of _KERNEL. + */ + typedef struct dtrace_difo { + dif_instr_t *dtdo_buf; /* instruction buffer */ + uint64_t *dtdo_inttab; /* integer table (optional) */ + char *dtdo_strtab; /* string table (optional) */ + dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ + uint_t dtdo_len; /* length of instruction buffer */ + uint_t dtdo_intlen; /* length of integer table */ + uint_t dtdo_strlen; /* length of string table */ + uint_t dtdo_varlen; /* length of variable table */ + dtrace_diftype_t dtdo_rtype; /* return type */ + uint_t dtdo_refcnt; /* owner reference count */ + uint_t dtdo_destructive; /* invokes destructive subroutines */ #ifndef _KERNEL - dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ - dof_relodesc_t *dtdo_ureltab; /* user relocations */ - struct dt_node **dtdo_xlmtab; /* translator references */ - uint_t dtdo_krelen; /* length of krelo table */ - uint_t dtdo_urelen; /* length of urelo table */ - uint_t dtdo_xlmlen; /* length of translator table */ + dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ + dof_relodesc_t *dtdo_ureltab; /* user relocations */ + struct dt_node **dtdo_xlmtab; /* translator references */ + uint_t dtdo_krelen; /* length of krelo table */ + uint_t dtdo_urelen; /* length of urelo table */ + uint_t dtdo_xlmlen; /* length of translator table */ #endif -} dtrace_difo_t; - -/* - * DTrace Enabling Description Structures - * - * When DTrace is tracking the description of a DTrace enabling entity (probe, - * predicate, action, ECB, record, etc.), it does so in a description - * structure. These structures all end in "desc", and are used at both - * user-level and in the kernel -- but (with the exception of - * dtrace_probedesc_t) they are never passed between them. 
Typically, - * user-level will use the description structures when assembling an enabling. - * It will then distill those description structures into a DOF object (see - * above), and send it into the kernel. The kernel will again use the - * description structures to create a description of the enabling as it reads - * the DOF. When the description is complete, the enabling will be actually - * created -- turning it into the structures that represent the enabling - * instead of merely describing it. Not surprisingly, the description - * structures bear a strong resemblance to the DOF structures that act as their - * conduit. - */ -struct dtrace_predicate; - -typedef struct dtrace_probedesc { - dtrace_id_t dtpd_id; /* probe identifier */ - char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ - char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ - char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ - char dtpd_name[DTRACE_NAMELEN]; /* probe name */ -} dtrace_probedesc_t; - -typedef struct dtrace_repldesc { - dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ - dtrace_probedesc_t dtrpd_create; /* probe descr. to create */ -} dtrace_repldesc_t; - -typedef struct dtrace_preddesc { - dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ - struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ -} dtrace_preddesc_t; - -typedef struct dtrace_actdesc { - dtrace_difo_t *dtad_difo; /* pointer to DIF object */ - struct dtrace_actdesc *dtad_next; /* next action */ - dtrace_actkind_t dtad_kind; /* kind of action */ - uint32_t dtad_ntuple; /* number in tuple */ - uint64_t dtad_arg; /* action argument */ - uint64_t dtad_uarg; /* user argument */ - int dtad_refcnt; /* reference count */ -} dtrace_actdesc_t; - -typedef struct dtrace_ecbdesc { - dtrace_actdesc_t *dted_action; /* action description(s) */ - dtrace_preddesc_t dted_pred; /* predicate description */ - dtrace_probedesc_t dted_probe; /* probe description */ - uint64_t dted_uarg; /* library argument */ - int dted_refcnt; /* reference count */ -} dtrace_ecbdesc_t; - -/* - * DTrace Metadata Description Structures - * - * DTrace separates the trace data stream from the metadata stream. The only - * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + - * timestamp) or (in the case of aggregations) aggregation identifiers. To - * determine the structure of the data, DTrace consumers pass the token to the - * kernel, and receive in return a corresponding description of the enabled - * probe (via the dtrace_eprobedesc structure) or the aggregation (via the - * dtrace_aggdesc structure). Both of these structures are expressed in terms - * of record descriptions (via the dtrace_recdesc structure) that describe the - * exact structure of the data. Some record descriptions may also contain a - * format identifier; this additional bit of metadata can be retrieved from the - * kernel, for which a format description is returned via the dtrace_fmtdesc - * structure. Note that all four of these structures must be bitness-neutral - * to allow for a 32-bit DTrace consumer on a 64-bit kernel. 
- */ -typedef struct dtrace_recdesc { - dtrace_actkind_t dtrd_action; /* kind of action */ - uint32_t dtrd_size; /* size of record */ - uint32_t dtrd_offset; /* offset in ECB's data */ - uint16_t dtrd_alignment; /* required alignment */ - uint16_t dtrd_format; /* format, if any */ - uint64_t dtrd_arg; /* action argument */ - uint64_t dtrd_uarg; /* user argument */ -} dtrace_recdesc_t; - -typedef struct dtrace_eprobedesc { - dtrace_epid_t dtepd_epid; /* enabled probe ID */ - dtrace_id_t dtepd_probeid; /* probe ID */ - uint64_t dtepd_uarg; /* library argument */ - uint32_t dtepd_size; /* total size */ - int dtepd_nrecs; /* number of records */ - dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ -} dtrace_eprobedesc_t; - -typedef struct dtrace_aggdesc { - DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ - dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ - int dtagd_flags; /* not filled in by kernel */ - dtrace_aggid_t dtagd_id; /* aggregation ID */ - dtrace_epid_t dtagd_epid; /* enabled probe ID */ - uint32_t dtagd_size; /* size in bytes */ - int dtagd_nrecs; /* number of records */ - uint32_t dtagd_pad; /* explicit padding */ - dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ -} dtrace_aggdesc_t; - -typedef struct dtrace_fmtdesc { - DTRACE_PTR(char, dtfd_string); /* format string */ - int dtfd_length; /* length of format string */ - uint16_t dtfd_format; /* format identifier */ -} dtrace_fmtdesc_t; + } dtrace_difo_t; + + /* + * DTrace Enabling Description Structures + * + * When DTrace is tracking the description of a DTrace enabling entity (probe, + * predicate, action, ECB, record, etc.), it does so in a description + * structure. These structures all end in "desc", and are used at both + * user-level and in the kernel -- but (with the exception of + * dtrace_probedesc_t) they are never passed between them. Typically, + * user-level will use the description structures when assembling an enabling. + * It will then distill those description structures into a DOF object (see + * above), and send it into the kernel. The kernel will again use the + * description structures to create a description of the enabling as it reads + * the DOF. When the description is complete, the enabling will be actually + * created -- turning it into the structures that represent the enabling + * instead of merely describing it. Not surprisingly, the description + * structures bear a strong resemblance to the DOF structures that act as their + * conduit. + */ + struct dtrace_predicate; + + typedef struct dtrace_probedesc { + dtrace_id_t dtpd_id; /* probe identifier */ + char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ + char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ + char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ + char dtpd_name[DTRACE_NAMELEN]; /* probe name */ + } dtrace_probedesc_t; + + typedef struct dtrace_repldesc { + dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ + dtrace_probedesc_t dtrpd_create; /* probe descr. 
to create */ + } dtrace_repldesc_t; + + typedef struct dtrace_preddesc { + dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ + struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ + } dtrace_preddesc_t; + + typedef struct dtrace_actdesc { + dtrace_difo_t *dtad_difo; /* pointer to DIF object */ + struct dtrace_actdesc *dtad_next; /* next action */ + dtrace_actkind_t dtad_kind; /* kind of action */ + uint32_t dtad_ntuple; /* number in tuple */ + uint64_t dtad_arg; /* action argument */ + uint64_t dtad_uarg; /* user argument */ + int dtad_refcnt; /* reference count */ + } dtrace_actdesc_t; + + typedef struct dtrace_ecbdesc { + dtrace_actdesc_t *dted_action; /* action description(s) */ + dtrace_preddesc_t dted_pred; /* predicate description */ + dtrace_probedesc_t dted_probe; /* probe description */ + uint64_t dted_uarg; /* library argument */ + int dted_refcnt; /* reference count */ + } dtrace_ecbdesc_t; + + /* + * DTrace Metadata Description Structures + * + * DTrace separates the trace data stream from the metadata stream. The only + * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + + * timestamp) or (in the case of aggregations) aggregation identifiers. To + * determine the structure of the data, DTrace consumers pass the token to the + * kernel, and receive in return a corresponding description of the enabled + * probe (via the dtrace_eprobedesc structure) or the aggregation (via the + * dtrace_aggdesc structure). Both of these structures are expressed in terms + * of record descriptions (via the dtrace_recdesc structure) that describe the + * exact structure of the data. Some record descriptions may also contain a + * format identifier; this additional bit of metadata can be retrieved from the + * kernel, for which a format description is returned via the dtrace_fmtdesc + * structure. Note that all four of these structures must be bitness-neutral + * to allow for a 32-bit DTrace consumer on a 64-bit kernel. + */ + typedef struct dtrace_recdesc { + dtrace_actkind_t dtrd_action; /* kind of action */ + uint32_t dtrd_size; /* size of record */ + uint32_t dtrd_offset; /* offset in ECB's data */ + uint16_t dtrd_alignment; /* required alignment */ + uint16_t dtrd_format; /* format, if any */ + uint64_t dtrd_arg; /* action argument */ + uint64_t dtrd_uarg; /* user argument */ + } dtrace_recdesc_t; + + typedef struct dtrace_eprobedesc { + dtrace_epid_t dtepd_epid; /* enabled probe ID */ + dtrace_id_t dtepd_probeid; /* probe ID */ + uint64_t dtepd_uarg; /* library argument */ + uint32_t dtepd_size; /* total size */ + int dtepd_nrecs; /* number of records */ + dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ + } dtrace_eprobedesc_t; + + typedef struct dtrace_aggdesc { + DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ + dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ + int dtagd_flags; /* not filled in by kernel */ + dtrace_aggid_t dtagd_id; /* aggregation ID */ + dtrace_epid_t dtagd_epid; /* enabled probe ID */ + uint32_t dtagd_size; /* size in bytes */ + int dtagd_nrecs; /* number of records */ + uint32_t dtagd_pad; /* explicit padding */ + dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ + } dtrace_aggdesc_t; + + typedef struct dtrace_fmtdesc { + DTRACE_PTR(char, dtfd_string); /* format string */ + int dtfd_length; /* length of format string */ + uint16_t dtfd_format; /* format identifier */ + } dtrace_fmtdesc_t; #define DTRACE_SIZEOF_EPROBEDESC(desc) \ (sizeof (dtrace_eprobedesc_t) + ((desc)->dtepd_nrecs ? 
\ - (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) #define DTRACE_SIZEOF_AGGDESC(desc) \ (sizeof (dtrace_aggdesc_t) + ((desc)->dtagd_nrecs ? \ - (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) - -/* - * DTrace Option Interface - * - * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections - * in a DOF image. The dof_optdesc structure contains an option identifier and - * an option value. The valid option identifiers are found below; the mapping - * between option identifiers and option identifying strings is maintained at - * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the - * following are potentially valid option values: all positive integers, zero - * and negative one. Some options (notably "bufpolicy" and "bufresize") take - * predefined tokens as their values; these are defined with - * DTRACEOPT_{option}_{token}. - */ + (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) + + /* + * DTrace Option Interface + * + * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections + * in a DOF image. The dof_optdesc structure contains an option identifier and + * an option value. The valid option identifiers are found below; the mapping + * between option identifiers and option identifying strings is maintained at + * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the + * following are potentially valid option values: all positive integers, zero + * and negative one. Some options (notably "bufpolicy" and "bufresize") take + * predefined tokens as their values; these are defined with + * DTRACEOPT_{option}_{token}. + */ #define DTRACEOPT_BUFSIZE 0 /* buffer size */ #define DTRACEOPT_BUFPOLICY 1 /* buffer policy */ #define DTRACEOPT_DYNVARSIZE 2 /* dynamic variable size */ @@ -1019,7 +1020,8 @@ typedef struct dtrace_fmtdesc { #define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */ #define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */ #define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */ -#define DTRACEOPT_MAX 28 /* number of options */ +#define DTRACEOPT_ZONE 28 /* zone in which to enable probes */ +#define DTRACEOPT_MAX 29 /* number of options */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ @@ -1649,14 +1651,21 @@ typedef struct dof_helper { * * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL * or DTRACE_MODE_USER) and the policy when the privilege of the enabling - * is insufficient for that mode (either DTRACE_MODE_NOPRIV_DROP or - * DTRACE_MODE_NOPRIV_RESTRICT). If the policy is DTRACE_MODE_NOPRIV_DROP, - * insufficient privilege will result in the probe firing being silently - * ignored for the enabling; if the policy is DTRACE_NODE_NOPRIV_RESTRICT, - * insufficient privilege will not prevent probe processing for the - * enabling, but restrictions will be in place that induce a UPRIV fault - * upon attempt to examine probe arguments or current process state. - * + * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP, + * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). 
If + * DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result + * in the probe firing being silently ignored for the enabling; if the + * DTRACE_NODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not + * prevent probe processing for the enabling, but restrictions will be in + * place that induce a UPRIV fault upon attempt to examine probe arguments + * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit + * is set, similar restrictions will be placed upon operation if the + * privilege is sufficient to process the enabling, but does not otherwise + * entitle the enabling to all zones. The DTRACE_MODE_NOPRIV_DROP and + * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these + * two policies must be specified), but either may be combined (or not) + * with DTRACE_MODE_LIMITEDPRIV_RESTRICT. + * * 1.10.4 Caller's context * * This is called from within dtrace_probe() meaning that interrupts @@ -2054,6 +2063,7 @@ typedef struct dtrace_pops { #define DTRACE_MODE_USER 0x02 #define DTRACE_MODE_NOPRIV_DROP 0x10 #define DTRACE_MODE_NOPRIV_RESTRICT 0x20 +#define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40 typedef uintptr_t dtrace_provider_id_t; @@ -2268,6 +2278,7 @@ extern void (*dtrace_helpers_cleanup)(); extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); extern void (*dtrace_cpustart_init)(); extern void (*dtrace_cpustart_fini)(); +extern void (*dtrace_closef)(); extern void (*dtrace_debugger_init)(); extern void (*dtrace_debugger_fini)(); diff --git a/usr/src/uts/common/sys/dtrace_impl.h b/usr/src/uts/common/sys/dtrace_impl.h index d780082137..f79bf1e42e 100644 --- a/usr/src/uts/common/sys/dtrace_impl.h +++ b/usr/src/uts/common/sys/dtrace_impl.h @@ -924,6 +924,7 @@ typedef struct dtrace_mstate { uintptr_t dtms_strtok; /* saved strtok() pointer */ uint32_t dtms_access; /* memory access rights */ dtrace_difo_t *dtms_difo; /* current dif object */ + file_t *dtms_getf; /* cached rval of getf() */ } dtrace_mstate_t; #define DTRACE_COND_OWNER 0x1 @@ -1144,6 +1145,7 @@ struct dtrace_state { dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */ dtrace_cred_t dts_cred; /* credentials */ size_t dts_nretained; /* number of retained enabs */ + int dts_getf; /* number of getf() calls */ }; struct dtrace_provider { diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. + */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..29bdadc4e2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HLOFS_INFO_H +#define _SYS_FS_HLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks.
+ */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. + */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of td_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid +#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +#define HL_MUSTHAVE 1 + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. 
Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. + */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +/* + * hyprlofs can allocate only a certain percentage of kernel memory, + * which is used for hlnodes, directories, file names, etc. + * This is statically set as HYPRLOFSMAXFRACKMEM of physical memory. + * The actual number of allocatable bytes can be patched in hyprlofs_maxkmem. + */ +#define HYPRLOFSMAXFRACKMEM 25 /* 1/25 of physical memory */ + +extern size_t hyprlofs_kmemspace; +extern size_t hyprlofs_maxkmem; /* Allocatable kernel memory in bytes */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern void *hyprlofs_memalloc(size_t, int); +extern void hyprlofs_memfree(void *, size_t); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h index 583586fd75..03c35277d4 100644 --- a/usr/src/uts/common/sys/fss.h +++ b/usr/src/uts/common/sys/fss.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ #ifndef _SYS_FSS_H @@ -86,6 +87,7 @@ typedef struct fsspset { /* on the list */ struct fssproj *fssps_list; /* list of project parts */ struct fsszone *fssps_zones; /* list of fsszone_t's in pset */ + uint32_t fssps_gen; /* generation for zone's kstats */ } fsspset_t; /* @@ -101,7 +103,10 @@ typedef struct fssproj { /* protected by fssps_lock */ uint32_t fssp_shares; /* copy of our kpj_shares */ /* protected by fssps_displock */ - uint32_t fssp_ticks; /* total of all ticks */ + uint32_t fssp_ticks; /* total of nice tick values */ + /* protected by fssps_displock */ + uint32_t fssp_tick_cnt; /* cnt of all ticks in this sec */ + uint32_t fssp_shr_pct; /* active shr % in this sec */ /* protected by fssps_displock */ fssusage_t fssp_usage; /* this project's decayed usage */ fssusage_t fssp_shusage; /* normalized usage */ diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h new file mode 100644 index 0000000000..901e74f44c --- /dev/null +++ b/usr/src/uts/common/sys/ipd.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * These definitions are private to ipd and ipdadm. + */ + +#ifndef _SYS_IPD_H +#define _SYS_IPD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IPD_DEV_PATH "/dev/ipd" +#define IPD_MAX_DELAY 1000000 /* 1s in us */ + +typedef struct ipd_ioc_perturb { + zoneid_t ipip_zoneid; + uint32_t ipip_arg; +} ipd_ioc_perturb_t; + +typedef struct ipd_ioc_info { + zoneid_t ipii_zoneid; + uint32_t ipii_corrupt; + uint32_t ipii_drop; + uint32_t ipii_delay; +} ipd_ioc_info_t; + +#ifdef _KERNEL + +typedef struct ipd_ioc_list32 { + uint_t ipil_nzones; + caddr32_t ipil_info; +} ipd_ioc_list32_t; + +#endif /* _KERNEL */ + +typedef struct ipd_ioc_list { + uint_t ipil_nzones; + ipd_ioc_info_t *ipil_info; +} ipd_ioc_list_t; + +#define IPD_CORRUPT 0x1 +#define IPD_DELAY 0x2 +#define IPD_DROP 0x4 + +#define IPDIOC (('i' << 24) | ('p' << 16) | ('d' << 8)) +#define IPDIOC_CORRUPT (IPDIOC | 1) /* corrupt packets for a zone */ +#define IPDIOC_DELAY (IPDIOC | 2) /* delay packets for a zone */ +#define IPDIOC_DROP (IPDIOC | 3) /* drop packets for a zone */ +#define IPDIOC_LIST (IPDIOC | 4) /* list per-zone settings */ +#define IPDIOC_REMOVE (IPDIOC | 5) /* remove perturbations for a zone */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IPD_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 6876fccb1a..220446af65 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_MAC_H @@ -205,6 +206,7 @@ typedef enum { MAC_PROP_MAX_RXHWCLNT_AVAIL, MAC_PROP_MAX_TXHWCLNT_AVAIL, MAC_PROP_IB_LINKMODE, + MAC_PROP_VN_PROMISC_FILTERED, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index ae25df6a0d..ec49527300 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_MAC_CLIENT_IMPL_H @@ -302,6 +303,7 @@ extern int mac_tx_percpu_cnt; /* Mac protection flags */ #define MPT_FLAG_V6_LOCAL_ADDR_SET 0x0001 +#define MPT_FLAG_PROMISC_FILTERED 0x0002 /* in mac_client.c */ extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 8f9f23ff71..2eef66113d 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/ #ifndef _SYS_MAC_IMPL_H @@ -885,6 +886,8 @@ extern void mac_protect_fini(mac_client_impl_t *); extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_effective_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_set_promisc_filtered(mac_client_handle_t, boolean_t); +extern boolean_t mac_get_promisc_filtered(mac_client_handle_t); extern cpupart_t *mac_pset_find(mac_resource_props_t *, boolean_t *); extern void mac_set_pool_effective(boolean_t, cpupart_t *, diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 6c9119e56d..82344607b0 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -353,6 +354,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index e95ef3fccc..d215d88790 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T * All Rights Reserved @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index bcd5ba2b4c..819c788b9e 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_POLICY_H @@ -171,6 +172,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/port.h b/usr/src/uts/common/sys/port.h index ccb0308255..d4d74d55ea 100644 --- a/usr/src/uts/common/sys/port.h +++ b/usr/src/uts/common/sys/port.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_PORT_H #define _SYS_PORT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -106,6 +108,7 @@ typedef struct port_notify32 { #define FILE_ACCESS 0x00000001 #define FILE_MODIFIED 0x00000002 #define FILE_ATTRIB 0x00000004 +#define FILE_TRUNC 0x00100000 #define FILE_NOFOLLOW 0x10000000 /* diff --git a/usr/src/uts/common/sys/port_impl.h b/usr/src/uts/common/sys/port_impl.h index 9f3f291874..504fb9ece1 100644 --- a/usr/src/uts/common/sys/port_impl.h +++ b/usr/src/uts/common/sys/port_impl.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_PORT_IMPL_H #define _SYS_PORT_IMPL_H @@ -311,6 +315,7 @@ typedef struct portfop_vp { #define FOP_FILE_SETATTR_MTIME 0x00080000 #define FOP_FILE_SETATTR_CTIME 0x00100000 #define FOP_FILE_LINK_SRC 0x00200000 +#define FOP_FILE_TRUNC 0x00400000 /* * File modification event. @@ -339,10 +344,15 @@ typedef struct portfop_vp { /* + * File trunc event + */ +#define FOP_TRUNC_MASK (FOP_FILE_TRUNC|FOP_FILE_CREATE) + +/* * valid watchable events */ #define FILE_EVENTS_MASK (FILE_ACCESS|FILE_MODIFIED|FILE_ATTRIB \ - |FILE_NOFOLLOW) + |FILE_NOFOLLOW|FILE_TRUNC) /* --- End file events --- */ /* diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index 0c4a48fcdd..f592fd9dcf 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -65,10 +65,6 @@ extern "C" { #include <sys/stat.h> #include <sys/param.h> -#if !defined(_LP64) && _FILE_OFFSET_BITS == 64 -#error "Cannot use procfs in the large file compilation environment" -#endif - /* * System call interfaces for /proc. */ diff --git a/usr/src/uts/common/sys/sdt_impl.h b/usr/src/uts/common/sys/sdt_impl.h index cbe95f7c66..f7cc683f2f 100644 --- a/usr/src/uts/common/sys/sdt_impl.h +++ b/usr/src/uts/common/sys/sdt_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_SDT_IMPL_H #define _SYS_SDT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -45,6 +47,7 @@ typedef struct sdt_provider { char *sdtp_name; /* name of provider */ char *sdtp_prefix; /* prefix for probe names */ dtrace_pattr_t *sdtp_attr; /* stability attributes */ + uint32_t sdtp_priv; /* privilege, if any */ dtrace_provider_id_t sdtp_id; /* provider ID */ } sdt_provider_t; @@ -75,6 +78,7 @@ typedef struct sdt_argdesc { } sdt_argdesc_t; extern void sdt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); +extern int sdt_mode(void *, dtrace_id_t, void *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 188230d61e..c7f460e7c7 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -68,6 +68,8 @@ typedef struct ctxop { void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ struct ctxop *next; /* next context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index c35d0a5cfb..6adeb477bb 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -157,7 +158,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif #if defined(__STDC__) diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..97e3430ae2 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,9 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 3ba7bf47f4..a44930c853 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZONE_H @@ -94,12 +95,17 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_PMCAP_NOVER 12 #define ZONE_ATTR_SCHED_CLASS 13 #define ZONE_ATTR_FLAGS 14 #define ZONE_ATTR_HOSTID 15 #define ZONE_ATTR_FS_ALLOWED 16 #define ZONE_ATTR_NETWORK 17 +#define ZONE_ATTR_DID 18 +#define ZONE_ATTR_PMCAP_PAGEOUT 19 +#define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_PG_FLT_DELAY 21 +#define ZONE_ATTR_RSS 22 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -180,6 +186,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -196,6 +203,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -240,7 +248,7 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ } zone_cmd_arg_t; @@ -320,6 +328,7 @@ typedef struct zone_net_data { * libraries which may be defining their own versions.
*/ #include <sys/list.h> +#include <sys/cpuvar.h> #define GLOBAL_ZONEUNIQID 0 /* uniqid of the global zone */ @@ -367,7 +376,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -377,6 +386,73 @@ typedef struct zone_kstat { struct cpucap; +typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { + kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; + kstat_named_t zm_pgpgin; + kstat_named_t zm_anonpgin; + kstat_named_t zm_execpgin; + kstat_named_t zm_fspgin; + kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; +} zone_mcap_kstat_t; + +typedef struct { + kstat_named_t zm_zonename; /* full name, kstat truncates name */ + kstat_named_t zm_utime; + kstat_named_t zm_stime; + kstat_named_t zm_wtime; + kstat_named_t zm_avenrun1; + kstat_named_t zm_avenrun5; + kstat_named_t zm_avenrun15; + kstat_named_t zm_run_ticks; + kstat_named_t zm_run_wait; + kstat_named_t zm_fss_shr_pct; + kstat_named_t zm_fss_pri_hi; + kstat_named_t zm_fss_pri_avg; +} zone_misc_kstat_t; + typedef struct zone { /* * zone_name is never modified once set. @@ -416,6 +492,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -471,7 +548,7 @@ typedef struct zone { char *zone_initname; /* fs path to 'init' */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ + rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -490,6 +567,9 @@ typedef struct zone { hrtime_t zone_pool_mod; /* last pool bind modification time */ /* zone_psetid is protected by cpu_lock */ psetid_t zone_psetid; /* pset the zone is bound to */ + + time_t zone_boot_time; /* Similar to boot_time */ + /* * The following two can be read without holding any locks. They are * updated under cpu_lock. @@ -517,6 +597,37 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * Data and counters used for ZFS fair-share disk IO. 
+ */ + rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ + uint_t zone_zfs_queued; /* enqueued count */ + uint64_t zone_zfs_weight; /* used to prevent starvation */ + uint64_t zone_io_util; /* IO utilization metric */ + boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ + uint16_t zone_io_delay; /* IO delay on logical r/w */ + kmutex_t zone_stg_io_lock; /* protects IO window data */ + sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zone_wr_ops; /* writes and logical writes. */ + sys_zio_cntr_t zone_lwr_ops; + + /* + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. + */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + kstat_io_t zone_zfs_rwstats; + zone_zfs_kstat_t *zone_zfs_stats; + /* * Solaris Auditing per-zone audit context */ @@ -534,6 +645,69 @@ typedef struct zone { rctl_qty_t zone_nprocs_ctl; /* current limit protected by */ /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; + + /* + * kstats and counters for physical memory capping. + */ + rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */ + kstat_t *zone_physmem_kstat; + uint64_t zone_mcap_nover; /* # of times over phys. cap */ + uint64_t zone_mcap_pagedout; /* bytes of mem. paged out */ + kmutex_t zone_mcap_lock; /* protects mcap statistics */ + kstat_t *zone_mcap_ksp; + zone_mcap_kstat_t *zone_mcap_stats; + uint64_t zone_pgpgin; /* pages paged in */ + uint64_t zone_anonpgin; /* anon pages paged in */ + uint64_t zone_execpgin; /* exec pages paged in */ + uint64_t zone_fspgin; /* fs pages paged in */ + uint64_t zone_anon_alloc_fail; /* cnt of anon alloc fails */ + uint64_t zone_pf_throttle; /* cnt of page flt throttles */ + uint64_t zone_pf_throttle_usec; /* time of page flt throttles */ + + /* Num usecs to throttle page fault when zone is over phys. mem cap */ + uint32_t zone_pg_flt_delay; + + /* + * Misc. kstats and counters for zone cpu-usage aggregation. + * The zone_Xtime values are the sum of the micro-state accounting + * values for all threads that are running or have run in the zone. + * This is tracked in msacct.c as threads change state. + * The zone_stime is the sum of the LMS_SYSTEM times. + * The zone_utime is the sum of the LMS_USER times. + * The zone_wtime is the sum of the LMS_WAIT_CPU times. + * As with per-thread micro-state accounting values, these values are + * not scaled to nanosecs. The scaling is done by the + * zone_misc_kstat_update function when kstats are requested. + */ + kmutex_t zone_misc_lock; /* protects misc statistics */ + kstat_t *zone_misc_ksp; + zone_misc_kstat_t *zone_misc_stats; + uint64_t zone_stime; /* total system time */ + uint64_t zone_utime; /* total user time */ + uint64_t zone_wtime; /* total time waiting in runq */ + + struct loadavg_s zone_loadavg; /* loadavg for this zone */ + uint64_t zone_hp_avenrun[3]; /* high-precision avenrun */ + int zone_avenrun[3]; /* FSCALED avg. run queue len */ + + /* + * FSS stats updated once per second by fss_decay_usage. + */ + uint32_t zone_fss_gen; /* FSS generation cntr */ + uint64_t zone_run_ticks; /* tot # of ticks running */ + + /* + * DTrace-private per-zone state + */ + int zone_dtrace_getf; /* # of unprivileged getf()s */ + + /* + * Synchronization primitives used to synchronize between mounts and + * zone creation/destruction. 
+ */ + int zone_mounts_in_progress; + kcondvar_t zone_mount_cv; + kmutex_t zone_mount_lock; } zone_t; /* @@ -566,9 +740,11 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); +extern void zone_loadavg_update(); /* * Zone-specific data (ZSD) APIs @@ -752,13 +928,14 @@ extern int zone_dataset_visible(const char *, int *); extern int zone_kadmin(int, int, const char *, cred_t *); extern void zone_shutdown_global(void); -extern void mount_in_progress(void); -extern void mount_completed(void); +extern void mount_in_progress(zone_t *); +extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/getloadavg.c b/usr/src/uts/common/syscall/getloadavg.c index c669f9b8ba..0f44064e90 100644 --- a/usr/src/uts/common/syscall/getloadavg.c +++ b/usr/src/uts/common/syscall/getloadavg.c @@ -22,10 +22,9 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/errno.h> @@ -41,7 +40,6 @@ int getloadavg(int *buf, int nelem) { int *loadbuf = &avenrun[0]; - int loadavg[LOADAVG_NSTATS]; int error; if (nelem < 0) @@ -50,15 +48,7 @@ getloadavg(int *buf, int nelem) nelem = LOADAVG_NSTATS; if (!INGLOBALZONE(curproc)) { - mutex_enter(&cpu_lock); - if (pool_pset_enabled()) { - psetid_t psetid = zone_pset_get(curproc->p_zone); - - error = cpupart_get_loadavg(psetid, &loadavg[0], nelem); - ASSERT(error == 0); /* pset isn't going anywhere */ - loadbuf = &loadavg[0]; - } - mutex_exit(&cpu_lock); + loadbuf = &curproc->p_zone->zone_avenrun[0]; } error = copyout(loadbuf, buf, nelem * sizeof (avenrun[0])); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 1ab3a8b65e..63c8b64ad0 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -116,13 +117,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. 
*/ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 471c66ff32..e68f640045 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ sysconfig(int which) * even though rcapd can be used on the global zone too. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) + return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), physinstalled)); return (physinstalled); @@ -167,26 +168,23 @@ sysconfig(int which) case _CONFIG_AVPHYS_PAGES: /* * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always + * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - cap = btop(curproc->p_zone->zone_phys_mcap); + cap = btop(curproc->p_zone->zone_phys_mem_ctl); if (cap > physinstalled) return (freemem); - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); + rss = btop(curproc->p_zone->zone_phys_mem); /* - * Because rcapd implements a soft cap, it is possible + * Because this is a soft cap, it is possible * for rss to be temporarily over the cap. */ if (cap > rss) diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 1bdfbbfd0b..dbff1b637c 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -76,7 +77,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. 
*/ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -106,7 +107,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -243,7 +244,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index 1d91475e38..156b810046 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -460,6 +461,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 31c293d416..5f106f6c06 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -7254,7 +7255,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7279,11 +7281,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7335,7 +7337,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. */ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index fdf9f7790c..f30ba7ef2e 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1986, 2010, Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -792,6 +793,7 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) mutex_enter(&p->p_lock); if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { mutex_exit(&p->p_lock); + atomic_add_64(&zone->zone_anon_alloc_fail, 1); return (0); } mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index 01ad32e0b1..8caa257486 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -56,6 +57,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/xhat.h> @@ -879,6 +881,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); + zone_t *zonep = curzone; int is_xhat = 0; int holding_wpage = 0; extern struct seg_ops segdev_ops; @@ -928,6 +931,23 @@ retry: if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); + if (zonep->zone_pg_flt_delay != 0) { + /* + * The zone in which this process is running + * is currently over its physical memory cap. + * Throttle page faults to help the user-land + * memory capper catch up. Note that + * drv_usectohz() rounds up. + */ + atomic_add_64(&zonep->zone_pf_throttle, 1); + atomic_add_64(&zonep->zone_pf_throttle_usec, + zonep->zone_pg_flt_delay); + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) + drv_usecwait(zonep->zone_pg_flt_delay); + else + delay(drv_usectohz( + zonep->zone_pg_flt_delay)); + } break; } } diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 7233581227..39ace0b3c2 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -572,8 +580,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -627,13 +636,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it.
+ * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardware and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -645,7 +658,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -657,6 +670,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -681,7 +697,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index d422f8d0e8..8f425e9e4f 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* + * Save a bound on the free list.
*/ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -937,7 +971,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1024,7 +1061,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1304,6 +1344,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. 
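The hunks above replace the per-entity vme_anon_hash with an AVL tree (vme_anon) keyed on the anon struct's address: vmu_find_insert_anon() now does an avl_find() and, on a miss, allocates a vmu_anon_t and avl_add()s it, while vmu_free_entity() drains the tree with avl_destroy_nodes(). The following is a minimal userland sketch of that find-or-insert pattern, assuming the illumos libavl interface (compile with -lavl); the visited_t and visited_insert() names are illustrative only and are not part of this change.

#include <sys/types.h>
#include <sys/avl.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct visited {
	avl_node_t	v_node;
	uintptr_t	v_addr;
} visited_t;

/* Order nodes by address, as vmu_anon_cmp() does above. */
static int
visited_cmp(const void *lhs, const void *rhs)
{
	const visited_t *l = lhs, *r = rhs;

	if (l->v_addr == r->v_addr)
		return (0);
	return (l->v_addr < r->v_addr ? -1 : 1);
}

/* Return 1 if addr was newly inserted, 0 if it had been seen already. */
static int
visited_insert(avl_tree_t *tree, uintptr_t addr)
{
	visited_t find, *vp;

	find.v_addr = addr;
	if (avl_find(tree, &find, NULL) != NULL)
		return (0);

	vp = malloc(sizeof (visited_t));
	vp->v_addr = addr;
	avl_add(tree, vp);
	return (1);
}

int
main(void)
{
	avl_tree_t tree;
	visited_t *vp;
	void *cookie = NULL;
	uintptr_t addrs[] = { 0x1000, 0x2000, 0x1000 };
	int i, distinct = 0;

	avl_create(&tree, visited_cmp, sizeof (visited_t),
	    offsetof(visited_t, v_node));

	for (i = 0; i < 3; i++)
		distinct += visited_insert(&tree, addrs[i]);

	(void) printf("%d distinct addresses\n", distinct);	/* prints 2 */

	/* Teardown mirrors vmu_free_entity(): drain the tree, then destroy. */
	while ((vp = avl_destroy_nodes(&tree, &cookie)) != NULL)
		free(vp);
	avl_destroy(&tree);
	return (0);
}

Relative to a pointer-keyed mod_hash, the AVL tree needs no up-front table sizing and makes teardown a simple avl_destroy_nodes() walk, which is exactly what the new vmu_free_entity() and vmu_free_extra() code relies on.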
@@ -1320,8 +1366,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1506,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1594,8 +1640,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1739,12 +1784,34 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. + */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1763,7 +1830,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. 
*/ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1826,26 +1893,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1901,10 +1975,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. 
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1924,6 +2000,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1934,6 +2014,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1953,7 +2048,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2009,8 +2104,11 @@ start: mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock);
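For the new VMUSAGE_A_ZONE mode, vm_getusage() above reads the requested zone ID out of vmu_id in the first vmusage_t of the caller's buffer and rejects a zero-length buffer, and vmu_copyout_results() then returns only that zone's VMUSAGE_ZONE row; a non-global-zone caller has the flag quietly downgraded to plain VMUSAGE_ZONE. A hypothetical userland caller might look like the sketch below. It assumes VMUSAGE_A_ZONE is exported by the <sys/vm_usage.h> change elsewhere in this changeset and uses the existing getvmusage(2) wrapper; it is illustrative, not part of the patch.

#include <sys/types.h>
#include <sys/vm_usage.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char *argv[])
{
	vmusage_t buf[8];
	size_t nres = sizeof (buf) / sizeof (buf[0]);
	size_t i;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s zoneid\n", argv[0]);
		return (2);
	}

	/*
	 * Per the copyin logic above, the target zone ID rides in
	 * vmu_id of the first element and *nres must be non-zero.
	 */
	buf[0].vmu_id = atoi(argv[1]);

	/* Accept cached results up to 30 seconds old. */
	if (getvmusage(VMUSAGE_A_ZONE, 30, buf, &nres) != 0) {
		perror("getvmusage");
		return (1);
	}

	for (i = 0; i < nres && i < sizeof (buf) / sizeof (buf[0]); i++)
		(void) printf("zone %d: rss %llu bytes, swap %llu bytes\n",
		    (int)buf[i].vmu_zoneid,
		    (u_longlong_t)buf[i].vmu_rss_all,
		    (u_longlong_t)buf[i].vmu_swap_all);
	return (0);
}

A global-zone observer such as a per-zone memory capper could use this to poll a single zone's RSS without also computing and copying out results for every other zone, which is the point of the VMUSAGE_A_ZONE filtering added in vmu_copyout_results() above.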