diff options
183 files changed, 56870 insertions, 2043 deletions
diff --git a/usr/src/Targetdirs b/usr/src/Targetdirs index 036b228d34..35ab557045 100644 --- a/usr/src/Targetdirs +++ b/usr/src/Targetdirs @@ -159,6 +159,7 @@ ROOT.SYS= \ /var/svc/manifest/platform \ /var/svc/manifest/platform/i86pc \ /var/svc/manifest/platform/sun4u \ + /var/svc/manifest/platform/sun4v \ /var/svc/manifest/site \ /var/svc/profile @@ -170,7 +171,13 @@ XROOT.BIN= \ /usr/lib/inet/wanboot # EXPORT DELETE END +i386_ROOT.BIN= + +sparc_ROOT.BIN= \ + /usr/lib/ldoms + ROOT.BIN= \ + $($(MACH)_ROOT.BIN) \ /etc/saf \ /etc/sma \ /etc/sma/snmp \ diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 0e4a192222..ccf9b44d6d 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -456,6 +456,7 @@ sparc_SUBDIRS= \ sckmd \ sf880drd \ stmsboot \ + vntsd \ wrsmconf \ wrsmstat @@ -692,6 +693,7 @@ sparc_MSGSUBDIRS= \ prtdscp \ prtfru \ stmsboot \ + vntsd \ wrsmconf \ wrsmstat @@ -765,6 +767,7 @@ MANIFEST_TOPDIRS= \ syseventd \ syslogd \ utmpd \ + vntsd \ ypcmd \ zoneadmd diff --git a/usr/src/cmd/Makefile.cmd b/usr/src/cmd/Makefile.cmd index b7c8ec8f83..6ea1b083b5 100644 --- a/usr/src/cmd/Makefile.cmd +++ b/usr/src/cmd/Makefile.cmd @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# # ident "%Z%%M% %I% %E% SMI" @@ -209,6 +208,7 @@ ROOTSVCNETWORKSSL= $(ROOTSVCNETWORK)/ssl ROOTSVCPLATFORM= $(ROOTVARSVCMANIFEST)/platform ROOTSVCPLATFORMI86PC= $(ROOTSVCPLATFORM)/i86pc ROOTSVCPLATFORMSUN4U= $(ROOTSVCPLATFORM)/sun4u +ROOTSVCPLATFORMSUN4V= $(ROOTSVCPLATFORM)/sun4v ROOTSVCAPPLICATION= $(ROOTVARSVCMANIFEST)/application ROOTSVCAPPLICATIONMANAGEMENT= $(ROOTSVCAPPLICATION)/management ROOTSVCAPPLICATIONSECURITY= $(ROOTSVCAPPLICATION)/security @@ -388,6 +388,9 @@ $(ROOTSVCPLATFORMI86PC)/%: % $(ROOTSVCPLATFORMSUN4U)/%: % $(INS.file) +$(ROOTSVCPLATFORMSUN4V)/%: % + $(INS.file) + $(ROOTMAN1)/%: %.sunman $(INS.rename) diff --git a/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h b/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h index fd2c0c7701..612ed8db13 100644 --- a/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h +++ b/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -106,6 +105,10 @@ extern void kctl_auxv_init(kmdb_auxv_t *, const char *, const char **, void *); extern void kctl_auxv_init_isadep(kmdb_auxv_t *, void *); extern void kctl_auxv_fini(kmdb_auxv_t *); extern void kctl_auxv_fini_isadep(kmdb_auxv_t *); +#ifdef sun4v +extern void kctl_auxv_set_promif(kmdb_auxv_t *); +extern void kctl_switch_promif(void); +#endif extern void kctl_wrintr(void); extern void kctl_wrintr_fire(void); diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h b/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h index 7faf9b980b..bf3cc8fdf1 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -53,6 +52,9 @@ #include <gelf.h> #include <sys/machelf.h> #include <sys/kdi.h> +#ifdef sun4v +#include <sys/obpdefs.h> +#endif #ifdef __cplusplus extern "C" { @@ -104,6 +106,16 @@ typedef struct kmdb_auxv { #ifdef __sparc void (*kav_ktrap_install)(int, void (*)(void)); /* Add to krnl trptbl */ void (*kav_ktrap_restore)(void); /* Restore krnl trap hdlrs */ +#ifdef sun4v + uint_t kav_domaining; /* Domaining status */ + caddr_t kav_promif_root; /* PROM shadow tree root */ + ihandle_t kav_promif_in; /* PROM input dev instance */ + ihandle_t kav_promif_out; /* PROM output dev instance */ + phandle_t kav_promif_pin; /* PROM input dev package */ + phandle_t kav_promif_pout; /* PROM output dev package */ + pnode_t kav_promif_chosennode; /* PROM "/chosen" node */ + pnode_t kav_promif_optionsnode; /* PROM "/options" node */ +#endif #endif } kmdb_auxv_t; diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c b/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c index e595f7be02..06f7c2927f 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -78,7 +78,10 @@ kmdb_fault(kreg_t tt, kreg_t pc, kreg_t sp, int cpuid) for (;;) { mdb_iob_printf(mdb.m_err, "\n%s: " #if defined(__sparc) - "(o)bp, (p)anic" +#ifndef sun4v + "(o)bp, " +#endif /* sun4v */ + "(p)anic" #else "reboo(t)" #endif @@ -98,8 +101,10 @@ kmdb_fault(kreg_t tt, kreg_t pc, kreg_t sp, int cpuid) continue; #endif +#ifndef sun4v case 'o': case 'O': +#endif /* sun4v */ case 't': case 'T': kmdb_dpi_enter_mon(); diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h b/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h index 54b7df2938..ad772338a5 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,6 +52,16 @@ extern "C" { extern int kmdb_init(const char *, kmdb_auxv_t *); +/* + * This function should only be defined for sun4v. However the mdb build + * uses a custom tool (hdr2map) to generate mapfile from header files but + * this tool does not take care of preprocessor directives and functions + * are included into the mapfile whatever the architecture is and even + * if there is an #ifdef sun4v. So we always declare this function but it + * has a fake definition for all architecture but sun4v. 
+ */ +extern void kmdb_init_promif(char *, kmdb_auxv_t *); + extern void kmdb_activate(kdi_debugvec_t **, uint_t); extern void kmdb_deactivate(void); diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_main.c b/usr/src/cmd/mdb/common/kmdb/kmdb_main.c index 1e34f218f2..7cc1d3f7b0 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_main.c +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_main.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -276,6 +275,28 @@ kmdb_init(const char *execname, kmdb_auxv_t *kav) return (0); } +#ifdef sun4v + +void +kmdb_init_promif(char *pgmname, kmdb_auxv_t *kav) +{ + kmdb_prom_init_promif(pgmname, kav); +} + +#else + +/*ARGSUSED*/ +void +kmdb_init_promif(char *pgmname, kmdb_auxv_t *kav) +{ + /* + * Fake function for non sun4v. See comments in kmdb_ctl.h + */ + ASSERT(0); +} + +#endif + /* * First-time kmdb startup. Run when kmdb has control of the machine for the * first time. diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c index 50d65677cd..0757ccb48d 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
@@ -29,6 +28,9 @@ #include <sys/types.h> #include <sys/termios.h> #include <sys/promif.h> +#ifdef sun4v +#include <sys/promif_impl.h> +#endif #include <unistd.h> #include <string.h> #include <stdlib.h> @@ -754,7 +756,14 @@ kmdb_prom_assfail(const char *assertion, const char *file, int line) void kmdb_prom_init_begin(char *pgmname, kmdb_auxv_t *kav) { +#ifdef sun4v + if (kav->kav_domaining) + kmdb_prom_init_promif(pgmname, kav); + else + prom_init(pgmname, kav->kav_romp); +#else prom_init(pgmname, kav->kav_romp); +#endif /* Initialize the interrupt ring buffer */ kmdb_prom_readbuf_head = kmdb_prom_readbuf_tail; @@ -764,6 +773,18 @@ kmdb_prom_init_begin(char *pgmname, kmdb_auxv_t *kav) #endif } +#ifdef sun4v +void +kmdb_prom_init_promif(char *pgmname, kmdb_auxv_t *kav) +{ + ASSERT(kav->kav_domaining); + cif_init(pgmname, kav->kav_promif_root, + kav->kav_promif_in, kav->kav_promif_out, + kav->kav_promif_pin, kav->kav_promif_pout, + kav->kav_promif_chosennode, kav->kav_promif_optionsnode); +} +#endif + /* * Conclude the initialization of the debugger/PROM interface. Memory * allocation and the global `mdb' object are now available. diff --git a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h index bdbadb5996..baca42f615 100644 --- a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,6 +41,9 @@ extern "C" { extern void kmdb_prom_init_begin(char *, kmdb_auxv_t *); extern void kmdb_prom_init_finish(kmdb_auxv_t *); +#ifdef sun4v +extern void kmdb_prom_init_promif(char *, kmdb_auxv_t *); +#endif extern ssize_t kmdb_prom_read(void *, size_t, struct termios *); extern ssize_t kmdb_prom_write(const void *, size_t, struct termios *); extern ihandle_t kmdb_prom_get_handle(char *); diff --git a/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c b/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c index 971ff13dd9..a17431d53f 100644 --- a/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c +++ b/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -131,7 +131,34 @@ kaif_install_generic(caddr_t tgt, caddr_t arg) bcopy((caddr_t)kaif_hdlr_generic, tgt, 32); } -#ifndef sun4v +#ifdef sun4v + +/*ARGSUSED*/ +static void +kaif_install_goto_tt64(caddr_t tgt, caddr_t arg) +{ + /* LINTED - pointer alignment */ + uint32_t *hdlr = (uint32_t *)tgt; + uint32_t disp = (T_FAST_INSTR_MMU_MISS - T_INSTR_MMU_MISS) * 0x20; + + *hdlr++ = 0x10480000 | (disp >> 2); /* ba,pt (to tt64) */ + *hdlr++ = 0x01000000; /* nop */ +} + +/*ARGSUSED*/ +static void +kaif_install_goto_tt68(caddr_t tgt, caddr_t arg) +{ + /* LINTED - pointer alignment */ + uint32_t *hdlr = (uint32_t *)tgt; + uint32_t disp = (T_FAST_DATA_MMU_MISS - T_DATA_MMU_MISS) * 0x20; + + *hdlr++ = 0x10480000 | (disp >> 2); /* ba,pt (to tt68) */ + *hdlr++ = 0x01000000; /* nop */ +} + +#endif /* sun4v */ + static void kaif_install_dmmumiss(caddr_t tgt, caddr_t vatotte) { @@ -159,25 +186,31 @@ kaif_install_immumiss(caddr_t tgt, caddr_t vatotte) *patch++ |= (uintptr_t)vatotte >> 10; *patch |= ((uintptr_t)vatotte) & 0x3ff; } -#endif /* sun4v */ static struct kaif_trap_handlers { uint_t th_tt; void (*th_install)(caddr_t, caddr_t); } kaif_trap_handlers[] = { { T_INSTR_EXCEPTION, kaif_install_generic }, +#ifdef sun4v + { T_INSTR_MMU_MISS, kaif_install_goto_tt64 }, +#endif { T_IDIV0, kaif_install_generic }, { T_DATA_EXCEPTION, kaif_install_generic }, +#ifdef sun4v + { T_DATA_MMU_MISS, kaif_install_goto_tt68 }, +#endif { T_DATA_ERROR, kaif_install_generic }, { T_ALIGNMENT, kaif_install_generic }, -#ifdef sun4v -#else /* sun4v */ { T_FAST_INSTR_MMU_MISS, kaif_install_immumiss }, { T_FAST_DATA_MMU_MISS, kaif_install_dmmumiss }, { T_FAST_DATA_MMU_PROT, kaif_install_generic }, +#ifdef sun4v + { T_INSTR_MMU_MISS + T_TL1, kaif_install_goto_tt64 }, + { T_DATA_MMU_MISS + T_TL1, kaif_install_goto_tt68 }, +#endif { T_FAST_INSTR_MMU_MISS + T_TL1, kaif_install_immumiss }, { T_FAST_DATA_MMU_MISS + T_TL1, kaif_install_dmmumiss }, -#endif /* sun4v */ { 0 } }; @@ -189,34 +222,27 @@ 
kaif_trap_init(void) int i; /* + * sun4u: * We rely upon OBP for the handling of a great many traps. As such, * we begin by populating our table with pointers to OBP's handlers. * We then copy in our own handlers where appropriate. At some point, * when we provide the bulk of the handlers, this process will be * reversed. + * + * sun4v: + * The sun4v kernel dismisses OBP at boot. Both fast and slow TLB + * misses are handled by KMDB. Breakpoint traps go directly KMDB. + * All other trap entries are redirected to their respective + * trap implemenation within the Solaris trap table. */ for (i = 0; i < kaif_tba_native_sz; i += 0x20) { /* LINTED - pointer alignment */ uint32_t *hdlr = (uint32_t *)(kaif_tba_native + i); #ifdef sun4v - uint32_t tt = i/0x20; - - /* - * We use obp's tl0 handlers. Sine kmdb installs itsdebug - * hook in obp, if obp cannot handle any traps, such as - * user enter an invalid address in kmdb, obp will call - * kmdb's callback and the control goes back to kmdb. - * For tl>0 traps, kernel's trap handlers are good at - * handling these on sun4v. - */ - if (tt >= T_TL1) - brtgt = (uintptr_t)(kaif_tba_kernel + i); - else - brtgt = (uintptr_t)(kaif_tba_obp + i); -#else /* !sun4v */ + brtgt = (uintptr_t)(kaif_tba_kernel + i); +#else brtgt = (uintptr_t)(kaif_tba_obp + i); -#endif /* sun4v */ - +#endif *hdlr++ = 0x03000000 | (brtgt >> 10); /* sethi brtgt, %g1 */ *hdlr++ = 0x81c06000 | (brtgt & 0x3ff); /* jmp %g1 + brtgt */ *hdlr++ = 0x01000000; /* nop */ diff --git a/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c b/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c index 4c14d32067..d849441ac1 100644 --- a/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c +++ b/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,6 +35,11 @@ #include <sys/cpuvar.h> #include <sys/kobj.h> #include <sys/kobj_impl.h> +#ifdef sun4v +#include <sys/ldoms.h> +#include <sys/promif_impl.h> +#include <kmdb/kmdb_kctl.h> +#endif #include <kmdb/kctl/kctl.h> @@ -229,8 +233,48 @@ kctl_auxv_init_isadep(kmdb_auxv_t *kav, void *romp) kav->kav_ktrap_install = kctl_ktrap_install; kav->kav_ktrap_restore = kctl_ktrap_restore; +#ifdef sun4v + if (kctl.kctl_boot_loaded) { + /* + * When booting kmdb, kmdb starts before domaining is + * enabled and before the cif handler is changed to the + * kernel cif handler. So we start kmdb with using the + * OBP and we will change this when the cif handler is + * installed. 
+ */ + kav->kav_domaining = 0; + } else { + kctl_auxv_set_promif(kav); + } +#endif } +#ifdef sun4v + +void +kctl_auxv_set_promif(kmdb_auxv_t *kav) +{ + kav->kav_domaining = domaining_enabled; + kav->kav_promif_root = promif_stree_getroot(); + kav->kav_promif_in = prom_stdin_ihandle(); + kav->kav_promif_out = prom_stdout_ihandle(); + kav->kav_promif_pin = prom_stdin_node(); + kav->kav_promif_pout = prom_stdout_node(); + kav->kav_promif_chosennode = prom_chosennode(); + kav->kav_promif_optionsnode = prom_finddevice("/options"); +} + +void +kctl_switch_promif(void) +{ + kmdb_auxv_t kav; + + kctl_auxv_set_promif(&kav); + kmdb_init_promif(NULL, &kav); +} + +#endif + /*ARGSUSED*/ void kctl_auxv_fini_isadep(kmdb_auxv_t *auxv) diff --git a/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s b/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s index 1d7c9eaf8e..a90f7b2e4f 100644 --- a/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s +++ b/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -31,6 +31,20 @@ #include <sys/machtrap.h> #include <sys/privregs.h> #include <sys/mmu.h> +#include <vm/mach_sfmmu.h> + +#if defined(sun4v) && !defined(lint) +#include <sys/machparam.h> +#endif + +#if defined(sun4v) && defined(KMDB_TRAPCOUNT) +/* + * The sun4v implemenations of the fast miss handlers are larger than those + * of their sun4u kin. This is unfortunate because there is not enough space + * remaining in the respective trap table entries for this debug feature. + */ +#error "KMDB_TRAPCOUNT not supported on sun4v" +#endif /* * This file contains the trap handlers that will be copied to kmdb's trap @@ -50,12 +64,7 @@ #if defined(lint) #include <kmdb/kaif.h> -#endif /* lint */ -#if defined(lint) - -#ifdef sun4v -#else /* sun4v */ void kaif_hdlr_dmiss(void) { @@ -65,51 +74,149 @@ void kaif_itlb_handler(void) { } -#endif /* sun4v */ -#else /* lint */ + +#else /* lint */ #ifdef sun4v + +#define GET_MMU_D_ADDR_CTX(daddr, ctx) \ + MMU_FAULT_STATUS_AREA(ctx); \ + ldx [ctx + MMFSA_D_ADDR], daddr; \ + ldx [ctx + MMFSA_D_CTX], ctx + +#define GET_MMU_I_ADDR_CTX(iaddr, ctx) \ + MMU_FAULT_STATUS_AREA(ctx); \ + ldx [ctx + MMFSA_I_ADDR], iaddr; \ + ldx [ctx + MMFSA_I_CTX], ctx + +/* + * KAIF_ITLB_STUFF + * derived from ITLB_STUFF in uts/sun4v/vm/mach_sfmmu.h + * + * Load ITLB entry + * + * In: + * tte = reg containing tte + * ouch = branch target label used if hcall fails (sun4v only) + * scr1, scr2, scr3, scr4 = scratch registers (must not be %o0-%o3) + */ +#define KAIF_ITLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4) \ + mov %o0, scr1; \ + mov %o1, scr2; \ + mov %o2, scr3; \ + mov %o3, scr4; \ + MMU_FAULT_STATUS_AREA(%o2); \ + ldx [%o2 + MMFSA_I_ADDR], %o0; \ + ldx [%o2 + MMFSA_I_CTX], %o1; \ + srlx %o0, PAGESHIFT, %o0; \ + sllx %o0, PAGESHIFT, %o0; \ + mov tte, %o2; \ + mov MAP_ITLB, %o3; \ + ta MMU_MAP_ADDR; \ + /* BEGIN CSTYLED */ \ + brnz,a,pn %o0, ouch; \ + nop; \ + /* END CSTYLED */ \ + mov scr1, %o0; \ + mov scr2, %o1; \ + mov scr3, %o2; \ + mov scr4, %o3 + 
+/* + * KAIF_DTLB_STUFF + * derived from DTLB_STUFF in uts/sun4v/vm/mach_sfmmu.h + * + * Load DTLB entry + * + * In: + * tte = reg containing tte + * ouch = branch target label used if hcall fails (sun4v only) + * scr1, scr2, scr3, scr4 = scratch registers (must not be %o0-%o3) + */ +#define KAIF_DTLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4) \ + mov %o0, scr1; \ + mov %o1, scr2; \ + mov %o2, scr3; \ + mov %o3, scr4; \ + MMU_FAULT_STATUS_AREA(%o2); \ + ldx [%o2 + MMFSA_D_ADDR], %o0; \ + ldx [%o2 + MMFSA_D_CTX], %o1; \ + srlx %o0, PAGESHIFT, %o0; \ + sllx %o0, PAGESHIFT, %o0; \ + mov tte, %o2; \ + mov MAP_DTLB, %o3; \ + ta MMU_MAP_ADDR; \ + /* BEGIN CSTYLED */ \ + brnz,a,pn %o0, ouch; \ + nop; \ + /* END CSTYLED */ \ + mov scr1, %o0; \ + mov scr2, %o1; \ + mov scr3, %o2; \ + mov scr4, %o3 + #else /* sun4v */ - .global kaif_hdlr_dmiss_patch - .global kaif_hdlr_imiss_patch +#define GET_MMU_D_ADDR_CTX(daddr, ctx) \ + mov MMU_TAG_ACCESS, ctx; \ + ldxa [ctx]ASI_DMMU, daddr; \ + sllx daddr, TAGACC_CTX_LSHIFT, ctx; \ + srlx ctx, TAGACC_CTX_LSHIFT, ctx + +#define GET_MMU_I_ADDR_CTX(iaddr, ctx) \ + rdpr %tpc, iaddr; \ + ldxa [%g0]ASI_IMMU, ctx; \ + srlx ctx, TTARGET_CTX_SHIFT, ctx + +#define KAIF_DTLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4) \ + DTLB_STUFF(tte, scr1, scr2, scr3, scr4) + +#define KAIF_ITLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4) \ + ITLB_STUFF(tte, scr1, scr2, scr3, scr4) + +#endif /* sun4v */ + +/* + * KAIF_CALL_KDI_VATOTTE + * + * Use kdi_vatotte to look up the tte. We don't bother stripping the + * context, as it won't change the tte we get. + * + * The two instruction at patch_lbl are modified during runtime + * by kaif to point to kdi_vatotte + * + * Clobbers all globals. 
+ * Returns tte in %g1 if successful, otherwise 0 in %g1 + * Leaves address of next instruction following this macro in scr1 + */ +#define KAIF_CALL_KDI_VATOTTE(addr, ctx, patch_lbl, scr0, scr1) \ + .global patch_lbl; \ +patch_lbl: \ + sethi %hi(0), scr0; \ + or scr0, %lo(0), scr0; \ + jmpl scr0, scr1; \ + add scr1, 8, scr1 + - /* - * This routine must be exactly 32 instructions long. - */ ENTRY_NP(kaif_hdlr_dmiss) - mov MMU_TAG_ACCESS, %g1 - ldxa [%g1]ASI_DMMU, %g1 /* %g1 = addr|ctx */ - sllx %g1, TAGACC_CTX_LSHIFT, %g2 /* strip addr */ - srlx %g2, TAGACC_CTX_LSHIFT, %g2 /* %g2 = ctx */ - - /* - * Use kdi_vatotte to look up the tte. We don't bother stripping the - * context, as it won't change the tte we get. - */ -kaif_hdlr_dmiss_patch: - sethi %hi(0), %g3 /* set by kaif to kdi_vatotte */ - or %g3, %lo(0), %g3 - jmpl %g3, %g7 /* uses all regs, ret to %g7, tte or 0 in %g1 */ - add %g7, 8, %g7 /* adjust return */ + GET_MMU_D_ADDR_CTX(%g1, %g2) - brz %g1, 1f + KAIF_CALL_KDI_VATOTTE(%g1, %g2, kaif_hdlr_dmiss_patch, %g3, %g7) +0: brz %g1, 1f nop /* * kdi_vatotte gave us a TTE to use. Load it up and head back * into the world, but first bump a counter. */ -#ifdef KMDB_TRAPCOUNT - ldx [%g7 + 0x40], %g2 /* Trap counter. See top comment */ + +#ifdef KMDB_TRAPCOUNT /* Trap counter. See top comment */ + ldx [%g7 + .count-0b], %g2 add %g2, 1, %g2 - stx %g2, [%g7 + 0x40] -#else - nop - nop - nop + stx %g2, [%g7 + .count-0b] #endif - stxa %g1, [%g0]ASI_DTLB_IN + + KAIF_DTLB_STUFF(%g1, 1f, %g2, %g3, %g4, %g5) retry 1: /* @@ -126,63 +233,47 @@ kaif_hdlr_dmiss_patch: * find the TTE for the debugger without missing. */ -#ifdef KMDB_TRAPCOUNT - mov MMU_TAG_ACCESS, %g1 /* Trap address "counter". */ - ldxa [%g1]ASI_DMMU, %g1 - stx %g1, [%g7 + 0x48] -#else - nop - nop - nop +#ifdef KMDB_TRAPCOUNT /* Trap address "counter". 
*/ + GET_MMU_D_ADDR(%g2, %g3) + stx %g2, [%g7 + .daddr-0b] + stx %g1, [%g7 + .ecode-0b] #endif - mov PTSTATE_KERN_COMMON | PSTATE_AG, %g3 - wrpr %g3, %pstate - sethi %hi(kaif_dtrap), %g4 - jmp %g4 + %lo(kaif_dtrap) + sethi %hi(kaif_dtrap), %g1 + jmp %g1 + %lo(kaif_dtrap) nop - unimp 0 - unimp 0 /* counter goes here (base + 0x60) */ - unimp 0 - unimp 0 /* miss address goes here (base + 0x68) */ - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 + /* NOTREACHED */ + +#ifdef KMDB_TRAPCOUNT + .align 8 +.count: .xword 0 /* counter goes here */ +.daddr: .xword 0 /* miss address goes here */ +.ecode: .xword 0 /* sun4v: g1 contains err code */ +#endif + + .align 32*4 /* force length to 32 instr. */ SET_SIZE(kaif_hdlr_dmiss) - /* - * This routine must be exactly 32 instructions long. - */ - ENTRY_NP(kaif_hdlr_imiss) - rdpr %tpc, %g1 - ldxa [%g0]ASI_IMMU, %g2 - srlx %g2, TTARGET_CTX_SHIFT, %g2 -kaif_hdlr_imiss_patch: - sethi %hi(0), %g3 /* set by kaif to kdi_vatotte */ - or %g3, %lo(0), %g3 - jmpl %g3, %g7 /* uses all regs, ret to %g7, tte or 0 in %g1 */ - add %g7, 8, %g7 /* adjust return */ - brz %g1, 1f + ENTRY_NP(kaif_hdlr_imiss) + GET_MMU_I_ADDR_CTX(%g1, %g2) + + KAIF_CALL_KDI_VATOTTE(%g1, %g2, kaif_hdlr_imiss_patch, %g3, %g7) +0: brz %g1, 1f nop /* * kdi_vatotte gave us a TTE to use. Load it up and head back * into the world, but first bump a counter. */ -#ifdef KMDB_TRAPCOUNT - ldx [%g7 + 0x3c], %g2 /* Trap counter. See top comment */ +#ifdef KMDB_TRAPCOUNT /* Trap counter. See top comment */ + ldx [%g7 + .count-0b], %g2 add %g2, 1, %g2 - stx %g2, [%g7 + 0x3c] -#else - nop - nop - nop + stx %g2, [%g7 + .count-0b] #endif - stxa %g1, [%g0]ASI_ITLB_IN + + KAIF_ITLB_STUFF(%g1, 1f, %g2, %g3, %g4, %g5) retry 1: /* @@ -197,42 +288,41 @@ kaif_hdlr_imiss_patch: * We will only reach this point at TL=1, as kdi_vatotte will always * find the TTE for the debugger without missing. 
*/ - rdpr %pstate, %g1 - or %g0, PTSTATE_KERN_COMMON | PSTATE_AG, %g2 - set kaif_dtrap, %g3 - jmp %g3 - wrpr %g2, %pstate - unimp 0 - unimp 0 - unimp 0 /* counter goes here */ - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 - unimp 0 + + sethi %hi(kaif_dtrap), %g1 + jmp %g1 + %lo(kaif_dtrap) + nop + /* NOTREACHED */ + +#ifdef KMDB_TRAPCOUNT + .align 8 +.count: .xword 0 +#endif + + .align 32*4 /* force length to 32 instr. */ SET_SIZE(kaif_hdlr_imiss) -#endif /* sun4v */ + + ENTRY_NP(kaif_hdlr_generic) -#ifdef KMDB_TRAPCOUNT - rd %pc, %g3 /* Trap counter. See top comment */ - ld [%g3 + 0x1c], %g4 +#ifdef KMDB_TRAPCOUNT /* Trap counter. See top comment */ +0: rd %pc, %g3 + ldx [%g3 + .count-0b], %g4 add %g4, 1, %g4 - st %g4, [%g3 + 0x1c] -#else - nop - nop - nop + stx %g4, [%g3 + .count-0b] +#endif + + sethi %hi(kaif_dtrap), %g1 + jmp %g1 + %lo(kaif_dtrap) nop + /* NOTREACHED */ + +#ifdef KMDB_TRAPCOUNT + .align 8 +.count: .xword 0 /* counter goes here */ #endif - sethi %hi(kaif_dtrap), %g3 - jmp %g3 + %lo(kaif_dtrap) - rdpr %pstate, %g1 - unimp 0 /* counter goes here */ + + .align 32*4 /* force length to 32 instr. */ SET_SIZE(kaif_hdlr_generic) -#endif +#endif /* lint */ diff --git a/usr/src/cmd/mdb/sun4v/Makefile.kmdb b/usr/src/cmd/mdb/sun4v/Makefile.kmdb index 43471fe808..d307d5f6f8 100644 --- a/usr/src/cmd/mdb/sun4v/Makefile.kmdb +++ b/usr/src/cmd/mdb/sun4v/Makefile.kmdb @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -46,10 +45,19 @@ PROMSRCS += \ prom_printf.c \ prom_prop.c \ prom_putchar.c \ + prom_reboot.c \ prom_sparc.c \ prom_stdin.c \ prom_stdout.c \ - prom_string.c + prom_string.c \ + promif_emul.c \ + promif_interp.c \ + promif_io.c \ + promif_mon.c \ + promif_node.c \ + promif_prop.c \ + promif_reboot.c \ + promif_stree.c KMDBSRCS += \ kaif.c \ @@ -113,6 +121,10 @@ MAPFILE_SOURCES = \ $(COMPILE.c) $< $(CTFCONVERT_O) +%.o: ../../../../../uts/sun4v/promif/%.c + $(COMPILE.c) $< + $(CTFCONVERT_O) + %.ln: ../../../../../psm/promif/ieee1275/common/%.c $(LINT.c) -c $< @@ -121,3 +133,6 @@ MAPFILE_SOURCES = \ %.ln: ../../../../../psm/promif/ieee1275/sun4u/%.c $(LINT.c) -c $< + +%.ln: ../../../../../uts/sun4v/promif/%.c + $(LINT.c) -c $< diff --git a/usr/src/cmd/mdb/sun4v/modules/Makefile b/usr/src/cmd/mdb/sun4v/modules/Makefile index 77d76d05b0..957bcfcfbb 100644 --- a/usr/src/cmd/mdb/sun4v/modules/Makefile +++ b/usr/src/cmd/mdb/sun4v/modules/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -19,11 +18,13 @@ # # CDDL HEADER END # + # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# #ident "%Z%%M% %I% %E% SMI" +# -SUBDIRS = unix +SUBDIRS = unix vdsk include ../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/sun4v/modules/vdsk/Makefile b/usr/src/cmd/mdb/sun4v/modules/vdsk/Makefile new file mode 100644 index 0000000000..4c5460e696 --- /dev/null +++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/Makefile @@ -0,0 +1,30 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# + +SUBDIRS = v9 +include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile b/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile new file mode 100644 index 0000000000..a449a6b174 --- /dev/null +++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile @@ -0,0 +1,44 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +MODULE = vdsk.so +MDBTGT = kvm + +MODSRCS = vdsk.c + +include ../../../../../Makefile.cmd +include ../../../../../Makefile.cmd.64 +include ../../../../sparc/Makefile.sparcv9 +include ../../../Makefile.sun4v +include ../../../../Makefile.module + +MODSRCS_DIR = ../ + +CPPFLAGS += -DMP -D_MACHDEP +CPPFLAGS += -Dsun4v +CPPFLAGS += -I$(SRC)/uts/sun4v diff --git a/usr/src/cmd/mdb/sun4v/modules/vdsk/vdsk.c b/usr/src/cmd/mdb/sun4v/modules/vdsk/vdsk.c new file mode 100644 index 0000000000..b43c3f9d95 --- /dev/null +++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/vdsk.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This module provides debugging tools for the LDoms vDisk drivers + * (vds and vdc). + */ + +#include <sys/mdb_modapi.h> + +#include <sys/vdsk_common.h> + +/* + */ +int +vd_dring_entry_walk_init(mdb_walk_state_t *wsp) +{ + /* Must have a start addr. */ + if (wsp->walk_addr == NULL) { + mdb_warn("Descriptor Ring base address required\n"); + + return (WALK_ERR); + } + + return (WALK_NEXT); +} + + +/* + * Generic entry walker step routine. + */ +int +vd_dring_entry_walk_step(mdb_walk_state_t *wsp) +{ + static int entry_count = 0; + int status; + vd_dring_entry_t dring_entry; + + if (mdb_vread(&dring_entry, VD_DRING_ENTRY_SZ, + (uintptr_t)wsp->walk_addr) == -1) { + mdb_warn("failed to read vd_dring_entry_t at %p", + wsp->walk_addr); + + return (WALK_ERR); + } + + status = wsp->walk_callback(wsp->walk_addr, &dring_entry, + wsp->walk_cbdata); + wsp->walk_addr = (uintptr_t)(wsp->walk_addr + VD_DRING_ENTRY_SZ); + + /* Check if we're at the last element */ + if (++entry_count >= VD_DRING_LEN) { + /* reset counter for next call to this walker */ + entry_count = 0; + + return (WALK_DONE); + } + + return (status); +} + +/* + * MDB module linkage information: + */ + +static const mdb_walker_t walkers[] = { + { "vd_dring_entry", "walk vDisk public Descriptor Ring entries", + vd_dring_entry_walk_init, vd_dring_entry_walk_step, NULL, NULL }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { + MDB_API_VERSION, NULL, walkers +}; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb 
b/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb index 32fc72077b..3e45c49ee2 100644 --- a/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb +++ b/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -35,6 +34,7 @@ PROMINCDIRS += $(SRC)/uts/sun4u KMDBML += \ client_handler.s \ + hcall.s \ kaif_handlers.s \ kaif_invoke.s \ kaif_resume.s \ diff --git a/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile b/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile index c7c5450a95..fb865e9e03 100644 --- a/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile +++ b/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile @@ -18,6 +18,7 @@ # # CDDL HEADER END # + # # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. @@ -58,6 +59,9 @@ KMDB_FPTEST = \ # will be added for the trap table, and each handler installed by KMDB will use # its padding to keep a trap count. See kaif_handlers.s. # +# NOTE: not currently supported by the sun4v fast miss handlers. See +# ../../../sparc/v9/kmdb/kaif_handlers.s to verify before use. +# #TRAPCOUNT = -DKMDB_TRAPCOUNT CPPFLAGS += -I../../../sparc/mdb -I.. 
-I$(SRC)/uts/sun4 -I$(SRC)/uts/sun4v @@ -146,6 +150,9 @@ kmdb_context_off.h: ../../../sparc/kmdb/kmdb_context_off.in $(COMPILE.c) $< $(CTFCONVERT_O) +%.o: ../../../../../uts/sun4v/ml/%.s + $(COMPILE.s) -o $@ $< + # # Lint # @@ -189,6 +196,9 @@ kmdb_context_off.h: ../../../sparc/kmdb/kmdb_context_off.in %.ln: $(SRC)/common/net/util/%.c $(LINT.c) -c $< +%.ln: ../../../../../uts/sun4v/ml/%.s + $(LINT.s) -c $< + # # Installation targets # diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile b/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile index 45e9bd2d98..2fc2704617 100644 --- a/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -19,12 +18,13 @@ # # CDDL HEADER END # + # -# ident "%Z%%M% %I% %E% SMI" -# -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# +# ident "%Z%%M% %I% %E% SMI" +# # cmd/picl/plugins/sun4v/mdesc/Makefile # LIBRARY= libmdescplugin.a @@ -32,7 +32,7 @@ VERS= .1 OBJS_DIR= pics -OBJECTS= mdescplugin.o init.o \ +OBJECTS= mdescplugin.o init.o dr.o \ cpu_prop_update.o disk_discovery.o \ mdesc_findname.o mdesc_findnodeprop.o \ mdesc_fini.o mdesc_getpropstr.o \ @@ -48,7 +48,7 @@ include $(SRC)/cmd/picl/plugins/Makefile.com SRCS= $(OBJECTS:%.o=%.c) -LINT_SRC= ./mdescplugin.c ./init.c \ +LINT_SRC= ./mdescplugin.c ./init.c ./dr.c \ ./cpu_prop_update.c ./disk_discovery.c \ $(SRC)/common/mdesc/mdesc_findname.c \ $(SRC)/common/mdesc/mdesc_findnodeprop.c \ @@ -85,7 +85,7 @@ LDLIBS += -L$(SRC)/lib/libpicl/$(MACH) -L$(SRC)/lib/libpicltree/$(MACH) LDLIBS += -L$(ROOT)/usr/lib/picl/plugins -L$(ROOT)/usr/lib/sparcv9 DYNFLAGS += -R$(DYNFLAGS_COM) -LDLIBS += -lc -lpicltree -lrt -lpicldevtree -lcfgadm -lnvpair +LDLIBS += -lc -lpicltree -ldevinfo -lrt -lpicldevtree -lcfgadm -lnvpair LINTFLAGS += -erroff=E_BAD_PTR_CAST_ALIGN -v diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c b/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c index c7e47d21d1..7e6428fa96 100644 --- a/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -213,16 +212,16 @@ add_cpu_prop(picl_nodehdl_t node, void *args) int x, num_nodes; int ncpus, ncaches, ntlbs; int status; - int reg_prop[4], reg; + int reg_prop[SUN4V_CPU_REGSIZE], cpuid; uint64_t int_value; - status = ptree_get_propval_by_name(node, "reg", reg_prop, + status = ptree_get_propval_by_name(node, OBP_REG, reg_prop, sizeof (reg_prop)); if (status != PICL_SUCCESS) { return (PICL_WALK_TERMINATE); } - reg = reg_prop[0] & 0x3f; + cpuid = CFGHDL_TO_CPUID(reg_prop[0]); /* * Allocate space for our searches. @@ -266,7 +265,7 @@ add_cpu_prop(picl_nodehdl_t node, void *args) continue; } - if (int_value != reg) + if (int_value != cpuid) continue; add_md_prop(node, sizeof (int_value), "cpuid", &int_value, diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/dr.c b/usr/src/cmd/picl/plugins/sun4v/mdesc/dr.c new file mode 100644 index 0000000000..5323a23264 --- /dev/null +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/dr.c @@ -0,0 +1,517 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include "mdescplugin.h" + +static di_prom_handle_t ph = DI_PROM_HANDLE_NIL; + +typedef struct cpu_lookup { + di_node_t di_node; + picl_nodehdl_t nodeh; + int result; +} cpu_lookup_t; + +extern int add_cpu_prop(picl_nodehdl_t node, void *args); +extern md_t *mdesc_devinit(void); + +/* + * This function is identical to the one in the picldevtree plugin. + * Unfortunately we can't just reuse that code. + */ +static int +add_string_list_prop(picl_nodehdl_t nodeh, char *name, char *strlist, + unsigned int nrows) +{ + ptree_propinfo_t propinfo; + picl_prophdl_t proph; + picl_prophdl_t tblh; + int err; + unsigned int i; + unsigned int j; + picl_prophdl_t *proprow; + int len; + +#define NCOLS_IN_STRING_TABLE 1 + + err = ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_TABLE, PICL_READ, sizeof (picl_prophdl_t), name, + NULL, NULL); + if (err != PICL_SUCCESS) + return (err); + + err = ptree_create_table(&tblh); + if (err != PICL_SUCCESS) + return (err); + + err = ptree_create_and_add_prop(nodeh, &propinfo, &tblh, &proph); + if (err != PICL_SUCCESS) + return (err); + + proprow = alloca(sizeof (picl_prophdl_t) * nrows); + if (proprow == NULL) { + (void) ptree_destroy_prop(proph); + return (PICL_FAILURE); + } + + for (j = 0; j < nrows; ++j) { + len = strlen(strlist) + 1; + err = ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, len, name, + NULL, NULL); + if (err != PICL_SUCCESS) + break; + err = ptree_create_prop(&propinfo, strlist, &proprow[j]); + if (err != PICL_SUCCESS) + break; + strlist += len; + err = ptree_add_row_to_table(tblh, NCOLS_IN_STRING_TABLE, + &proprow[j]); + if (err != PICL_SUCCESS) + break; + } + + if (err != PICL_SUCCESS) { + for (i = 0; i < j; ++i) + (void) ptree_destroy_prop(proprow[i]); + (void) ptree_delete_prop(proph); + (void) ptree_destroy_prop(proph); + return (err); + } + + return (PICL_SUCCESS); +} + +/* + * This function is identical to the 
one in the picldevtree plugin. + * Unfortunately we can't just reuse that code. + */ +static void +add_devinfo_props(picl_nodehdl_t nodeh, di_node_t di_node) +{ + int instance; + char *di_val; + di_prop_t di_prop; + int di_ptype; + ptree_propinfo_t propinfo; + + instance = di_instance(di_node); + (void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_INT, PICL_READ, sizeof (instance), PICL_PROP_INSTANCE, + NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, &instance, NULL); + + di_val = di_bus_addr(di_node); + if (di_val) { + (void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1, + PICL_PROP_BUS_ADDR, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, di_val, + NULL); + } + + di_val = di_binding_name(di_node); + if (di_val) { + (void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1, + PICL_PROP_BINDING_NAME, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, di_val, + NULL); + } + + di_val = di_driver_name(di_node); + if (di_val) { + (void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1, + PICL_PROP_DRIVER_NAME, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, di_val, + NULL); + } + + di_val = di_devfs_path(di_node); + if (di_val) { + (void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1, + PICL_PROP_DEVFS_PATH, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, di_val, + NULL); + di_devfs_path_free(di_val); + } + + for (di_prop = di_prop_next(di_node, DI_PROP_NIL); + di_prop != DI_PROP_NIL; + di_prop = di_prop_next(di_node, di_prop)) { + + di_val = di_prop_name(di_prop); + di_ptype = di_prop_type(di_prop); + switch (di_ptype) { + case DI_PROP_TYPE_BOOLEAN: + (void) ptree_init_propinfo(&propinfo, + PTREE_PROPINFO_VERSION, 
PICL_PTYPE_VOID, + PICL_READ, (size_t)0, di_val, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, + NULL, NULL); + break; + case DI_PROP_TYPE_INT: { + int *idata; + int len; + + len = di_prop_ints(di_prop, &idata); + if (len < 0) + /* Recieved error, so ignore prop */ + break; + + if (len == 1) + (void) ptree_init_propinfo(&propinfo, + PTREE_PROPINFO_VERSION, PICL_PTYPE_INT, + PICL_READ, len * sizeof (int), di_val, + NULL, NULL); + else + (void) ptree_init_propinfo(&propinfo, + PTREE_PROPINFO_VERSION, + PICL_PTYPE_BYTEARRAY, PICL_READ, + len * sizeof (int), di_val, + NULL, NULL); + + (void) ptree_create_and_add_prop(nodeh, &propinfo, + idata, NULL); + } + break; + case DI_PROP_TYPE_STRING: { + char *sdata; + int len; + + len = di_prop_strings(di_prop, &sdata); + if (len < 0) + break; + + if (len == 1) { + (void) ptree_init_propinfo(&propinfo, + PTREE_PROPINFO_VERSION, + PICL_PTYPE_CHARSTRING, PICL_READ, + strlen(sdata) + 1, di_val, + NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, + &propinfo, sdata, NULL); + } else { + (void) add_string_list_prop(nodeh, di_val, + sdata, len); + } + } + break; + case DI_PROP_TYPE_BYTE: { + int len; + unsigned char *bdata; + + len = di_prop_bytes(di_prop, &bdata); + if (len < 0) + break; + (void) ptree_init_propinfo(&propinfo, + PTREE_PROPINFO_VERSION, PICL_PTYPE_BYTEARRAY, + PICL_READ, len, di_val, NULL, NULL); + (void) ptree_create_and_add_prop(nodeh, &propinfo, + bdata, NULL); + } + break; + case DI_PROP_TYPE_UNKNOWN: + break; + case DI_PROP_TYPE_UNDEF_IT: + break; + default: + break; + } + } +} + +/* + * Create a picl node of type cpu and fill it. + * properties are filled from both the device tree and the + * Machine description. 
+ */ +static int +construct_cpu_node(picl_nodehdl_t plath, di_node_t dn) +{ + int err; + char *nodename; + picl_nodehdl_t anodeh; + + nodename = di_node_name(dn); /* PICL_PROP_NAME */ + + err = ptree_create_and_add_node(plath, nodename, PICL_CLASS_CPU, + &anodeh); + if (err != PICL_SUCCESS) + return (err); + + add_devinfo_props(anodeh, dn); + (void) add_cpu_prop(anodeh, NULL); + + return (err); +} + +/* + * Given a devinfo node find its reg property. + */ +static int +get_reg_prop(di_node_t dn, int **pdata) +{ + int dret = 0; + + dret = di_prop_lookup_ints(DDI_DEV_T_ANY, dn, OBP_REG, pdata); + if (dret > 0) + return (dret); + + if (!ph) + return (0); + dret = di_prom_prop_lookup_ints(ph, dn, OBP_REG, pdata); + return (dret < 0? 0 : dret); +} + +/* + * Given a devinfo cpu node find its cpuid property. + */ +int +get_cpuid(di_node_t di_node) +{ + int len; + int *idata; + int dcpuid = -1; + + len = get_reg_prop(di_node, &idata); + + if (len != SUN4V_CPU_REGSIZE) + return (dcpuid); + if (len == SUN4V_CPU_REGSIZE) + dcpuid = CFGHDL_TO_CPUID(idata[0]); + + return (dcpuid); +} + +int +find_cpu(di_node_t node, int cpuid) +{ + int dcpuid; + di_node_t cnode; + char *nodename; + + for (cnode = di_child_node(node); cnode != DI_NODE_NIL; + cnode = di_sibling_node(cnode)) { + nodename = di_node_name(cnode); + if (nodename == NULL) + continue; + if (strcmp(nodename, OBP_CPU) == 0) { + dcpuid = get_cpuid(cnode); + if (dcpuid == cpuid) { + return (1); + } + } + } + return (0); +} + +/* + * Callback to the ptree walk function during remove_cpus. + * As a part of the args receives a picl nodeh, searches + * the device tree for a cpu whose cpuid matches the picl cpu node. + * Sets arg struct's result to 1 if it failed to match and terminates + * the walk. 
+ */ +static int +remove_cpu_candidate(picl_nodehdl_t nodeh, void *c_args) +{ + di_node_t di_node; + cpu_lookup_t *cpu_arg; + int err; + int pcpuid; + int reg_prop[SUN4V_CPU_REGSIZE]; + + if (c_args == NULL) + return (PICL_INVALIDARG); + + cpu_arg = c_args; + di_node = cpu_arg->di_node; + + err = ptree_get_propval_by_name(nodeh, OBP_REG, reg_prop, + sizeof (reg_prop)); + + if (err != PICL_SUCCESS) { + return (PICL_WALK_CONTINUE); + } + + pcpuid = CFGHDL_TO_CPUID(reg_prop[0]); + + if (!find_cpu(di_node, pcpuid)) { + cpu_arg->result = 1; + cpu_arg->nodeh = nodeh; + return (PICL_WALK_TERMINATE); + } + + cpu_arg->result = 0; + return (PICL_WALK_CONTINUE); +} + +/* + * Given the start node of the device tree. + * find all cpus in the picl tree that don't have + * device tree counterparts and remove them. + */ +static void +remove_cpus(di_node_t di_start) +{ + int err; + picl_nodehdl_t plath; + cpu_lookup_t cpu_arg; + + err = ptree_get_node_by_path(PLATFORM_PATH, &plath); + if (err != PICL_SUCCESS) + return; + + do { + cpu_arg.di_node = di_start; + cpu_arg.nodeh = 0; + cpu_arg.result = 0; + + if (ptree_walk_tree_by_class(plath, + PICL_CLASS_CPU, &cpu_arg, remove_cpu_candidate) + != PICL_SUCCESS) + return; + + if (cpu_arg.result == 1) { + err = ptree_delete_node(cpu_arg.nodeh); + if (err == PICL_SUCCESS) + ptree_destroy_node(cpu_arg.nodeh); + } + } while (cpu_arg.result); +} + +/* + * Callback to the ptree walk function during add_cpus. + * As a part of the args receives a cpu di_node, compares + * each picl cpu node's cpuid to the device tree node's cpuid. + * Sets arg struct's result to 1 on a match. 
+ */ +static int +cpu_exists(picl_nodehdl_t nodeh, void *c_args) +{ + di_node_t di_node; + cpu_lookup_t *cpu_arg; + int err; + int dcpuid, pcpuid; + int reg_prop[4]; + + if (c_args == NULL) + return (PICL_INVALIDARG); + + cpu_arg = c_args; + di_node = cpu_arg->di_node; + dcpuid = get_cpuid(di_node); + + err = ptree_get_propval_by_name(nodeh, OBP_REG, reg_prop, + sizeof (reg_prop)); + + if (err != PICL_SUCCESS) + return (PICL_WALK_CONTINUE); + + pcpuid = CFGHDL_TO_CPUID(reg_prop[0]); + + if (dcpuid == pcpuid) { + cpu_arg->result = 1; + return (PICL_WALK_TERMINATE); + } + + cpu_arg->result = 0; + return (PICL_WALK_CONTINUE); +} + +/* + * Given the root node of the device tree. + * compare it to the picl tree and add to it cpus + * that are new. + */ +static void +add_cpus(di_node_t di_node) +{ + int err; + di_node_t cnode; + picl_nodehdl_t plath; + cpu_lookup_t cpu_arg; + char *nodename; + + err = ptree_get_node_by_path(PLATFORM_PATH, &plath); + if (err != PICL_SUCCESS) + return; + + for (cnode = di_child_node(di_node); cnode != DI_NODE_NIL; + cnode = di_sibling_node(cnode)) { + nodename = di_node_name(cnode); + if (nodename == NULL) + continue; + if (strcmp(nodename, OBP_CPU) == 0) { + cpu_arg.di_node = cnode; + + if (ptree_walk_tree_by_class(plath, + PICL_CLASS_CPU, &cpu_arg, cpu_exists) + != PICL_SUCCESS) + return; + + if (cpu_arg.result == 0) + /* + * Didn't find a matching cpu, add it. + */ + (void) construct_cpu_node(plath, + cnode); + } + } +} + +/* + * Handle DR events. Only supports cpu add and remove. 
+ */ +int +update_devices(char *dev, int op) +{ + di_node_t di_root; + + if ((di_root = di_init("/", DINFOCPYALL)) == DI_NODE_NIL) + return (PICL_FAILURE); + + if ((ph = di_prom_init()) == NULL) + return (PICL_FAILURE); + + if (op == DEV_ADD) { + if (strcmp(dev, OBP_CPU) == 0) + add_cpus(di_root); + } + + if (op == DEV_REMOVE) { + if (strcmp(dev, OBP_CPU) == 0) + remove_cpus(di_root); + } + + di_fini(di_root); + di_prom_fini(ph); + return (PICL_SUCCESS); +} diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c b/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c index 8b6a7f2af3..d9e52a293f 100644 --- a/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -43,17 +43,20 @@ #define SIZE 8192 static void mdesc_free(void *bufp, size_t size); +uint8_t *md_bufp; md_t * mdesc_devinit(void) { int fh; - uint8_t *bufp = NULL; int res; int size; int offset; md_t *mdp; + if (md_bufp != NULL) + return (NULL); + fh = open(MDESC_PATH, O_RDONLY, 0); if (fh < 0) { return (NULL); @@ -62,8 +65,8 @@ mdesc_devinit(void) size = SIZE; /* initial size */ offset = 0; - bufp = malloc(size); - if (NULL == bufp) { + md_bufp = malloc(size); + if (NULL == md_bufp) { return (NULL); } @@ -76,18 +79,18 @@ mdesc_devinit(void) while (len < SIZE) { size += SIZE; - bufp = realloc(bufp, size); - if (NULL == bufp) + md_bufp = realloc(md_bufp, size); + if (NULL == md_bufp) return (NULL); len = size - offset; } do { - res = read(fh, bufp+offset, len); + res = read(fh, md_bufp + offset, len); } while ((res < 0) && (errno == EAGAIN)); if (res < 0) { - free(bufp); + free(md_bufp); return (NULL); } @@ -96,13 +99,13 @@ mdesc_devinit(void) (void) close(fh); - bufp = realloc(bufp, offset); - if (NULL == bufp) + md_bufp = realloc(md_bufp, offset); + if (NULL == md_bufp) return (NULL); - mdp = md_init_intern((uint64_t *)bufp, malloc, mdesc_free); + mdp = md_init_intern((uint64_t *)md_bufp, malloc, mdesc_free); if (NULL == mdp) { - free(bufp); + free(md_bufp); return (NULL); } @@ -113,5 +116,17 @@ mdesc_devinit(void) void mdesc_free(void *bufp, size_t size) { - free(bufp); + if (bufp) + free(bufp); +} + +void +mdesc_devfini(md_t *mdp) +{ + if (mdp) + (void) md_fini(mdp); + + if (md_bufp) + free(md_bufp); + md_bufp = NULL; } diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c index 3af30d6678..1883993934 100644 --- a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the 
"License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,6 +50,8 @@ void mdescplugin_fini(void); extern int add_cpu_prop(picl_nodehdl_t node, void *args); extern int disk_discovery(void); extern md_t *mdesc_devinit(void); +extern void mdesc_devfini(md_t *mdp); +extern int update_devices(char *dev, int op); picld_plugin_reg_t mdescplugin_reg = { PICLD_PLUGIN_VERSION_1, @@ -91,6 +93,65 @@ find_disk(picl_nodehdl_t node, void *args) } /* + * DR event handler + * respond to the picl events: + * PICLEVENT_DR_AP_STATE_CHANGE + */ +static void +dr_handler(const char *ename, const void *earg, size_t size, void *cookie) +{ + nvlist_t *nvlp = NULL; + char *dtype; + char *ap_id; + char *hint; + + + if (strcmp(ename, PICLEVENT_DR_AP_STATE_CHANGE) != 0) { + return; + } + + if (nvlist_unpack((char *)earg, size, &nvlp, NULL)) { + return; + } + + if (nvlist_lookup_string(nvlp, PICLEVENTARG_DATA_TYPE, &dtype)) { + nvlist_free(nvlp); + return; + } + + if (strcmp(dtype, PICLEVENTARG_PICLEVENT_DATA) != 0) { + nvlist_free(nvlp); + return; + } + + if (nvlist_lookup_string(nvlp, PICLEVENTARG_AP_ID, &ap_id)) { + nvlist_free(nvlp); + return; + } + + if (nvlist_lookup_string(nvlp, PICLEVENTARG_HINT, &hint)) { + nvlist_free(nvlp); + return; + } + + mdp = mdesc_devinit(); + if (mdp == NULL) { + nvlist_free(nvlp); + return; + } + + rootnode = md_root_node(mdp); + + if (strcmp(hint, DR_HINT_INSERT) == 0) + (void) update_devices(ap_id, DEV_ADD); + else if (strcmp(hint, DR_HINT_REMOVE) == 0) + (void) 
update_devices(ap_id, DEV_REMOVE); + + mdesc_devfini(mdp); + nvlist_free(nvlp); +} + +/* * Discovery event handler * respond to the picl events: * PICLEVENT_SYSEVENT_DEVICE_ADDED @@ -170,8 +231,10 @@ mdescplugin_init(void) dsc_handler, NULL); (void) ptree_register_handler(PICLEVENT_SYSEVENT_DEVICE_REMOVED, dsc_handler, NULL); + (void) ptree_register_handler(PICLEVENT_DR_AP_STATE_CHANGE, + dr_handler, NULL); - (void) md_fini(mdp); + mdesc_devfini(mdp); } void @@ -182,6 +245,8 @@ mdescplugin_fini(void) dsc_handler, NULL); (void) ptree_unregister_handler(PICLEVENT_SYSEVENT_DEVICE_REMOVED, dsc_handler, NULL); + (void) ptree_unregister_handler(PICLEVENT_DR_AP_STATE_CHANGE, + dr_handler, NULL); } void diff --git a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h index 073e9ff825..437fbecbbf 100644 --- a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h +++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,6 +29,10 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#ifdef __cplusplus +extern "C" { +#endif + #include <picl.h> #include <picltree.h> #include <picldefs.h> @@ -50,6 +54,9 @@ #include <dirent.h> #include <config_admin.h> #include <sys/param.h> +#include <libdevinfo.h> +#include <sys/systeminfo.h> +#include <sys/sysevent/dr.h> #define MAXSTRLEN 256 #define ICACHE_FLAG 0x01 @@ -58,5 +65,13 @@ #define DISK_DISCOVERY_NAME "disk_discovery" #define CONFIGURED "configured" #define UNCONFIGURED "unconfigured" +#define DEV_ADD 0 +#define DEV_REMOVE 1 +#define SUN4V_CPU_REGSIZE 4 +#define CFGHDL_TO_CPUID(x) (x & ~(0xful << 28)) + +#ifdef __cplusplus +} +#endif #endif /* _MDESCPLUGIN_H */ diff --git a/usr/src/cmd/vntsd/Makefile b/usr/src/cmd/vntsd/Makefile new file mode 100644 index 0000000000..9cbd356516 --- /dev/null +++ b/usr/src/cmd/vntsd/Makefile @@ -0,0 +1,74 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +PROG = vntsd +SRCS = cmd.c common.c console.c listen.c queue.c read.c vntsd.c vntsdvcc.c \ + write.c +OBJS = $(SRCS:.c=.o) + +include ../Makefile.cmd + +POFILES = $(SRCS:.c=.po) +POFILE = $(PROG)_msg.po + +MANIFEST = vntsd.xml +SVCMETHOD = svc-vntsd + +CFLAGS += $(CCVERBOSE) + +LDLIBS += -lsocket -lnsl + +ROOTCMDDIR = $(ROOTLIB)/ldoms +ROOTMANIFESTDIR = $(ROOTSVCPLATFORMSUN4V) +$(ROOTMANIFEST) := FILEMODE = 0444 + + +.KEEP_STATE: + +all: $(PROG) + +install: all \ + $(ROOTCMD) \ + $(ROOTMANIFEST) \ + $(ROOTSVCMETHOD) + +$(PROG): $(OBJS) + $(LINK.c) $(OBJS) -o $@ $(LDLIBS) + $(POST_PROCESS) + +$(POFILE): $(POFILES) + $(RM) $@ + $(CAT) $(POFILES) > $@ + +check: $(CHKMANIFEST) + +lint: lint_SRCS + +clean: + $(RM) $(OBJS) + +include ../Makefile.targ diff --git a/usr/src/cmd/vntsd/chars.h b/usr/src/cmd/vntsd/chars.h new file mode 100644 index 0000000000..66abce66b7 --- /dev/null +++ b/usr/src/cmd/vntsd/chars.h @@ -0,0 +1,87 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _CHARS_H +#define _CHARS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define CR 13 +#define LF 10 + + +/* telnet protocol command support */ +#define BEL 7 /* not support */ +#define BS 8 /* supported */ +#define HT 9 /* eoln */ +#define VT 11 /* not support */ +#define FF 12 /* not support */ +#define STOP 18 +#define START 19 + +#define SE 240 /* end of subnegotiation params */ +#define NOP 241 +#define DM 242 /* Data Mark not support */ +#define BRK 243 /* termial support */ +#define IP 244 /* control-C */ +#define AO 245 /* abort output not support */ +#define AYT 246 /* Are you there */ +#define EC 247 /* Erase character - not support */ +#define EL 248 /* Erase line - not support */ +#define GA 249 /* Go ahead. */ +#define SB 250 /* Subnegotiation of the indicated option */ +#define WILL 251 /* will do */ +#define WONT 252 /* refuse */ +#define DO 253 /* request do */ +#define DONT 254 /* request do not do */ +#define IAC 255 /* command */ + + + +/* telnet options */ + +#define TEL_ECHO 1 +#define SUPRESS 3 +#define STATUS 5 +#define TM 6 /* timing mark - not supported */ +#define TERM_TYPE 24 /* Terminal type -not supported */ +#define WIN_SIZE 31 /* window size - not supported */ +#define TERM_SP 32 /* terminal speed - not supported */ +#define FC 33 /* remote flow control - not supported */ +#define LINEMODE 34 /* line mode */ +#define ENV 36 /* environment variables */ + +#define VNTSD_DAEMON_CMD '~' + +#ifdef __cplusplus +} +#endif + +#endif /* _CHARS_H */ diff --git a/usr/src/cmd/vntsd/cmd.c b/usr/src/cmd/vntsd/cmd.c new file mode 100644 index 0000000000..faabebd613 --- /dev/null +++ b/usr/src/cmd/vntsd/cmd.c @@ -0,0 +1,486 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Vntsd handles two types of special commands, one is telnet + * commands and another is vntsd special commands. + * telnet commands supported are: + * WILL + * WONT + * DO + * DONT + * TEL_ECHO + * SUPRESS + * LINEMODE + * BRK + * AYT + * HT + * + * Vntsd special commands are: + * send break (~#) + * exit (~.) + * force write access (~w) + * cycle console down (~n) + * cycle console up (~p) + * help (~?) 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <thread.h> +#include <ctype.h> +#include <sys/termio.h> +#include <libintl.h> +#include <syslog.h> +#include "vntsd.h" +#include "chars.h" + +char vntsd_eol[] = { CR, LF, 0}; + +typedef int (*e_func_t)(vntsd_client_t *clientp); +/* structure for daemon special cmd */ +typedef struct { + char e_char; /* char to match on */ + char *e_help; /* help string */ + e_func_t e_func; /* command */ +} esctable_t; + +/* genbrk() - send a break to vcc driver */ +static int +genbrk(vntsd_client_t *clientp) +{ + + vntsd_cons_t *consp; + + assert(clientp); + assert(clientp->cons); + + consp = clientp->cons; + D1(stderr, "t@%d genbrk fd=%d sockfd %d\n", thr_self(), + consp->vcc_fd, clientp->sockfd); + + assert(consp->clientpq != NULL); + if (consp->clientpq->handle != clientp) { + /* reader */ + return (vntsd_write_line(clientp, + gettext(VNTSD_NO_WRITE_ACCESS_MSG))); + } + + /* writer */ + if (ioctl(consp->vcc_fd, TCSBRK, NULL)) { + return (VNTSD_ERR_VCC_IOCTL); + } + + return (VNTSD_STATUS_CONTINUE); +} + +/* + * console_forward() - cycle client to the next console + * in the group queue. + */ +static int +console_forward(void) +{ + return (VNTSD_STATUS_MOV_CONS_FORWARD); +} + +/* + * console_backward() - cycle client to the previous + * console in the group queue. + */ +static int +console_backward(void) +{ + return (VNTSD_STATUS_MOV_CONS_BACKWARD); +} + +/* acquire_write() - acquire write access to a console. 
*/ +static int +acquire_write(vntsd_client_t *clientp) +{ + int rv; + int yes_no = 1; + vntsd_cons_t *consp; + + assert(clientp); + consp = clientp->cons; + assert(consp); + + if (consp->clientpq->handle == clientp) { + /* client is a writer */ + if ((rv = vntsd_write_line(clientp, + gettext("You have write permission"))) != + VNTSD_SUCCESS) { + return (rv); + + } + return (VNTSD_STATUS_CONTINUE); + } + + /* message to client */ + if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) + != VNTSD_SUCCESS) { + return (rv); + } + + /* + * TRANSLATION_NOTE + * The following string should be formatted to fit on multiple lines + * assuming a line width of at most 78 characters. There must be no + * trailing newline. + */ + if ((rv = vntsd_write_lines(clientp, + gettext("Warning: another user currently " + "has write permission\nto this console and forcibly removing " + "him/her will terminate\nany current write action and all work " + "will be lost."))) != VNTSD_SUCCESS) { + return (rv); + } + + /* get client yes no */ + if ((rv = vntsd_write_client(clientp, vntsd_eol, + VNTSD_EOL_LEN)) != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_get_yes_no(clientp, + gettext("Would you like to continue?"), + &yes_no)) != VNTSD_SUCCESS) { + return (rv); + } + + if (yes_no == B_FALSE) { + /* client change mind no need to acquire write access */ + return (VNTSD_STATUS_CONTINUE); + } + + return (VNTSD_STATUS_ACQUIRE_WRITER); +} + +/* client_exit() - disconnect client from the console. 
*/ +static int +client_exit(void) +{ + return (VNTSD_STATUS_RESELECT_CONS); +} + +static int daemon_cmd_help(vntsd_client_t *clientp); + +/* table for daemon commands */ + +static esctable_t etable[] = { + + /* send a break to vcc */ + {'#', "send break", genbrk}, + + /* exit */ + {'.', "exit from this console", (e_func_t)client_exit}, + + /* acquire write access */ + {'w', "force write access", acquire_write}, + + /* connect to next console in queue */ + {'n', "console down", (e_func_t)console_forward}, + + /* connect to previous console in queue */ + {'p', "console up", (e_func_t)console_backward}, + + /* help must be next to last */ + {'?', "_", daemon_cmd_help}, + + /* table terminator */ + {0, 0, 0} +}; + +void +vntsd_init_esctable_msgs(void) +{ + esctable_t *p; + + for (p = etable; p->e_char != '\0'; p++) { + p->e_help = gettext(p->e_help); + } +} + +/* daemon_cmd_help() - print help. */ +static int +daemon_cmd_help(vntsd_client_t *clientp) +{ + esctable_t *p; + int rv; + char buf[VNTSD_LINE_LEN]; + + if ((rv = vntsd_write_client(clientp, vntsd_eol, + VNTSD_EOL_LEN)) != VNTSD_SUCCESS) { + return (rv); + } + + /* + * TRANSLATION_NOTE + * VNTSD is the name of the VNTS daemon and should not be translated. 
+ */ + if ((rv = vntsd_write_line(clientp, gettext("VNTSD commands"))) != + VNTSD_SUCCESS) { + return (rv); + } + + for (p = etable; p->e_char; p++) { + (void) snprintf(buf, sizeof (buf), + "~%c --%s", p->e_char, p->e_help); + + if ((rv = vntsd_write_line(clientp, buf)) != VNTSD_SUCCESS) { + return (rv); + } + } + + return (VNTSD_STATUS_CONTINUE); +} + +/* exit from daemon command */ +static int +exit_daemon_cmd(vntsd_client_t *clientp, int rv) +{ + (void) mutex_lock(&clientp->lock); + clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD; + (void) mutex_unlock(&clientp->lock); + return (rv); +} + +/* vntsd_process_daemon_cmd() - special commands */ +int +vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c) +{ + esctable_t *p; + int rv; + + if (c != VNTSD_DAEMON_CMD) { + /* not a daemon command */ + return (VNTSD_SUCCESS); + } + + if (clientp->status & VNTSD_CLIENT_DISABLE_DAEMON_CMD) { + return (VNTSD_STATUS_CONTINUE); + } + + /* no reentry to process_daemon_cmd */ + (void) mutex_lock(&clientp->lock); + clientp->status |= VNTSD_CLIENT_DISABLE_DAEMON_CMD; + (void) mutex_unlock(&clientp->lock); + + D3(stderr, "t@%d process_daemon_cmd %d %d \n", thr_self(), + clientp->cons->vcc_fd, clientp->sockfd); + + /* read in command */ + if ((rv = vntsd_read_char(clientp, &c)) != VNTSD_SUCCESS) { + return (exit_daemon_cmd(clientp, rv)); + } + + for (p = etable; p->e_char; p++) { + if (p->e_char == c) { + /* found match */ + assert(p->e_func); + rv = (*p->e_func)(clientp); + return (exit_daemon_cmd(clientp, rv)); + } + } + + /* no match, print out the help */ + p--; + assert(p->e_char == '?'); + rv = (*p->e_func)(clientp); + + return (exit_daemon_cmd(clientp, rv)); + +} + +/* vntsd_set_telnet_options() - change telnet client to character mode. 
*/ +int +vntsd_set_telnet_options(int fd) +{ + /* set client telnet options */ + uint8_t buf[] = {IAC, DONT, LINEMODE, IAC, WILL, SUPRESS, IAC, WILL, + TEL_ECHO, IAC, DONT, TERM_TYPE, IAC, DONT, TERM_SP, + IAC, DONT, STATUS, IAC, DONT, FC, IAC, DONT, TM, IAC, DONT, ENV, + IAC, DONT, WIN_SIZE}; + + return (vntsd_write_fd(fd, (char *)buf, 30)); +} + +/* vntsd_telnet_cmd() process telnet commands */ +int +vntsd_telnet_cmd(vntsd_client_t *clientp, char c) +{ + uint8_t buf[4]; + char cmd; + int rv = VNTSD_STATUS_CONTINUE; + + bzero(buf, 4); + + if ((uint8_t)c != IAC) { + /* not telnet cmd */ + return (VNTSD_SUCCESS); + } + + if ((rv = vntsd_read_char(clientp, &cmd)) != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_read_char(clientp, &c)) != VNTSD_SUCCESS) { + return (rv); + } + + + switch ((uint8_t)cmd) { + + case WILL: + + switch ((uint8_t)c) { + case TEL_ECHO: + case SUPRESS: + case LINEMODE: + break; + default: + syslog(LOG_ERR, "not support telnet WILL %x\n", c); + break; + } + break; + + case WONT: + + switch ((uint8_t)c) { + case TEL_ECHO: + case SUPRESS: + case LINEMODE: + default: + syslog(LOG_ERR, "not support telnet WONT %x\n", c); + break; + } + break; + + case DO: + case DONT: + + buf[0] = IAC; + buf[1] = WILL; + buf[2] = c; + rv = vntsd_write_client(clientp, (char *)buf, 3); + + break; + + case BRK: + + /* send break to vcc */ + rv = genbrk(clientp); + break; + + case IP: + + break; + + case AYT: + + rv = vntsd_write_client(clientp, &c, 1); + break; + + case HT: + return (VNTSD_STATUS_CONTINUE); + + default: + syslog(LOG_ERR, "not support telnet ctrl %x\n", c); + break; + } + + if (rv == VNTSD_SUCCESS) { + return (VNTSD_STATUS_CONTINUE); + } else { + return (rv); + } +} + + +/* + * vntsd_ctrl_cmd() - control keys + * read and write suspend are supported. 
+ */ +int +vntsd_ctrl_cmd(vntsd_client_t *clientp, char c) +{ + int cmd; + + D3(stderr, "t@%d vntsd_ctrl_cmd%d %d\n", thr_self(), + clientp->cons->vcc_fd, clientp->sockfd); + + if ((c != START) && (c != STOP)) { + /* not a supported control command */ + return (VNTSD_SUCCESS); + } + + if (c == START) { + + D3(stderr, "t@%d client restart\n", thr_self()); + + /* send resume read */ + cmd = 1; + + if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) { + return (VNTSD_STATUS_VCC_IO_ERR); + } + + /* send resume write */ + cmd = 3; + + if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) { + return (VNTSD_STATUS_VCC_IO_ERR); + } + } + + if (c == STOP) { + D3(stderr, "t@%d client suspend\n", thr_self()); + + /* send suspend read */ + cmd = 0; + + if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) { + return (VNTSD_STATUS_VCC_IO_ERR); + } + + /* send suspend write */ + cmd = 2; + + if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) { + perror("ioctl TCXONC"); + return (VNTSD_STATUS_VCC_IO_ERR); + } + } + + return (VNTSD_STATUS_CONTINUE); +} diff --git a/usr/src/cmd/vntsd/common.c b/usr/src/cmd/vntsd/common.c new file mode 100644 index 0000000000..2cbad73309 --- /dev/null +++ b/usr/src/cmd/vntsd/common.c @@ -0,0 +1,654 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * supporting modules. + */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <sys/poll.h> +#include <wait.h> +#include <time.h> +#include <netinet/in.h> +#include <thread.h> +#include <signal.h> +#include <ctype.h> +#include <langinfo.h> +#include <libintl.h> +#include <syslog.h> +#include "vntsd.h" +#include "chars.h" + +/* vntsd_write_line() - write a line to TCP client */ +int +vntsd_write_line(vntsd_client_t *clientp, char *line) +{ + int rv; + + rv = vntsd_write_client(clientp, line, strlen(line)); + if (rv == VNTSD_SUCCESS) { + rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN); + } + + return (rv); +} + +/* vntsd_write_lines() write one or more lines to client. 
*/ +int +vntsd_write_lines(vntsd_client_t *clientp, char *lines) +{ + char *buf; + char *line; + char *endofline; + + buf = strdup(lines); + if (buf == NULL) { + return (VNTSD_ERR_NO_MEM); + } + + line = buf; + + while ((line != NULL) && (*line != '\0')) { + + endofline = strchr(line, '\n'); + if (endofline != NULL) { + *endofline = '\0'; + } + + (void) vntsd_write_line(clientp, line); + + if (endofline != NULL) + line = endofline + 1; + else + line = NULL; + } + + free(buf); + return (VNTSD_SUCCESS); +} + +/* vntsd_get_yes_no() - read in a "y" or "n" */ +int +vntsd_get_yes_no(vntsd_client_t *clientp, char *msg, int *yes_no) +{ + char c; + char yesno[8]; + int rv; + + /* create [y/n] prompt */ + (void) snprintf(yesno, sizeof (yesno), "[%c/%c] ", + *nl_langinfo(YESSTR), *nl_langinfo(NOSTR)); + + for (; ; ) { + if ((rv = vntsd_write_client(clientp, msg, strlen(msg))) + != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_write_client(clientp, yesno, strlen(yesno))) != + VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_read_data(clientp, &c)) + != VNTSD_SUCCESS) { + return (rv); + } + + /* echo */ + if ((rv = vntsd_write_client(clientp, &c, 1)) != + VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_write_client(clientp, vntsd_eol, + VNTSD_EOL_LEN)) != + VNTSD_SUCCESS) { + return (rv); + } + + c = tolower(c); + + if (c == *nl_langinfo(YESSTR)) { + *yes_no = B_TRUE; + return (VNTSD_SUCCESS); + } + + if (c == *nl_langinfo(NOSTR)) { + *yes_no = B_FALSE; + return (VNTSD_SUCCESS); + } + + if ((rv = vntsd_write_line(clientp, + gettext("Invalid response. 
Try again."))) + != VNTSD_SUCCESS) { + return (rv); + } + } + + /*NOTREACHED*/ + return (0); +} + +/* vntsd_open_vcc() - open a vcc port */ +int +vntsd_open_vcc(char *dev_name, uint_t cons_no) +{ + int drvfd; + int sz; + char *path; + sz = strlen(VCC_DEVICE_PATH) + strlen(dev_name)+1; + + path = calloc(sz, 1); + + if (path == NULL) { + return (-1); + } + + (void) snprintf(path, sz-1, VCC_DEVICE_PATH, dev_name); + + for (; ; ) { + drvfd = open(path, O_RDWR); + + if ((drvfd < 0) && (errno == EAGAIN)) { + if (vntsd_vcc_ioctl(VCC_FORCE_CLOSE, cons_no, &cons_no) + != VNTSD_SUCCESS) { + break; + } + } else { + break; + } + } + + + if (drvfd < 0) { + D1(stderr, "t@%d open_vcc@%s exit\n", thr_self(), dev_name); + free(path); + return (-1); + } + + free(path); + return (drvfd); +} + +/* vntsd_cons_by_consno() - match a console structure to cons no */ +boolean_t +vntsd_cons_by_consno(vntsd_cons_t *consp, int *cons_id) +{ + if (consp->status & VNTSD_CONS_DELETED) { + return (B_FALSE); + } + return (consp->cons_no == *cons_id); +} + +/* vntsd_write_client() write to telnet client */ +int +vntsd_write_client(vntsd_client_t *client, char *buffer, size_t sz) +{ + int rv; + + + /* write to client */ + rv = vntsd_write_fd(client->sockfd, buffer, sz); + + /* client has output, reset timer */ + vntsd_reset_timer(client->cons_tid); + + return (rv); +} + +/* vntsd_write_fd() write to tcp socket file descriptor */ +int +vntsd_write_fd(int fd, void *buf, size_t sz) +{ + int n; + + while (sz > 0) { + n = write(fd, buf, sz); + if (n < 0) { + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + + return (VNTSD_STATUS_CLIENT_QUIT); + } + + if (n == 0) { + return (VNTSD_STATUS_CLIENT_QUIT); + } + + buf = (caddr_t)buf + n; + sz -= n; + } + return (VNTSD_SUCCESS); + +} + +/* + * vntsd_read_char() - read a char from TCP Clienti. 
Returns: + * VNTSD_SUCCESS, VNTSD_STATUS_CLIENT_QUIT or VNTSD_STATUS_INTR + */ +int +vntsd_read_char(vntsd_client_t *clientp, char *c) +{ + int n; + vntsd_timeout_t tmo; + int rv; + + tmo.tid = thr_self(); + tmo.minutes = 0; + tmo.clientp = clientp; + + /* attach to timer */ + if ((rv = vntsd_attach_timer(&tmo)) != VNTSD_SUCCESS) { + return (rv); + } + + n = read(clientp->sockfd, c, 1); + + /* detach from timer */ + if ((rv = vntsd_detach_timer(&tmo)) != VNTSD_SUCCESS) { + return (rv); + } + + if (n == 1) { + return (VNTSD_SUCCESS); + } + + if (n == 0) { + return (VNTSD_STATUS_CLIENT_QUIT); + } + + /* + * read error or wake up by signal, either console is being removed or + * timeout occurs. + */ + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + + /* any other error, we close client */ + return (VNTSD_STATUS_CLIENT_QUIT); +} + +/* + * vntsd_read_data() - handle special commands + * such as telnet, daemon and ctrl cmds. Returns: + * from vntsd_read_char: + * VNTSD_STATUS_CLIENT_QUIT + * VNTSD_STATUS_INTR + * from vnts_process_daemon_cmd: + * VNTSD_STATUS_RESELECT_CONS + * VNTSD_STATUS_MOV_CONS_FORWARD + * VNTSD_STATUS_MOV_CONS_BACKWARD + * VNTSD_STATUS_ACQURE_WRITER + * VNTSD_STATUS_CONTINUE + * from vntsd_telnet_cmd + * VNTSD_STATUS_CONTINUE + */ +int +vntsd_read_data(vntsd_client_t *clientp, char *c) +{ + int rv; + + for (; ; ) { + if ((rv = vntsd_read_char(clientp, c)) != VNTSD_SUCCESS) { + return (rv); + } + + /* daemon cmd? */ + rv = vntsd_process_daemon_cmd(clientp, *c); + + if (rv == VNTSD_SUCCESS) { + /* telnet cmd? 
*/ + rv = vntsd_telnet_cmd(clientp, *c); + } + + if (rv == VNTSD_STATUS_CONTINUE) { + continue; + } + + return (rv); + } + + /*NOTREACHED*/ + return (0); +} +/* vntsd_read_line() - read a line from TCP client */ +int +vntsd_read_line(vntsd_client_t *clientp, char *buf, int *in_sz) +{ + char c; + int rv; + int out_sz = 0; + + + for (; ; ) { + + if ((rv = vntsd_read_data(clientp, &c)) != VNTSD_SUCCESS) { + return (rv); + } + + if (c == BS) { + /* back */ + if ((rv = vntsd_write_client(clientp, &c, 1)) != + VNTSD_SUCCESS) { + return (rv); + } + + c = ' '; + if ((rv = vntsd_write_client(clientp, &c, 1)) != + VNTSD_SUCCESS) { + return (rv); + } + + buf--; + out_sz--; + continue; + } + /* echo */ + if ((rv = vntsd_write_client(clientp, &c, 1)) != + VNTSD_SUCCESS) { + return (rv); + } + + *buf++ = c; + out_sz++; + + if (c == CR) { + /* end of line */ + *in_sz = out_sz; + return (VNTSD_SUCCESS); + } + + if (out_sz == *in_sz) { + return (VNTSD_SUCCESS); + } + } + + /*NOTREACHED*/ + return (0); +} + +/* free a client */ +void +vntsd_free_client(vntsd_client_t *clientp) +{ + + if (clientp->sockfd != -1) { + (void) close(clientp->sockfd); + } + + (void) mutex_destroy(&clientp->lock); + + free(clientp); +} + + +/* check if a vcc console port still ok */ +boolean_t +vntsd_vcc_cons_alive(vntsd_cons_t *consp) +{ + vcc_console_t vcc_cons; + int rv; + + assert(consp); + assert(consp->group); + + /* construct current configuration */ + (void) strncpy(vcc_cons.domain_name, consp->domain_name, MAXPATHLEN); + (void) strncpy(vcc_cons.group_name, consp->group->group_name, + MAXPATHLEN); + vcc_cons.tcp_port = consp->group->tcp_port; + vcc_cons.cons_no = consp->cons_no; + + /* call vcc to verify */ + rv = vntsd_vcc_ioctl(VCC_CONS_STATUS, consp->cons_no, &vcc_cons); + if (rv != VNTSD_SUCCESS) { + return (B_FALSE); + } + + if (vcc_cons.cons_no == -1) { + /* port is gone */ + return (B_FALSE); + } + + /* port is ok */ + return (B_TRUE); + +} + +/* add to total if a console is alive */ +static 
boolean_t +total_cons(vntsd_cons_t *consp, int *num_cons) +{ + int rv; + + assert(consp->group); + rv = vntsd_vcc_err(consp); + if (rv == VNTSD_STATUS_CONTINUE) { + (*num_cons)++; + } + return (B_FALSE); +} + + +/* total alive consoles in a group */ +int +vntsd_chk_group_total_cons(vntsd_group_t *groupp) +{ + uint_t num_cons = 0; + + (void) vntsd_que_find(groupp->conspq, (compare_func_t)total_cons, + &num_cons); + return (num_cons); +} + +/* vntsd_log() log function for errors */ +void +vntsd_log(vntsd_status_t status, char *msg) +{ + char *status_msg = NULL; + int critical = 0; + + switch (status) { + + case VNTSD_SUCCESS: + status_msg = "STATUS_OK"; + break; + + case VNTSD_STATUS_CONTINUE: + status_msg = "CONTINUE"; + break; + + case VNTSD_STATUS_EXIT_SIG: + critical = 1; + status_msg = "KILL SIGNAL RECV"; + break; + + case VNTSD_STATUS_SIG: + status_msg = "SIG RECV"; + break; + + case VNTSD_STATUS_NO_HOST_NAME: + status_msg = "Warining NO HOST NAME"; + break; + + case VNTSD_STATUS_CLIENT_QUIT: + status_msg = "CLIENT CLOSED GROUP CONNECTION"; + break; + + case VNTSD_STATUS_RESELECT_CONS: + status_msg = "CLIENT RESELECTS CONSOLE"; + break; + + case VNTSD_STATUS_VCC_IO_ERR: + status_msg = "CONSOLE WAS DELETED"; + break; + + case VNTSD_STATUS_MOV_CONS_FORWARD: + status_msg = "MOVE CONSOLE FORWARD"; + break; + + case VNTSD_STATUS_MOV_CONS_BACKWARD: + status_msg = "MOVE CONSOLE BACKWARD"; + break; + + case VNTSD_STATUS_ACQUIRE_WRITER: + status_msg = "FORCE CONSOLE WRITE"; + break; + + case VNTSD_STATUS_INTR: + status_msg = "RECV SIGNAL"; + break; + + case VNTSD_STATUS_DISCONN_CONS: + status_msg = "DELETING CONSOLE"; + break; + + case VNTSD_STATUS_NO_CONS: + status_msg = "GROUP HAS NO CONSOLE"; + break; + + case VNTSD_ERR_NO_MEM: + critical = 1; + status_msg = "NO MEMORY"; + break; + + case VNTSD_ERR_NO_DRV: + critical = 1; + status_msg = "NO VCC DRIVER"; + break; + + case VNTSD_ERR_WRITE_CLIENT: + status_msg = "WRITE CLIENT ERR"; + break; + + case 
VNTSD_ERR_EL_NOT_FOUND: + critical = 1; + status_msg = "ELEMENT_NOT_FOUND"; + break; + + case VNTSD_ERR_VCC_CTRL_DATA: + critical = 1; + status_msg = "VCC CTRL DATA ERROR"; + break; + + case VNTSD_ERR_VCC_POLL: + critical = 1; + status_msg = "VCC POLL ERROR"; + break; + + case VNTSD_ERR_VCC_IOCTL: + critical = 1; + status_msg = "VCC IOCTL ERROR"; + break; + + case VNTSD_ERR_VCC_GRP_NAME: + critical = 1; + status_msg = "VCC GROUP NAME ERROR"; + break; + + case VNTSD_ERR_CREATE_LISTEN_THR: + critical = 1; + status_msg = "FAIL TO CREATE LISTEN THREAD"; + break; + + case VNTSD_ERR_CREATE_WR_THR: + critical = 1; + status_msg = "FAIL TO CREATE WRITE THREAD"; + break; + + case VNTSD_ERR_ADD_CONS_FAILED: + critical = 1; + status_msg = "FAIL TO ADD A CONSOLE"; + break; + + case VNTSD_ERR_LISTEN_SOCKET: + critical = 1; + status_msg = "LISTEN SOCKET ERROR"; + break; + + case VNTSD_ERR_LISTEN_OPTS: + critical = 1; + status_msg = "SET SOCKET OPTIONS ERROR"; + break; + + case VNTSD_ERR_LISTEN_BIND: + critical = 1; + status_msg = "BIND SOCKET ERROR"; + break; + + case VNTSD_STATUS_ACCEPT_ERR: + critical = 1; + status_msg = "LISTEN ACCEPT ERROR"; + break; + + case VNTSD_ERR_CREATE_CONS_THR: + critical = 1; + status_msg = "CREATE CONSOLE THREAD ERROR "; + break; + + case VNTSD_ERR_SIG: + critical = 1; + status_msg = "RECV UNKNOWN SIG"; + break; + + case VNTSD_ERR_UNKNOWN_CMD: + critical = 1; + status_msg = "RECV UNKNOWN COMMAND"; + break; + + case VNTSD_ERR_CLIENT_TIMEOUT: + status_msg = "CLOSE CLIENT BECAUSE TIMEOUT"; + break; + default: + status_msg = "Unknown status recv"; + break; + } + + + if (critical) { + syslog(LOG_ERR, "%s: thread[%d] %s\n", status_msg, + thr_self(), msg); + } +#ifdef DEBUG + DERR(stderr, "%s: thread[%d] %s\n", status_msg, + thr_self(), msg); + syslog(LOG_ERR, "%s: thread[%d] %s\n", status_msg, thr_self(), msg); +#endif +} diff --git a/usr/src/cmd/vntsd/console.c b/usr/src/cmd/vntsd/console.c new file mode 100644 index 0000000000..4b7c145e0e --- /dev/null 
+++ b/usr/src/cmd/vntsd/console.c @@ -0,0 +1,721 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Listen thread creates a console thread whenever there is a tcp client + * made a conection to its port. In the console thread, if there are + * multiple consoles in the group, client will be asked for a console selection. + * a write thread for a console is created when first client connects to a + * selected console and console thread becomes read thread for the client. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <thread.h> +#include <synch.h> +#include <signal.h> +#include <assert.h> +#include <ctype.h> +#include <syslog.h> +#include <libintl.h> +#include <netdb.h> +#include "vntsd.h" +#include "chars.h" + +/* display domain names in the group */ +static boolean_t +display_domain_name(vntsd_cons_t *consp, int *fd) +{ + char buf[VNTSD_LINE_LEN]; + char *status; + + + if (consp->clientpq != NULL) { + status = gettext("connected"); + } else if (consp->status & VNTSD_CONS_DELETED) { + status = gettext("removing..."); + } else { + status = gettext("online"); + } + + (void) snprintf(buf, sizeof (buf), "%-20d%-30s%-25s%s", + consp->cons_no, consp->domain_name, status, vntsd_eol); + + return (vntsd_write_fd(*fd, buf, strlen(buf)) != VNTSD_SUCCESS); +} + +/* output connected message to tcp client */ +static int +write_connect_msg(vntsd_client_t *clientp, char *group_name, + char *domain_name) +{ + + int rv = VNTSD_SUCCESS; + char buf[VNTSD_LINE_LEN]; + + if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) != + VNTSD_SUCCESS) { + return (rv); + } + + (void) snprintf(buf, sizeof (buf), + gettext("Connecting to console \"%s\" in group \"%s\" ...."), + domain_name, group_name); + + if ((rv = vntsd_write_line(clientp, buf)) != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_write_line(clientp, + gettext("Press ~? 
for control options .."))) != + VNTSD_SUCCESS) { + return (rv); + } + + return (VNTSD_SUCCESS); +} + +static int +create_write_thread(vntsd_cons_t *consp) +{ + + assert(consp); + + /* create write thread for the console */ + (void) mutex_lock(&consp->lock); + if (thr_create(NULL, 0, (thr_func_t)vntsd_write_thread, + (void *)consp, NULL, &consp->wr_tid)) { + + DERR(stderr, "t@%d create_rd_wr_thread@%d: " + "create write thread failed\n", + thr_self(), consp->cons_no); + (void) close(consp->vcc_fd); + consp->vcc_fd = -1; + (void) mutex_unlock(&consp->lock); + + return (VNTSD_ERR_CREATE_WR_THR); + } + (void) mutex_unlock(&consp->lock); + return (VNTSD_SUCCESS); +} + +/* Display all domain consoles in a group. */ +static int +list_all_domains(vntsd_group_t *groupp, vntsd_client_t *clientp) +{ + char vntsd_line[VNTSD_LINE_LEN]; + int rv = VNTSD_SUCCESS; + + if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) + != VNTSD_SUCCESS) { + return (rv); + } + + /* + * TRANSLATION_NOTE + * The following three strings of the form "DOMAIN .." are table + * headers and should be all uppercase. + */ + (void) snprintf(vntsd_line, sizeof (vntsd_line), + "%-20s%-30s%-25s", + gettext("DOMAIN ID"), gettext("DOMAIN NAME"), + gettext("DOMAIN STATE")); + + if ((rv = vntsd_write_line(clientp, vntsd_line)) != VNTSD_SUCCESS) { + return (rv); + } + + (void) mutex_lock(&groupp->lock); + + if (vntsd_que_find(groupp->conspq, (compare_func_t)display_domain_name, + &(clientp->sockfd)) != NULL) { + rv = VNTSD_ERR_WRITE_CLIENT; + } + + (void) mutex_unlock(&groupp->lock); + + return (rv); +} + +/* display help */ +static int +display_help(vntsd_client_t *clientp) +{ + int rv = VNTSD_SUCCESS; + char *bufp; + + if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) + != VNTSD_SUCCESS) { + return (rv); + } + + /* + * TRANSLATION_NOTE + * The following three strings of the form ". -- ..." are help + * messages for single character commands. 
Do not translate the + * character before the --. + */ + bufp = gettext("h -- this help)"); + + if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) { + return (rv); + } + + bufp = gettext("l -- list of consoles"); + + if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) { + return (rv); + } + + bufp = gettext("q -- quit"); + + if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) { + return (rv); + } + + /* + * TRANSLATION_NOTE + * In the following string, "id" is a short mnemonic for + * "identifier" and both occurrences should be translated. + */ + + bufp = gettext("[c[c ]]{id} -- connect to console of domain {id}"); + + if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) { + return (rv); + } + + return (VNTSD_SUCCESS); +} + +/* select a console to connect */ +static int +select_cons(vntsd_group_t *groupp, int num_cons, vntsd_cons_t **consp, + vntsd_client_t *clientp, char c) +{ + int cons_no = -2; + int n; + int i; + char buf[VNTSD_LINE_LEN]; + int rv; + + + + (void) mutex_lock(&groupp->lock); + if (groupp->num_cons == 0) { + (void) mutex_unlock(&groupp->lock); + /* no console in this group */ + return (VNTSD_STATUS_NO_CONS); + } + (void) mutex_unlock(&groupp->lock); + + if (num_cons == 1) { + /* by pass selecting console */ + *consp = (vntsd_cons_t *)(groupp->conspq->handle); + return (VNTSD_SUCCESS); + } + + + if (isdigit(c)) { + /* {id} input */ + cons_no = c - '0'; + } else if (c == 'c') { + /* c{id} or c {id} input */ + cons_no = -1; + } else if (!isspace(c)) { + return (VNTSD_ERR_INVALID_INPUT); + } + + /* get client selections */ + n = VNTSD_LINE_LEN; + + if ((rv = vntsd_read_line(clientp, buf, &n)) != VNTSD_SUCCESS) { + return (rv); + } + + /* parse command */ + for (i = 0; i < n; i++) { + if (cons_no == -1) { + /* c{id} */ + cons_no = atoi(buf + i); + break; + } + + if (isspace(buf[i]) && cons_no == -2) { + /* skip space */ + continue; + } + + if (buf[i] == 'c') { + /* c{id} or c {id} */ + cons_no = -1; + } else if 
(buf[i] == CR) {
+			break;
+		} else {
+			return (VNTSD_ERR_INVALID_INPUT);
+		}
+	}
+
+	if (cons_no < 0) {
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+
+	/* get selected console */
+	(void) mutex_lock(&groupp->lock);
+
+	*consp = (vntsd_cons_t *)vntsd_que_find(groupp->conspq,
+	    (compare_func_t)vntsd_cons_by_consno, &cons_no);
+
+	if (*consp == NULL) {
+		/* during console selection, the console has been deleted */
+		(void) mutex_unlock(&groupp->lock);
+
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+	if ((*consp)->status & VNTSD_CONS_DELETED) {
+		/*
+		 * Console is being removed.  Drop the group lock before
+		 * returning; the original code returned with groupp->lock
+		 * still held, which would deadlock the group.
+		 */
+		(void) mutex_unlock(&groupp->lock);
+
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+
+	return (VNTSD_SUCCESS);
+}
+
+/* compare if there is a match console in the group */
+static boolean_t
+find_cons_in_group(vntsd_cons_t *consp_in_group, vntsd_cons_t *consp)
+{
+	if (consp_in_group == consp) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/* connect a client to a console */
+static int
+connect_cons(vntsd_cons_t *consp, vntsd_client_t *clientp)
+{
+	int rv, rv1;
+	vntsd_group_t *groupp;
+
+	assert(consp);
+	groupp = consp->group;
+	assert(groupp);
+	assert(clientp);
+
+	(void) mutex_lock(&groupp->lock);
+
+	/* check if console is valid */
+	consp = vntsd_que_find(groupp->conspq,
+	    (compare_func_t)find_cons_in_group, consp);
+
+	if (consp == NULL) {
+		(void) mutex_unlock(&groupp->lock);
+		return (VNTSD_STATUS_NO_CONS);
+	}
+	if (consp->status & VNTSD_CONS_DELETED) {
+		(void) mutex_unlock(&groupp->lock);
+		return (VNTSD_STATUS_NO_CONS);
+	}
+
+	(void) mutex_lock(&consp->lock);
+	(void) mutex_lock(&clientp->lock);
+
+
+	clientp->cons = consp;
+
+	/* enable daemon cmd */
+	clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+
+	if (consp->clientpq == NULL) {
+		/* first connect to console - a writer */
+		assert(consp->vcc_fd == -1);
+		/* open vcc */
+		consp->vcc_fd = vntsd_open_vcc(consp->dev_name, consp->cons_no);
+		if (consp->vcc_fd < 0) {
+			(void) mutex_unlock(&clientp->lock);
+			(void) mutex_unlock(&consp->lock);
+			(void)
mutex_unlock(&groupp->lock); + assert(consp->group); + return (vntsd_vcc_err(consp)); + } + } + + (void) mutex_unlock(&clientp->lock); + + /* + * move the client from group's no console selected queue + * to cons queue + */ + + rv = vntsd_que_rm(&groupp->no_cons_clientpq, clientp); + assert(rv == VNTSD_SUCCESS); + + rv = vntsd_que_append(&consp->clientpq, clientp); + (void) mutex_unlock(&groupp->lock); + + if (rv != VNTSD_SUCCESS) { + if (consp->clientpq->handle == clientp) { + /* writer */ + (void) close(consp->vcc_fd); + consp->vcc_fd = -1; + } + + (void) mutex_unlock(&consp->lock); + return (rv); + } + + (void) mutex_unlock(&consp->lock); + + if (consp->clientpq->handle == clientp) { + /* create a write thread */ + rv = create_write_thread(consp); + if (rv != VNTSD_SUCCESS) { + return (rv); + } + } + + /* write connecting message */ + if ((rv = write_connect_msg(clientp, consp->group->group_name, + consp->domain_name)) != VNTSD_SUCCESS) { + return (rv); + } + + /* process input from client */ + rv = vntsd_read(clientp); + + /* client disconnected from the console */ + (void) mutex_lock(&groupp->lock); + + /* remove client from console queue */ + (void) mutex_lock(&consp->lock); + rv1 = vntsd_que_rm(&consp->clientpq, clientp); + assert(rv1 == VNTSD_SUCCESS); + + /* append client to group's no console selected queue */ + rv1 = vntsd_que_append(&groupp->no_cons_clientpq, clientp); + (void) mutex_unlock(&groupp->lock); + + if (consp->clientpq == NULL) { + /* clean up console since there is no client connected to it */ + assert(consp->vcc_fd != -1); + + /* close vcc port */ + (void) close(consp->vcc_fd); + consp->vcc_fd = -1; + + /* force write thread to exit */ + assert(consp->wr_tid != (thread_t)-1); + (void) thr_kill(consp->wr_tid, SIGUSR1); + (void) mutex_unlock(&consp->lock); + (void) thr_join(consp->wr_tid, NULL, NULL); + (void) mutex_lock(&consp->lock); + } + + if (consp->status & VNTSD_CONS_SIG_WAIT) { + /* console is waiting for client to disconnect */ + 
(void) cond_signal(&consp->cvp); + } + + (void) mutex_unlock(&consp->lock); + + return (rv1 == VNTSD_SUCCESS ? rv : rv1); + +} + +/* read command line input */ +static int +read_cmd(vntsd_client_t *clientp, char *prompt, char *cmd) +{ + int rv; + + /* disable daemon special command */ + (void) mutex_lock(&clientp->lock); + clientp->status |= VNTSD_CLIENT_DISABLE_DAEMON_CMD; + (void) mutex_unlock(&clientp->lock); + + if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) + != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_write_client(clientp, prompt, strlen(prompt))) + != VNTSD_SUCCESS) { + return (rv); + } + + if ((rv = vntsd_read_data(clientp, cmd)) != VNTSD_SUCCESS) { + return (rv); + } + if (*cmd == BS) { + return (VNTSD_SUCCESS); + } + + rv = vntsd_write_client(clientp, cmd, 1); + + *cmd = tolower(*cmd); + + return (rv); +} + +/* reset client for selecting a console in the group */ +static void +client_init(vntsd_client_t *clientp) +{ + (void) mutex_lock(&clientp->lock); + clientp->cons = NULL; + clientp->status = 0; + (void) mutex_unlock(&clientp->lock); +} + +/* clean up client and exit the thread */ +static void +client_fini(vntsd_group_t *groupp, vntsd_client_t *clientp) +{ + + assert(groupp); + assert(clientp); + + /* disconnct client from tcp port */ + assert(clientp->sockfd != -1); + (void) close(clientp->sockfd); + + (void) mutex_lock(&groupp->lock); + (void) vntsd_que_rm(&groupp->no_cons_clientpq, clientp); + + if ((groupp->no_cons_clientpq == NULL) && + (groupp->status & VNTSD_GROUP_SIG_WAIT)) { + /* group is waiting to be deleted */ + groupp->status &= ~VNTSD_GROUP_SIG_WAIT; + (void) cond_signal(&groupp->cvp); + } + (void) mutex_unlock(&groupp->lock); + + (void) mutex_destroy(&clientp->lock); + free(clientp); + + thr_exit(0); +} + +/* check client's status. 
exit if client quits or fatal errors */ +static void +console_chk_status(vntsd_group_t *groupp, vntsd_client_t *clientp, int status) +{ + char err_msg[VNTSD_LINE_LEN]; + + D1(stderr, "t@%d console_chk_status() status=%d " + "client status=%x num consoles=%d \n", + thr_self(), status, clientp->status, groupp->num_cons); + + (void) snprintf(err_msg, VNTSD_LINE_LEN, "console_chk_status client%d" + " num_cos=%d", clientp->sockfd, groupp->num_cons); + + if (groupp->num_cons == 0) { + /* no more console in the group */ + client_fini(groupp, clientp); + } + + if (status == VNTSD_STATUS_INTR) { + /* reason for signal? */ + status = vntsd_cons_chk_intr(clientp); + } + + switch (status) { + + case VNTSD_STATUS_CLIENT_QUIT: + client_fini(groupp, clientp); + return; + + case VNTSD_STATUS_RESELECT_CONS: + assert(clientp->cons); + if ((groupp->num_cons == 1) && + (groupp->conspq->handle == clientp->cons)) { + /* no other selection available */ + client_fini(groupp, clientp); + } else { + client_init(clientp); + } + return; + + case VNTSD_STATUS_VCC_IO_ERR: + if ((clientp->status & VNTSD_CLIENT_CONS_DELETED) == 0) { + /* check if console was deleted */ + status = vntsd_vcc_err(clientp->cons); + } + + if (status != VNTSD_STATUS_CONTINUE) { + /* console was deleted */ + if (groupp->num_cons == 1) { + client_fini(groupp, clientp); + } + } + + /* console is ok */ + client_init(clientp); + return; + + case VNTSD_STATUS_MOV_CONS_FORWARD: + case VNTSD_STATUS_MOV_CONS_BACKWARD: + if (groupp->num_cons == 1) { + /* same console */ + return; + } + + /* get selected console */ + (void) mutex_lock(&(clientp->cons->group->lock)); + clientp->cons = vntsd_que_pos(clientp->cons->group->conspq, + clientp->cons, + (status == VNTSD_STATUS_MOV_CONS_FORWARD)?(1):(-1)); + (void) mutex_unlock(&(clientp->cons->group->lock)); + return; + + case VNTSD_SUCCESS: + case VNTSD_STATUS_CONTINUE: + case VNTSD_STATUS_NO_CONS: + client_init(clientp); + return; + + case VNTSD_ERR_INVALID_INPUT: + return; + + 
default: + /* fatal error */ + vntsd_log(status, err_msg); + client_fini(groupp, clientp); + return; + } +} + +/* console thread */ +void * +vntsd_console_thread(vntsd_thr_arg_t *argp) +{ + vntsd_group_t *groupp; + vntsd_cons_t *consp; + vntsd_client_t *clientp; + + char buf[MAXHOSTNAMELEN]; + char prompt[72]; + char cmd; + int rv = VNTSD_SUCCESS; + int num_cons; + + + groupp = (vntsd_group_t *)argp->handle; + clientp = (vntsd_client_t *)argp->arg; + + assert(groupp); + assert(clientp); + + /* check if group is removed */ + + D1(stderr, "t@%d get_client_sel@%lld:client@%d\n", thr_self(), + groupp->tcp_port, clientp->sockfd); + + bzero(buf, MAXHOSTNAMELEN); + + /* host name */ + if (gethostname(buf, MAXHOSTNAMELEN)) { + vntsd_log(VNTSD_STATUS_NO_HOST_NAME, "vntsd_console_thread()"); + (void) snprintf(buf, sizeof (buf), "unkown host"); + } + + if (snprintf(prompt, sizeof (prompt), + "%s-vnts-%s: h,l,{id},c{id},c {id},q:", + buf, groupp->group_name) >= sizeof (prompt)) { + /* long prompt doesn't fit, use short one */ + (void) snprintf(prompt, sizeof (prompt), + "vnts: h,l,{id},c{id},c {id}, q:"); + } + + + for (;;) { + cmd = ' '; + D1(stderr, "t@%d console_thread()@%lld:client@%d\n", thr_self(), + groupp->tcp_port, clientp->sockfd); + + num_cons = vntsd_chk_group_total_cons(groupp); + + if ((num_cons > 1) && (clientp->cons == NULL)) { + /* console to connect to */ + rv = read_cmd(clientp, prompt, &cmd); + /* check error and may exit */ + console_chk_status(groupp, clientp, rv); + } + + switch (cmd) { + + case 'l': + + /* list domain names */ + rv = list_all_domains(groupp, clientp); + break; + + + case 'q': + + rv = VNTSD_STATUS_CLIENT_QUIT; + break; + + case 'h': + rv = display_help(clientp); + break; + + default: + /* select console */ + if (clientp->cons == NULL) { + rv = select_cons(groupp, num_cons, + &consp, clientp, cmd); + if (rv == VNTSD_ERR_INVALID_INPUT) { + rv = display_help(clientp); + break; + } + } else { + consp = clientp->cons; + } + assert(consp); + 
+ /* connect to console */ + rv = connect_cons(consp, clientp); + D1(stderr, "t@%d console_thread()" + "connect_cons returns %d\n", + thr_self(), rv); + break; + + } + /* check error and may exit */ + console_chk_status(groupp, clientp, rv); + } + + /*NOTREACHED*/ + return (NULL); +} diff --git a/usr/src/cmd/vntsd/listen.c b/usr/src/cmd/vntsd/listen.c new file mode 100644 index 0000000000..358e2665aa --- /dev/null +++ b/usr/src/cmd/vntsd/listen.c @@ -0,0 +1,285 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Each group has a listen thread. It is created at the time + * of a group creation and destroyed when a group does not have + * any console associated with it. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <thread.h> +#include <assert.h> +#include <signal.h> +#include <ctype.h> +#include <syslog.h> +#include "vntsd.h" + +/* + * check the state of listen thread. 
exit if there is an fatal error + * or the group is removed. + */ +static void +listen_chk_status(vntsd_group_t *groupp, int status) +{ + char err_msg[VNTSD_LINE_LEN]; + + + D1(stderr, "t@%d listen_chk_status() status=%d group=%s " + "tcp=%lld group status = %x\n", thr_self(), status, + groupp->group_name, groupp->tcp_port, groupp->status); + + (void) snprintf(err_msg, sizeof (err_msg), + "Group:%s TCP port %lld status %x", + groupp->group_name, groupp->tcp_port, groupp->status); + + + switch (status) { + + case VNTSD_SUCCESS: + return; + + case VNTSD_STATUS_INTR: + assert(groupp->status & VNTSD_GROUP_SIG_WAIT); + /* close listen socket */ + (void) mutex_lock(&groupp->lock); + (void) close(groupp->sockfd); + groupp->sockfd = -1; + + /* let group know */ + groupp->status &= ~VNTSD_GROUP_SIG_WAIT; + (void) cond_signal(&groupp->cvp); + + (void) mutex_unlock(&groupp->lock); + /* exit thread */ + thr_exit(0); + break; + + case VNTSD_STATUS_ACCEPT_ERR: + return; + + case VNTSD_STATUS_NO_CONS: + default: + /* fatal, exit thread */ + + (void) mutex_lock(&groupp->lock); + (void) close(groupp->sockfd); + groupp->sockfd = -1; + (void) mutex_unlock(&groupp->lock); + vntsd_log(status, err_msg); + vntsd_clean_group(groupp); + + thr_exit(0); + break; + } +} + +/* allocate and initialize listening socket. 
*/ +static int +open_socket(int port_no, int *sockfd) +{ + + struct sockaddr_in addr; + int on; + + + /* allocate a socket */ + *sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (*sockfd < 0) { + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + return (VNTSD_ERR_LISTEN_SOCKET); + } + +#ifdef DEBUG + /* set reuse local socket address */ + on = 1; + if (setsockopt(*sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on))) { + return (VNTSD_ERR_LISTEN_OPTS); + } +#endif + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (vntsd_ip_addr()).s_addr; + addr.sin_port = htons(port_no); + + /* bind socket */ + if (bind(*sockfd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + return (VNTSD_ERR_LISTEN_BIND); + + } + + if (listen(*sockfd, VNTSD_MAX_SOCKETS) == -1) { + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + return (VNTSD_ERR_LISTEN_BIND); + } + + D1(stderr, "t@%d open_socket() sockfd=%d\n", thr_self(), *sockfd); + return (VNTSD_SUCCESS); +} + +/* ceate console selection thread */ +static int +create_console_thread(vntsd_group_t *groupp, int sockfd) +{ + vntsd_client_t *clientp; + vntsd_thr_arg_t arg; + int rv; + + + assert(groupp); + D1(stderr, "t@%d create_console_thread@%lld:client@%d\n", thr_self(), + groupp->tcp_port, sockfd); + + /* allocate a new client */ + clientp = (vntsd_client_t *)malloc(sizeof (vntsd_client_t)); + if (clientp == NULL) { + return (VNTSD_ERR_NO_MEM); + } + + /* initialize the client */ + bzero(clientp, sizeof (vntsd_client_t)); + + clientp->sockfd = sockfd; + clientp->cons_tid = (thread_t)-1; + + (void) mutex_init(&clientp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL); + + /* append client to group */ + (void) mutex_lock(&groupp->lock); + + if ((rv = vntsd_que_append(&groupp->no_cons_clientpq, clientp)) + != VNTSD_SUCCESS) { + (void) mutex_unlock(&groupp->lock); + vntsd_free_client(clientp); + return (rv); + } + + (void) mutex_unlock(&groupp->lock); + + (void) 
mutex_lock(&clientp->lock); + + /* parameters for console thread */ + bzero(&arg, sizeof (arg)); + + arg.handle = groupp; + arg.arg = clientp; + + /* create console selection thread */ + if (thr_create(NULL, 0, (thr_func_t)vntsd_console_thread, + &arg, THR_DETACHED, &clientp->cons_tid)) { + + (void) mutex_unlock(&clientp->lock); + (void) mutex_lock(&groupp->lock); + (void) vntsd_que_rm(&groupp->no_cons_clientpq, clientp); + (void) mutex_unlock(&groupp->lock); + vntsd_free_client(clientp); + + return (VNTSD_ERR_CREATE_CONS_THR); + } + + (void) mutex_unlock(&clientp->lock); + + return (VNTSD_SUCCESS); +} + +/* listen thread */ +void * +vntsd_listen_thread(vntsd_group_t *groupp) +{ + + int newsockfd; + size_t clilen; + struct sockaddr_in cli_addr; + int rv; + int num_cons; + + assert(groupp); + + D1(stderr, "t@%d listen@%lld\n", thr_self(), groupp->tcp_port); + + + /* initialize listen socket */ + (void) mutex_lock(&groupp->lock); + rv = open_socket(groupp->tcp_port, &groupp->sockfd); + (void) mutex_unlock(&groupp->lock); + listen_chk_status(groupp, rv); + + for (; ; ) { + + clilen = sizeof (cli_addr); + + /* listen to the socket */ + newsockfd = accept(groupp->sockfd, (struct sockaddr *)&cli_addr, + &clilen); + + D1(stderr, "t@%d listen_thread() connected sockfd=%d\n", + thr_self(), newsockfd); + + if (newsockfd <= 0) { + + if (errno == EINTR) { + listen_chk_status(groupp, VNTSD_STATUS_INTR); + } else { + listen_chk_status(groupp, + VNTSD_STATUS_ACCEPT_ERR); + } + continue; + } + num_cons = vntsd_chk_group_total_cons(groupp); + if (num_cons == 0) { + (void) close(newsockfd); + listen_chk_status(groupp, VNTSD_STATUS_NO_CONS); + } + + /* a connection is established */ + rv = vntsd_set_telnet_options(newsockfd); + if (rv != VNTSD_SUCCESS) { + (void) close(newsockfd); + listen_chk_status(groupp, rv); + } + rv = create_console_thread(groupp, newsockfd); + if (rv != VNTSD_SUCCESS) { + (void) close(newsockfd); + listen_chk_status(groupp, rv); + } + } + + /*NOTREACHED*/ + 
return (NULL); +} diff --git a/usr/src/cmd/vntsd/queue.c b/usr/src/cmd/vntsd/queue.c new file mode 100644 index 0000000000..4d50428198 --- /dev/null +++ b/usr/src/cmd/vntsd/queue.c @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * utility for vntsd queue handling + */ +#include <stdio.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <wait.h> +#include <time.h> +#include <netinet/in.h> +#include <thread.h> +#include <signal.h> +#include "vntsd.h" + +/* alloc_que_el() allocates a queue element */ +static vntsd_que_t * +alloc_que_el(void *handle) +{ + vntsd_que_t *el; + + /* allocate a queue element */ + el = (vntsd_que_t *)malloc(sizeof (vntsd_que_t)); + if (el == NULL) { + return (NULL); + } + + + el->nextp = NULL; + el->prevp = NULL; + el->handle = handle; + + return (el); +} + +/* vntsd_que_append() appends a element to a queue */ +int +vntsd_que_append(vntsd_que_t **que_hd, void *handle) +{ + vntsd_que_t *p; + vntsd_que_t *el; + + assert(que_hd); + assert(handle); + + /* allocate a queue element */ + el = alloc_que_el(handle); + + if (el == NULL) { + return (VNTSD_ERR_NO_MEM); + } + + p = *que_hd; + + if (p == NULL) { + /* first one */ + *que_hd = el; + } else { + /* walk to the last one */ + while (p->nextp != NULL) + p = p->nextp; + p->nextp = el; + } + + el->prevp = p; + + return (VNTSD_SUCCESS); +} + +/* vntsd_que_insert_after() inserts element arter the handle */ +int +vntsd_que_insert_after(vntsd_que_t *que, void *handle, void *next) +{ + vntsd_que_t *q, *el; + + assert(que); + + q = que; + + while (q != NULL) { + if (q->handle == handle) { + break; + } + + q = q->nextp; + } + + if (q == NULL) { + /* not in queue */ + return (VNTSD_ERR_EL_NOT_FOUND); + } + + el = alloc_que_el(next); + + if (el == NULL) { + return (VNTSD_ERR_NO_MEM); + } + + el->nextp = q->nextp; + q->nextp = el; + el->prevp = q; + + return (VNTSD_SUCCESS); +} + + + +/* vntsd_que_rm() removes an element from a queue */ +int +vntsd_que_rm(vntsd_que_t **que_hd, void *handle) +{ + vntsd_que_t *p = *que_hd; + 
vntsd_que_t *prevp = NULL; + + + while (p != NULL) { + /* match handle */ + if (p->handle == handle) { + break; + } + prevp = p; + p = p->nextp; + } + + if (p == NULL) { + /* not found */ + return (VNTSD_ERR_EL_NOT_FOUND); + } + + /* found */ + if (p == *que_hd) { + /* first one */ + *que_hd = p->nextp; + } else { + prevp->nextp = p->nextp; + } + + if (p->nextp != NULL) { + p->nextp->prevp = prevp; + } + + handle = p->handle; + + free(p); + + return (VNTSD_SUCCESS); + +} + +/* vntsd_que_walk() - walk queue and apply function to each element */ +void * +vntsd_que_walk(vntsd_que_t *que_hd, el_func_t el_func) +{ + vntsd_que_t *p = que_hd; + + while (p != NULL) { + if ((*el_func)(p->handle)) { + return (p->handle); + } + + p = p->nextp; + } + return (VNTSD_SUCCESS); +} + + +/* vntsd_que_find() finds first match */ +void * +vntsd_que_find(vntsd_que_t *que_hd, compare_func_t compare_func, void *data) +{ + vntsd_que_t *p = que_hd; + + assert(compare_func != NULL); + while (p != NULL) { + if ((*compare_func)(p->handle, data)) { + /* found match */ + return (p->handle); + } + + p = p->nextp; + } + + /* not found */ + return (NULL); +} + +/* vntsd_free_que() frees entire queue */ +void +vntsd_free_que(vntsd_que_t **q, clean_func_t clean_func) +{ + vntsd_que_t *p; + + while (*q != NULL) { + p = *q; + + *q = p->nextp; + + if (clean_func) { + /* clean func will free the handle */ + (*clean_func)(p->handle); + } else { + free(p->handle); + } + + free(p); + } +} + +/* + * vntsd_que_pos() matches a handle and returns a handle located at "pos" + * relative to the matched handle. pos supported are 1 or -1. 
+ */ +void * +vntsd_que_pos(vntsd_que_t *que_hd, void *handle, int pos) +{ + vntsd_que_t *p = que_hd; + + assert((pos == 1) || (pos == -1)); + + + while (p != NULL) { + if (p->handle == handle) { + /* find match */ + if (pos == 1) { + /* forward 1 */ + if (p->nextp != NULL) { + return (p->nextp->handle); + } + + /* last one go to first */ + return (que_hd->handle); + + } else { + /* backward 1 */ + if (p->prevp != NULL) { + return (p->prevp->handle); + } + + /* first one, return last one */ + while (p->nextp != NULL) { + p = p->nextp; + } + + assert(p != NULL); + assert(p->handle != NULL); + return (p->handle); + + } + } + p = p->nextp; + } + + DERR(stderr, "t@%d vntsd_que_pos can not find handle \n", + thr_self()); + + return (NULL); +} diff --git a/usr/src/cmd/vntsd/read.c b/usr/src/cmd/vntsd/read.c new file mode 100644 index 0000000000..c5431a0ac1 --- /dev/null +++ b/usr/src/cmd/vntsd/read.c @@ -0,0 +1,265 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * read thread - Read from tcp client and write to vcc driver. 
There are one + * writer and multiple readers per console. The first client who connects to + * a console get write access. An error message is returned to readers if they + * attemp to input commands. Read thread accepts special daemon commands from + * all clients. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <thread.h> +#include <synch.h> +#include <signal.h> +#include <assert.h> +#include <ctype.h> +#include <syslog.h> +#include <libintl.h> +#include "vntsd.h" +#include "chars.h" + +/* write_vcc() - write to vcc virtual console */ +static int +write_vcc(vntsd_client_t *clientp, char c) +{ + int n; + + + assert(clientp); + assert(clientp->cons); + + if (c == 0) { + return (VNTSD_SUCCESS); + } + n = write(clientp->cons->vcc_fd, &c, 1); + + if (n < 0) { + /* write error */ + if (errno == EINTR) { + return (vntsd_cons_chk_intr(clientp)); + } + + return (VNTSD_STATUS_VCC_IO_ERR); + } + + assert(n != 0); + return (VNTSD_SUCCESS); + +} + +/* + * acquire_writer() the client is going to be writer. + * insert the client to the head of the console client queue. 
+ */ +static int +acquire_writer(vntsd_client_t *clientp) +{ + vntsd_cons_t *consp; + vntsd_client_t *writerp; + int rv; + + D1(stderr, "t@%d:acuire_writer :client@%d\n", thr_self(), + clientp->sockfd); + + assert(clientp != NULL); + consp = clientp->cons; + + assert(consp); + + (void) mutex_lock(&consp->lock); + + assert(consp->clientpq != NULL); + if (consp->clientpq->handle == clientp) { + /* clientp is a writer already */ + (void) mutex_unlock(&consp->lock); + return (VNTSD_SUCCESS); + } + + /* current writer */ + writerp = (vntsd_client_t *)(consp->clientpq->handle); + + (void) mutex_lock(&writerp->lock); + + rv = vntsd_que_rm(&(consp->clientpq), clientp); + assert(rv == VNTSD_SUCCESS); + + (void) mutex_lock(&clientp->lock); + + /* move client to be first in the console queue */ + consp->clientpq->handle = clientp; + + /* move previous writer to be the second in the queue */ + rv = vntsd_que_insert_after(consp->clientpq, clientp, writerp); + + (void) mutex_unlock(&consp->lock); + (void) mutex_unlock(&writerp->lock); + (void) mutex_unlock(&clientp->lock); + + if (rv != VNTSD_SUCCESS) { + return (rv); + } + + /* write warning message to the writer */ + + if ((rv = vntsd_write_line(writerp, + gettext("Warning: Console connection forced into read-only mode"))) + != VNTSD_SUCCESS) { + return (rv); + } + + return (VNTSD_SUCCESS); +} + +/* interrupt handler */ +int +vntsd_cons_chk_intr(vntsd_client_t *clientp) +{ + + if (clientp->status & VNTSD_CLIENT_TIMEOUT) { + return (VNTSD_STATUS_CLIENT_QUIT); + } + if (clientp->status & VNTSD_CLIENT_CONS_DELETED) { + return (VNTSD_STATUS_RESELECT_CONS); + } + + if (clientp->status & VNTSD_CLIENT_IO_ERR) { + return (VNTSD_STATUS_CLIENT_QUIT); + } + return (VNTSD_STATUS_CONTINUE); +} + +/* read from client */ +static int +read_char(vntsd_client_t *clientp, char *c) +{ + int rv; + + for (; ; ) { + + rv = vntsd_read_data(clientp, c); + + switch (rv) { + case VNTSD_STATUS_CONTINUE: + break; + + case VNTSD_STATUS_ACQUIRE_WRITER: + rv 
= acquire_writer(clientp); + if (rv != VNTSD_SUCCESS) { + return (rv); + } + break; + default: + return (rv); + } + } +} + +/* vntsd_read worker */ +int +vntsd_read(vntsd_client_t *clientp) +{ + char c; + int rv; + + + assert(clientp); + D3(stderr, "t@%d vntsd_read@%d\n", thr_self(), clientp->sockfd); + + for (; ; ) { + + /* client input */ + rv = read_char(clientp, &c); + + if (rv == VNTSD_STATUS_INTR) { + rv = vntsd_cons_chk_intr(clientp); + } + + if (rv != VNTSD_SUCCESS) { + return (rv); + } + + assert(clientp->cons); + if (clientp->cons->clientpq->handle != clientp) { + /* reader - print error message */ + if ((c != CR) && (c != LF)) { + rv = vntsd_write_line(clientp, + gettext(VNTSD_NO_WRITE_ACCESS_MSG)); + + /* check errors and may exit */ + if (rv == VNTSD_STATUS_INTR) { + rv = vntsd_cons_chk_intr(clientp); + } + + if (rv != VNTSD_SUCCESS) { + return (rv); + } + + } + + continue; + } + + rv = vntsd_ctrl_cmd(clientp, c); + + switch (rv) { + case VNTSD_STATUS_CONTINUE: + continue; + break; + case VNTSD_STATUS_INTR: + rv = vntsd_cons_chk_intr(clientp); + if (rv != VNTSD_SUCCESS) { + return (rv); + } + break; + case VNTSD_SUCCESS: + break; + default: + return (rv); + } + + /* write to vcc */ + rv = write_vcc(clientp, c); + if (rv == VNTSD_STATUS_INTR) { + rv = vntsd_cons_chk_intr(clientp); + } + if (rv != VNTSD_SUCCESS) { + return (rv); + } + + } + + /*NOTREACHED*/ + return (NULL); +} diff --git a/usr/src/cmd/vntsd/svc-vntsd b/usr/src/cmd/vntsd/svc-vntsd new file mode 100644 index 0000000000..e573b4ecd5 --- /dev/null +++ b/usr/src/cmd/vntsd/svc-vntsd @@ -0,0 +1,64 @@ +#!/sbin/sh +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# ident "%Z%%M% %I% %E% SMI" +# +# Start script for vntsd +# +# For modifying parameters passed to vntsd, do not edit +# this script. Instead use svccfg(1m) to modify the SMF +# repository. For example: +# +# svccfg +# svc:> select ldoms/vntsd +# svc:/ldoms/vntsd> setprop vntsd/vcc_device = "virtual-console-concentrator@1" +# svc:/ldoms/vntsd> setprop vntsd/listen_addr = "192.168.1.1" +# svc:/ldoms/vntsd> exit + +. 
/lib/svc/share/smf_include.sh + +vcc_device=`svcprop -p vntsd/vcc_device $SMF_FMRI 2>/dev/null` +if [ -z "$vcc_device" ]; then + vcc_device="virtual-console-concentrator@0" +fi +args="-i $vcc_device" + +listen_addr=`svcprop -p vntsd/listen_addr $SMF_FMRI 2>/dev/null` +if [ -n "$listen_addr" ]; then + args="$args -p $listen_addr" +fi + +timeout=`svcprop -p vntsd/timeout_minutes $SMF_FMRI 2>/dev/null` +if [ -n "$timeout" ]; then + args="$args -t $timeout" +fi + +if [ -x /usr/lib/ldoms/vntsd ]; then + /usr/lib/ldoms/vntsd $args || exit $SMF_EXIT_ERR_CONFIG +else + echo "WARNING: /usr/lib/ldoms/vntsd is missing or not executable" >& 2 + exit $SMF_EXIT_ERR_CONFIG +fi + +exit $SMF_EXIT_OK diff --git a/usr/src/cmd/vntsd/vcc.h b/usr/src/cmd/vntsd/vcc.h new file mode 100644 index 0000000000..bcf1b2902b --- /dev/null +++ b/usr/src/cmd/vntsd/vcc.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _VCC_H +#define _VCC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define VCC_MAX_NAME 25 + +#define VCC_NUM_CONSOLE 0x1 /* total number of groups */ +#define VCC_PORT_TBL 0x2 /* download all port in a group */ + +#define VCC_INQUIRY 0x4 /* inquiry evnts */ +#define VCC_PORT_CONFIG 0x8 /* download one port */ +#define VCC_CLEAN_POLL 0x10 /* vntsd exits */ +#define VCC_DEL_PORT_OK 0x20 /* vntsd delete port ok */ +#define VCC_PORT_HELLO 0x1 + +typedef enum { + VNTSD_MSG_ADD_PORT, + VNTSD_MSG_DEL_PORT +} vntsd_msg_t; + + +#define VCC_PORT_ON 0x40 + + +typedef struct vntsd_console { + int cons_no; + uint64_t status; + char domain_name[VCC_MAX_NAME]; +} vntsd_console_t; + +/* console configuration that is downloaded to vntsd */ +typedef struct vntsd_vcc_console { + vntsd_console_t console; + char group_name[VCC_MAX_NAME]; + uint64_t tcp_port; +} vntsd_vcc_console_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _VCC_H */ diff --git a/usr/src/cmd/vntsd/vntsd.c b/usr/src/cmd/vntsd/vntsd.c new file mode 100644 index 0000000000..e1ded5dc3b --- /dev/null +++ b/usr/src/cmd/vntsd/vntsd.c @@ -0,0 +1,582 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * VNTSD main + * + * VNTSD takes the following options: + * -i <device instance> + * VCC device instance to use, e.g. virtual-console-concentrator@0. + * Required option. + * -p <ip address> + * IP address VNTSD listens to. + * -d + * Do not daemonize. This is only available in a DEBUG build. + * -t timeout for inactivity 0 = indefinite + */ + +#include <stdio.h> +#include <sys/types.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <time.h> +#include <netinet/in.h> +#include <thread.h> +#include <signal.h> +#include <fcntl.h> +#include <ctype.h> +#include <libintl.h> +#include <locale.h> +#include <syslog.h> +#include "vntsd.h" +#include "chars.h" + +#if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */ +#define TEXT_DOMAIN "SYS_TEST" /* Use this only if it weren't. */ +#endif + +/* global variables */ + +#ifdef DEBUG +int vntsddbg = 0x8; +#endif + +#define MINUTE 60 + +static vntsd_t *vntsdp; + + +static void vntsd_exit(void); +/* Signal handler for SIGINT, SIGKILL and SIGHUP */ +static void +exit_sig_handler(int sig) +{ + + char err_msg[VNTSD_LINE_LEN]; + + D1(stderr, "t@%d exit_sig_handler%d \n", thr_self(), sig); + + (void) snprintf(err_msg, sizeof (err_msg), "exit_sig_handler() sig=%d", + sig); + + vntsd_log(VNTSD_STATUS_EXIT_SIG, err_msg); + + exit(0); +} + +/* + * Before a thread reads in client's input, it attaches to vntsd timer so that + * it can be waken up if a client does not access the connection for + * VNTSD_INPUT_TIMEOUT(10) minutes. 
+ */ + +/* attach a thread to timer */ +int +vntsd_attach_timer(vntsd_timeout_t *tmop) +{ + int rv; + + if (vntsdp->timeout == 0) { + return (VNTSD_SUCCESS); + } + + (void) mutex_lock(&vntsdp->tmo_lock); + rv = vntsd_que_append(&vntsdp->tmoq, (void *)tmop); + (void) mutex_unlock(&vntsdp->tmo_lock); + return (rv); +} + +/* detach a thread from timer */ +int +vntsd_detach_timer(vntsd_timeout_t *tmop) +{ + int rv; + + if (vntsdp->timeout == 0) { + return (VNTSD_SUCCESS); + } + + (void) mutex_lock(&vntsdp->tmo_lock); + rv = vntsd_que_rm(&vntsdp->tmoq, (void *)tmop); + (void) mutex_unlock(&vntsdp->tmo_lock); + + return (rv); +} + +/* check threadd's timeout */ +static boolean_t +chk_timeout(vntsd_timeout_t *tmop) +{ + tmop->minutes++; + + if (tmop->minutes == vntsdp->timeout) { + /* wake up the thread */ + tmop->clientp->status |= VNTSD_CLIENT_TIMEOUT; + (void) thr_kill(tmop->tid, SIGALRM); + } + + /* return false to walk the queue */ + return (B_FALSE); +} + +/* reset timer */ +static boolean_t +reset_timeout(vntsd_timeout_t *tmop, thread_t tid) +{ + if (tmop->tid == tid) { + tmop->minutes = 0; + } + /* return false to walk the queue */ + return (B_FALSE); +} + +void +vntsd_reset_timer(thread_t tid) +{ + if (vntsdp->timeout == 0) { + return; + } + + (void) mutex_lock(&vntsdp->tmo_lock); + (void) vntsd_que_find(vntsdp->tmoq, (compare_func_t)reset_timeout, + (void*)tid); + (void) mutex_unlock(&vntsdp->tmo_lock); +} + +/* + * When alarm goes off, wake up timeout threads. Alarm is set off every + * minutes. 
+ */ +static void +vntsd_alarm_sig_handler(int sig) +{ + static thread_t main_thread = 0; + + D1(stderr, "t@%d alarm signal %d\n", thr_self(), sig); + if (vntsdp->timeout == 0) { + DERR(stderr, "t@%d alarm signal should not recv %d\n", + thr_self(), sig); + return; + } + + + if (main_thread == 0) { + /* initialize thread id */ + main_thread = thr_self(); + } else if (main_thread != thr_self()) { + /* get signal because thread is timeout */ + return; + } + + /* in main thread */ + (void) mutex_lock(&vntsdp->tmo_lock); + + /* wake up timeout threads */ + (void) vntsd_que_walk(vntsdp->tmoq, (el_func_t)chk_timeout); + (void) mutex_unlock(&vntsdp->tmo_lock); + + /* reset alarm */ + (void) alarm(MINUTE); +} + +/* got a SIGUSER1 siginal */ +static void +vntsd_sig_handler(int sig) +{ + char err_msg[VNTSD_LINE_LEN]; + + (void) snprintf(err_msg, sizeof (err_msg), "sig_handler() sig=%d", + sig); + + if (sig != SIGUSR1) { + vntsd_log(VNTSD_STATUS_SIG, err_msg); + } +} + +/* vntsd exits */ +static void +vntsd_exit(void) +{ + D1(stderr, "t@%d vntsd_exit\n", thr_self()); + + (void) mutex_lock(&vntsdp->lock); + + if (vntsdp->timeout > 0) { + /* cancel the timer */ + (void) alarm(0); + } + /* delete all groups */ + vntsd_free_que(&vntsdp->grouppq, (clean_func_t)vntsd_clean_group); + + /* close control port */ + (void) close(vntsdp->ctrl_fd); + + assert(vntsdp->tmoq == NULL); + (void) mutex_unlock(&vntsdp->lock); + + /* clean up vntsdp */ + (void) mutex_destroy(&vntsdp->tmo_lock); + (void) mutex_destroy(&vntsdp->lock); + free(vntsdp); + closelog(); +} + +/* + * vntsd_help() + * print out valid command line options + */ +static void +vntsd_help(void) +{ + + (void) fprintf(stderr, gettext("Usage: vntsd -i <VCC device instance> " + "[-p <listen address>] [-t <timeout in minutes>]\n")); +} + + +#ifdef DEBUG +#define DEBUG_OPTIONS "d" +#else +#define DEBUG_OPTIONS "" +#endif + +int +main(int argc, char ** argv) +{ + char *path; + struct pollfd poll_drv[1]; + struct sigaction act; + char 
*listen_addr = NULL; + pid_t pid; + int i; + int option; + int sz; + int fd; + int n; + + /* internationalization */ + (void) setlocale(LC_MESSAGES, ""); + (void) textdomain(TEXT_DOMAIN); + vntsd_init_esctable_msgs(); + + /* initialization */ + bzero(&act, sizeof (act)); + + vntsdp = calloc(sizeof (vntsd_t), 1); + if (vntsdp == NULL) { + vntsd_log(VNTSD_ERR_NO_MEM, "main:vntsdp"); + exit(1); + } + + vntsdp->ctrl_fd = -1; + vntsdp->devinst = NULL; + + (void) mutex_init(&vntsdp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL); + (void) mutex_init(&vntsdp->tmo_lock, USYNC_THREAD|LOCK_ERRORCHECK, + NULL); + + /* get CLI options */ + while ((option = getopt(argc, argv, "i:t:p:"DEBUG_OPTIONS)) != EOF) { + switch (option) { +#ifdef DEBUG + case 'd': + vntsdp->options |= VNTSD_OPT_DAEMON_OFF; + break; +#endif + case 'i': + vntsdp->devinst = optarg; + break; + case 'p': + listen_addr = optarg; + break; + + case 't': + n = sscanf(optarg, "%d", &(vntsdp->timeout)); + if (n != 1) { + vntsdp->timeout = -1; + } + break; + + default: + vntsd_help(); + exit(1); + } + } + + if ((vntsdp->devinst == NULL) || (vntsdp->timeout == -1)) { + vntsd_help(); + exit(1); + } + + if (listen_addr == NULL || strcmp(listen_addr, "localhost") == 0) { + /* by default listen on loopback interface */ + vntsdp->ip_addr.s_addr = htonl(INADDR_LOOPBACK); + } else if (strcmp(listen_addr, "any") == 0) { + vntsdp->ip_addr.s_addr = htonl(INADDR_ANY); + } else { + vntsdp->ip_addr.s_addr = inet_addr(listen_addr); + if (vntsdp->ip_addr.s_addr == (in_addr_t)(-1)) { + (void) fprintf(stderr, + gettext("Invalid listen address '%s'\n"), + listen_addr); + exit(1); + } + } + + D3(stderr, "options = %llx, instance = %s, listen = %s\n", + vntsdp->options, vntsdp->devinst, + listen_addr ? 
listen_addr : "<null>"); + + /* open VCC driver control port */ + sz = strlen(VCC_DEVICE_CTL_PATH) + strlen(vntsdp->devinst) + 1; + path = calloc(sz, 1); + if (path == NULL) { + vntsd_log(VNTSD_ERR_NO_MEM, "main(): alloc dev path"); + exit(1); + } + (void) snprintf(path, sz-1, VCC_DEVICE_CTL_PATH, vntsdp->devinst, + sizeof (vntsdp->devinst)); + vntsdp->ctrl_fd = open(path, O_RDWR); + free(path); + + if (vntsdp->ctrl_fd == -1) { + /* + * do not print error if device is not present + * the daemon is probably being started incorrectly + */ + if (errno != ENOENT) { + syslog(LOG_ERR, + "Error opening VCC device control port: %s", + strerror(errno)); + } + exit(1); + } + if ((vntsdp->options & VNTSD_OPT_DAEMON_OFF) == 0) { + /* daemonize it */ + pid = fork(); + if (pid < 0) { + perror("fork"); + exit(1); + } + if (pid > 0) { + /* parent */ + exit(0); + } + + /* + * child process (daemon) + * + * Close all file descriptors other than 2 and the ctrl fd. + */ + (void) close(0); + (void) close(1); + for (i = 3; i < vntsdp->ctrl_fd; i++) { + (void) close(i); + } + closefrom(vntsdp->ctrl_fd + 1); + + /* obtain a new process group */ + (void) setsid(); + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + syslog(LOG_ERR, "Can not open /dev/null"); + exit(1); + } + /* handle standard I/O */ + if (dup2(fd, 0) < 0) { + syslog(LOG_ERR, "Failed dup2()"); + exit(1); + } + + if (dup2(fd, 1) < 0) { + syslog(LOG_ERR, "Failed dup2()"); + exit(1); + } + + /* ignore terminal signals */ + (void) signal(SIGTSTP, SIG_IGN); + (void) signal(SIGTTOU, SIG_IGN); + (void) signal(SIGTTIN, SIG_IGN); + } + + + /* set up signal handlers */ + + /* exit signals */ + act.sa_handler = exit_sig_handler; + + (void) sigemptyset(&act.sa_mask); + (void) sigaction(SIGINT, &act, NULL); + (void) sigaction(SIGTERM, &act, NULL); + (void) sigaction(SIGHUP, &act, NULL); + + /* vntsd internal signals */ + act.sa_handler = vntsd_sig_handler; + (void) sigemptyset(&act.sa_mask); + (void) sigaction(SIGUSR1, &act, NULL); + + 
+ act.sa_handler = vntsd_alarm_sig_handler; + (void) sigemptyset(&act.sa_mask); + (void) sigaction(SIGALRM, &act, NULL); + + + /* setup exit */ + (void) atexit(vntsd_exit); + + + + /* initialization */ + openlog("vntsd", LOG_CONS, LOG_DAEMON); + + + /* set alarm */ + if (vntsdp->timeout > 0) { + (void) alarm(MINUTE); + } + + vntsdp->tid = thr_self(); + + /* get exiting consoles from vcc */ + vntsd_get_config(vntsdp); + + for (; ; ) { + /* poll vcc for configuration change */ + bzero(poll_drv, sizeof (poll_drv)); + + poll_drv[0].fd = vntsdp->ctrl_fd; + poll_drv[0].events = POLLIN; + + if (poll(poll_drv, 1, -1) == -1) { + if (errno == EINTR) { + /* wake up because a consle was deleted */ + vntsd_delete_cons(vntsdp); + continue; + } + vntsd_log(VNTSD_ERR_VCC_POLL, + "vcc control poll err! aborting.."); + exit(1); + } + + D1(stderr, "t@%d driver event %x\n", thr_self(), + poll_drv[0].revents); + + vntsd_daemon_wakeup(vntsdp); + + } + + /*NOTREACHED*/ + return (0); +} + +/* export ip_addr */ +struct in_addr +vntsd_ip_addr(void) +{ + return (vntsdp->ip_addr); +} + +/* + * ioctl to vcc control port + * Supported ioctls interface are: + * ioctl code parameters return data + * VCC_NUM_CONSOLE none uint_t no consoles + * VCC_CONS_TBL none array of vcc_cons_t + * VCC_INQUIRY none vcc_response_t response + * VCC_CONS_INFO uint_t portno vcc_cons_t + * VCC_CONS_STATUS uint_t portno + * VCC_FORCE_CLOSE uint_t portno + */ +int +vntsd_vcc_ioctl(int ioctl_code, uint_t portno, void *buf) +{ + D1(stderr, "t@%d vcc_ioctl@%d code=%x\n", thr_self(), portno, + ioctl_code); + + if ((ioctl_code == (VCC_CONS_INFO)) || + (ioctl_code == (VCC_FORCE_CLOSE))) { + /* construct vcc in buf */ + *((uint_t *)buf) = portno; + } + + if (ioctl(vntsdp->ctrl_fd, ioctl_code, (caddr_t)buf)) { + /* control port get error */ + syslog(LOG_ERR, "vcc control port error! 
abort vntsd"); + (void) thr_kill(vntsdp->tid, SIGINT); + return (VNTSD_STATUS_VCC_IO_ERR); + } + + return (VNTSD_SUCCESS); +} + +/* + * check if a vcc i/o error is caused by removal of a console. If so notify + * all clients connected to the console and wake up main thread to cleanup + * the console. + */ +int +vntsd_vcc_err(vntsd_cons_t *consp) +{ + vntsd_group_t *groupp; + + assert(consp); + groupp = consp->group; + assert(groupp); + + if (consp->status & VNTSD_CONS_DELETED) { + /* console was deleted */ + return (VNTSD_STATUS_VCC_IO_ERR); + } + + if (vntsd_vcc_cons_alive(consp)) { + /* console is ok */ + return (VNTSD_STATUS_CONTINUE); + } + + /* console needs to be deleted */ + (void) mutex_lock(&consp->lock); + consp->status |= VNTSD_CONS_DELETED; + + /* signal all clients to disconnect from console */ + (void) vntsd_que_walk(consp->clientpq, + (el_func_t)vntsd_notify_client_cons_del); + (void) mutex_unlock(&consp->lock); + + /* mark the group */ + (void) mutex_lock(&groupp->lock); + groupp->status |= VNTSD_GROUP_CLEAN_CONS; + (void) mutex_unlock(&groupp->lock); + + /* signal main thread to deleted console */ + (void) thr_kill(vntsdp->tid, SIGUSR1); + + return (VNTSD_STATUS_VCC_IO_ERR); +} diff --git a/usr/src/cmd/vntsd/vntsd.h b/usr/src/cmd/vntsd/vntsd.h new file mode 100644 index 0000000000..16b1bbe90f --- /dev/null +++ b/usr/src/cmd/vntsd/vntsd.h @@ -0,0 +1,476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * vntsd uses configuration information provided by vcc to export access + * to Ldom console access over regular TCP sockets. When it starts, it opens + * the vcc driver control port and obtains the list of ports that have been + * created by the vcc driver as well as TCP port number and group associated + * with each port. + * vntsd consists of multiple components as the follows: + * + * vntsd.c + * This module initializes vnts daemon, process user options such as instance + * number, ip address and etc., and provides main thread to poll any console + * port change. + * + * vntsdvcc.c + * This module provides vcc driver interface. It opens vcc driver control + * ports, read initial configuration, and provides interface to read, write and + * ioctl virtual console ports. This module creates a listen thread for each + * console group. It further dynamically adds and removes virtual consoles + * and groups following instructions of the vcc driver. This module + * is executed in the same thread as vntsd.c which is blocked on vcc control + * poll interface. + * + * listen.c + * This is a group listen thread. Each group's tcp-port has a listen thread + * associated with it. The thread is created when a console is associated with + * a new group and is removed when all consoles in the group are removed. + * + * console.c + * This is a console selection thread. The thread is created when a client + * connects to a group TCP port and exited when client disconnects. If there is + * only one console in the group, the client is connected to that console. 
If + * there are multiple consoles in the group, the client is asked to select a + * console. After determining which console to connect to, this thread + * a write thread if the cient is a writer and it self read in client input. + * + * read.c + * it reads input from a TCP client, processes + * special daemon and telent commands and write to vcc driver if the client + * is a writer. The client is a writer if the client is the first one connects + * to the console. Read thread print out an error message if a reader attempt + * to input to vcc. Read thread exits if console is deleted, client + * disconnects, or there is a fatal error. + * + * Write.c + * Write thread is creaed when first client connects to a console. It reads + * from vcc and writes to all clients that connect to the same console. + * Write thread exits when all clients disconnect from the console. + * + * cmd.c + * This is a supporting module for handling special daemon and telnet commands. + * + * common.c + * supporting modules shared by threads modules. + * + * queue.c + * This is a moudle supporting queue operations. Vntsd organizes its data + * in multiple queues <see data structure below>. + * + * vntsd.xml + * This is a manifest to support SMF interfaces. + * + * Data structures + * each group has a vntsd_group_t structure, which contains a queue of + * all console in that group. + * each console has a vntsd_cons_t structure, which contains a queue of + * all clients that connected to the console. + * + * +----------+ +----------+ +----------+ + * | group |-->| group |-->| group |-->.... + * +----------+ +----------+ +----------+ + * | + * |<-----------------------------------------+ + * |<------------------------+ | + * |<--------+ | | + * | | | | + * | +----------+ +----------+ +----------+ + * +----->| console |---->| console |---->| lconsole |---> .... + * +----------+ +----------+ +----------+ + * | | + * | | +----------+ +----------+ + * | +---->| client |----->| client |----->...... 
+ * | +----------+ +----------+ + * | | | + * |<------------+ | + * |<------------------------------+ + * + * Locks + * Each vntsd has one lock to protect the group queue + * Each group has one lock to protect the console queue, the queue for + * clients without a console connection and status. + * Each console has one lock to protect client queue and status. + * Each client has one lock to protect the state of the client. The client + * states are: + * + * VCC_CLIENT_READER + * A client is connected to a console as either a writer or a reader. + * if this client is the first one connects the console, the client is + * a writer, otherwise the client is a reader. A writer' write thread + * reads from vcc and send output to all readers connected to the + * same console. a reader's write thread is blocked until a reader becomes + * a writer. + * + * When a client selected a console, the client becomes a reader if + * there is another client connected to the console before the client. + * A client will be a writer if + * 1. client is the first one connected to the console or + * 2. client has entered a ~w daemon command or + * 3. all clients connected to the console before the client have + * disconnected from the console. + * + * VCC_CLIENT_MOVE_CONS_FORWARD + * VCC_CLIENT_MOVE_CONS_BACKWOARD + * A client is disconnecting from one console and move to the next or + * previous console in the group queue. + * A client is in one of these state if + * 1. the client has entered the daemon command and + * 2. the vntsd is in process of switching the client from one + * console to another. + * + * VCC_CLIENT_DISABLE_DAEMON_CMD + * vntsd is in processing of a client's daemon command or the client is + * in selecting console. + * A client is in this state if + * 1. the client has not selected a console or + * 2. the vntsd is processing a client's daemon command. + * + * VCC_CLIENT_ACQUIRE_WRITER + * A reader forces to become a writer via vntsd special command. 
+ * A client is in this state if + * 1. the client is a reader and + * 2. client has entered a daemon command to become a writer. + * + * VCC_CLIENT_CONS_DELETED + * The console that the client is connected to is being deleted and + * waiting for the client to disconnect. + * A client is in this state if + * 1. the console a client is connected to is being removed and + * 2. the vntsd is in process of disconnecting the client from the console. + * + */ + +#ifndef _VNTSD_H +#define _VNTSD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/shm.h> +#include <strings.h> +#include <assert.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stropts.h> +#include <errno.h> +#include <sys/param.h> +#include "../../uts/sun4v/sys/vcc.h" + +#define DEBUG + +/* vntsd limits */ +#define VNTSD_MAX_BUF_SIZE 128 +#define VNTSD_LINE_LEN 100 +#define VNTSD_MAX_SOCKETS 5 +#define VNTSD_EOL_LEN 2 + +/* secons before re-send signal for cv_wait */ +#define VNTSD_CV_WAIT_DELTIME 10 + +#define VCC_PATH_PREFIX \ + "/devices/virtual-devices@100/channel-devices@200/" +#define VCC_DEVICE_PATH "/devices%s" +#define VCC_DEVICE_CTL_PATH VCC_PATH_PREFIX "%s:ctl" + +/* common messages */ +#define VNTSD_NO_WRITE_ACCESS_MSG "You do not have write access" + +/* vntsd options */ +#define VNTSD_OPT_DAEMON_OFF 0x1 + +/* group states */ + +#define VNTSD_GROUP_SIG_WAIT 0x1 /* waiting for signal */ +#define VNTSD_GROUP_CLEAN_CONS 0x2 /* cons needs to be clean */ +#define VNTSD_GROUP_CLEANUP 0x4 /* waiting for signal */ + + + + + +/* console status */ + +#define VNTSD_CONS_DELETED 0x1 /* deleted */ +#define VNTSD_CONS_SIG_WAIT 0x2 /* waiting fro signal */ + + +#define VNTSD_CLIENT_IO_ERR 0x1 /* reader */ +#define VNTSD_CLIENT_DISABLE_DAEMON_CMD 0x2 /* disable daemon cmd */ +#define VNTSD_CLIENT_TIMEOUT 0x4 /* timeout */ +#define VNTSD_CLIENT_CONS_DELETED 0x8 /* console deleted */ + +/* generic que structure */ +typedef struct 
vntsd_que { + void *handle; /* element in queue */ + struct vntsd_que *nextp; /* next queue element */ + struct vntsd_que *prevp; /* previous queue element */ +} vntsd_que_t; + +struct vntsd_cons; +struct vntsd_group; +struct vntsd; + +/* client structure */ +typedef struct vntsd_client { + mutex_t lock; /* protect the client */ + uint_t status; /* client's state */ + + int sockfd; /* connection socket */ + thread_t cons_tid; /* console thread */ + + struct vntsd_cons *cons; /* back link to console configuration */ + +} vntsd_client_t; + +/* console structure */ +typedef struct vntsd_cons { + mutex_t lock; /* protect console port */ + cond_t cvp; /* sync between threads */ + + vntsd_que_t *clientpq; /* client que */ + uint_t status; /* client's state */ + int vcc_fd; /* vcc console port */ + thread_t wr_tid; /* write thread */ + + uint_t cons_no; /* console port number */ + char domain_name[MAXPATHLEN]; /* domain name */ + char dev_name[MAXPATHLEN]; + + struct vntsd_group *group; /* back link to group */ +} vntsd_cons_t; + +/* group structure */ +typedef struct vntsd_group { + mutex_t lock; /* protect group */ + cond_t cvp; /* sync remove group */ + + uint_t status; /* group status */ + char group_name[MAXPATHLEN]; + uint64_t tcp_port; /* telnet port */ + + thread_t listen_tid; /* listen thread */ + int sockfd; /* listen socket */ + + vntsd_que_t *conspq; /* console queue */ + uint_t num_cons; /* num console */ + + /* clients have no console connection */ + vntsd_que_t *no_cons_clientpq; + struct vntsd *vntsd; + +} vntsd_group_t; + +/* daemon structure */ +typedef struct vntsd { + + mutex_t lock; /* protect vntsd */ + mutex_t tmo_lock; /* protect tmo queue */ + + int instance; /* vcc instance */ + struct in_addr ip_addr; /* ip address to listen */ + uint64_t options; /* daemon options */ + int timeout; /* connection timeout */ + + char *devinst; /* device name */ + int ctrl_fd; /* vcc ctrl port */ + + vntsd_que_t *grouppq; /* group queue */ + uint_t num_grps; /* 
num groups */ + + vntsd_que_t *tmoq; /* timeout queue */ + thread_t tid; /* main thread id */ + +} vntsd_t; + +/* handle for creating thread */ +typedef struct vntsd_thr_arg { + void *handle; + void *arg; +} vntsd_thr_arg_t; + +/* timeout structure */ +typedef struct vntsd_timeout { + thread_t tid; /* thread tid */ + uint_t minutes; /* idle minutes */ + vntsd_client_t *clientp; /* client */ +} vntsd_timeout_t; + +/* vntsd status and error definitions */ +typedef enum { + + /* status */ + VNTSD_SUCCESS = 0, /* success */ + VNTSD_STATUS_CONTINUE, /* continue to execute */ + VNTSD_STATUS_EXIT_SIG, /* exit siginal */ + VNTSD_STATUS_SIG, /* known signal */ + VNTSD_STATUS_NO_HOST_NAME, /* no host name set */ + VNTSD_STATUS_CLIENT_QUIT, /* client disconnected from group */ + VNTSD_STATUS_RESELECT_CONS, /* client re-selecting console */ + VNTSD_STATUS_VCC_IO_ERR, /* a vcc io error occurs */ + VNTSD_STATUS_MOV_CONS_FORWARD, /* down arrow */ + VNTSD_STATUS_MOV_CONS_BACKWARD, /* up arrow */ + VNTSD_STATUS_ACQUIRE_WRITER, /* force become the writer */ + VNTSD_STATUS_INTR, /* thread receive a signal */ + VNTSD_STATUS_DISCONN_CONS, /* disconnect a client from cons */ + VNTSD_STATUS_NO_CONS, /* disconnect a client from cons */ + + /* resource errors */ + VNTSD_ERR_NO_MEM, /* memory allocation error */ + VNTSD_ERR_NO_DRV, /* cannot open vcc port */ + + /* vcc errors */ + VNTSD_ERR_VCC_CTRL_DATA, /* vcc ctrl data error */ + VNTSD_ERR_VCC_POLL, /* error poll vcc driver */ + VNTSD_ERR_VCC_IOCTL, /* vcc ioctl call error */ + VNTSD_ERR_VCC_GRP_NAME, /* group name differs from database */ + VNTSD_ERR_ADD_CONS_FAILED, /* addition of a console failed */ + + /* create thread errors */ + VNTSD_ERR_CREATE_LISTEN_THR, /* listen thread creation failed */ + VNTSD_ERR_CREATE_CONS_THR, /* create console thread err */ + VNTSD_ERR_CREATE_WR_THR, /* listen thread creation failed */ + + /* listen thread errors */ + VNTSD_ERR_LISTEN_SOCKET, /* can not create tcp socket */ + VNTSD_ERR_LISTEN_OPTS, /* 
can not set socket opt */ + VNTSD_ERR_LISTEN_BIND, /* can not bind socket */ + VNTSD_STATUS_ACCEPT_ERR, /* accept error */ + + /* tcp client read and write errors */ + VNTSD_ERR_WRITE_CLIENT, /* writing tcp client err */ + + /* tcp client timeout */ + VNTSD_ERR_CLIENT_TIMEOUT, /* client has no activity for timeout */ + + /* signal errors */ + VNTSD_ERR_SIG, /* unknown signal */ + + /* user input error */ + VNTSD_ERR_INVALID_INPUT, /* client typed in */ + + /* internal errors */ + VNTSD_ERR_EL_NOT_FOUND, /* element not found */ + VNTSD_ERR_UNKNOWN_CMD /* unknown error/cmd */ + +} vntsd_status_t; + +/* function prototype defines */ +typedef int (*compare_func_t)(void *el, void *data); +typedef int (*el_func_t)(void *el); +typedef void (*clean_func_t)(void *el); +typedef void (*sig_handler_t)(int sig); +typedef void *(*thr_func_t)(void *); + + + +/* function prototype */ +void vntsd_log(vntsd_status_t err, char *msg); +struct in_addr vntsd_ip_addr(void); + +void vntsd_get_config(vntsd_t *vntsdp); +void vntsd_daemon_wakeup(vntsd_t *vntsdp); +int vntsd_open_vcc(char *domain_name, uint_t cons_no); +void vntsd_delete_cons(vntsd_t *vntsdp); +void vntsd_clean_group(vntsd_group_t *groupp); + + +void *vntsd_listen_thread(vntsd_group_t *groupp); +void *vntsd_console_thread(vntsd_thr_arg_t *argp); +int vntsd_read(vntsd_client_t *clientp); +void *vntsd_write_thread(vntsd_cons_t *consp); + +boolean_t vntsd_cons_by_consno(vntsd_cons_t *consp, int *cons_id); + +int vntsd_que_append(vntsd_que_t **que_hd, void *handle); +int vntsd_que_rm(vntsd_que_t **que_hd, void *handle); +void *vntsd_que_find(vntsd_que_t *que_hd, compare_func_t + compare_func, void *data); +void *vntsd_que_walk(vntsd_que_t *que_hd, el_func_t el_func); + +int vntsd_que_insert_after(vntsd_que_t *que, void *handle, + void *next); +void *vntsd_que_pos(vntsd_que_t *que_hd, void *handle, int pos); +void vntsd_free_que(vntsd_que_t **q, clean_func_t clean_func); + +int vntsd_read_char(vntsd_client_t *clientp, char *c); 
+int vntsd_read_line(vntsd_client_t *clientp, char *buf, int *size); +int vntsd_read_data(vntsd_client_t *clientp, char *c); +int vntsd_get_yes_no(vntsd_client_t *clientp, char *msg, + int *yes_no); +int vntsd_ctrl_cmd(vntsd_client_t *clientp, char c); +int vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c); +int vntsd_telnet_cmd(vntsd_client_t *clientp, char c); + +int vntsd_set_telnet_options(int fd); +int vntsd_write_client(vntsd_client_t *client, char *buffer, + size_t sz); +int vntsd_write_fd(int fd, void *buffer, size_t sz); +int vntsd_write_line(vntsd_client_t *clientp, char *line); +int vntsd_write_lines(vntsd_client_t *clientp, char *lines); +extern char vntsd_eol[]; + +void vntsd_clean_group(vntsd_group_t *portp); +void vntsd_free_client(vntsd_client_t *clientp); +int vntsd_attach_timer(vntsd_timeout_t *tmop); +int vntsd_detach_timer(vntsd_timeout_t *tmop); +void vntsd_reset_timer(thread_t tid); +void vntsd_init_esctable_msgs(void); +int vntsd_vcc_ioctl(int ioctl_code, uint_t portno, void *buf); +int vntsd_vcc_err(vntsd_cons_t *consp); +int vntsd_cons_chk_intr(vntsd_client_t *clientp); +boolean_t vntsd_vcc_cons_alive(vntsd_cons_t *consp); +boolean_t vntsd_notify_client_cons_del(vntsd_client_t *clientp); +int vntsd_chk_group_total_cons(vntsd_group_t *groupp); + + +#ifdef DEBUG + +extern int vntsddbg; + +#define D1 if (vntsddbg & 0x01) (void) fprintf +#define D2 if (vntsddbg & 0x02) (void) fprintf +#define D3 if (vntsddbg & 0x04) (void) fprintf +#define DERR if (vntsddbg & 0x08) (void) fprintf + +#else /* not DEBUG */ + +#define D1 +#define D2 +#define D3 +#define DERR + +#endif /* not DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VNTSD_H */ diff --git a/usr/src/cmd/vntsd/vntsd.xml b/usr/src/cmd/vntsd/vntsd.xml new file mode 100644 index 0000000000..f5474dd807 --- /dev/null +++ b/usr/src/cmd/vntsd/vntsd.xml @@ -0,0 +1,94 @@ +<?xml version="1.0"?> +<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1"> +<!-- + 
Copyright 2006 Sun Microsystems, Inc. All rights reserved. + Use is subject to license terms. + + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. + See the License for the specific language governing permissions + and limitations under the License. + + When distributing Covered Code, include this CDDL HEADER in each + file and include the License file at usr/src/OPENSOLARIS.LICENSE. + If applicable, add the following below this CDDL HEADER, with the + fields enclosed by brackets "[]" replaced with your own identifying + information: Portions Copyright [yyyy] [name of copyright owner] + + CDDL HEADER END + + ident "%Z%%M% %I% %E% SMI" + + NOTE: This service manifest is not editable; its contents will + be overwritten by package or patch operations, including + operating system upgrade. Make customizations in a different + file. 
+--> + +<service_bundle type='manifest' name='SUNWldomu:vntsd'> + +<service + name='ldoms/vntsd' + type='service' + version='1'> + + <create_default_instance enabled='false' /> + + <dependency + name='network' + grouping='optional_all' + restart_on='error' + type='service'> + <service_fmri value='svc:/milestone/network' /> + </dependency> + + <dependency + name='syslog' + grouping='optional_all' + restart_on='none' + type='service'> + <service_fmri value='svc:/system/system-log' /> + </dependency> + + <exec_method + type='method' + name='start' + exec='/lib/svc/method/svc-vntsd' + timeout_seconds='60' /> + + <exec_method + type='method' + name='stop' + exec=':kill' + timeout_seconds='30' /> + + <!-- these are passed to vntsd in the method script --> + <property_group name='vntsd' type='application'> + <propval name='vcc_device' type='astring' + value='virtual-console-concentrator@0' /> + <propval name='listen_addr' type='astring' value='localhost' /> + <propval name='timeout_minutes' type='integer' value='0' /> + </property_group> + + <stability value='Unstable' /> + + <template> + <common_name> + <loctext xml:lang='C'> + virtual network terminal server + </loctext> + </common_name> + <documentation> + <manpage title='vntsd' section='1M' + manpath='/usr/share/man' /> + </documentation> + </template> +</service> + +</service_bundle> diff --git a/usr/src/cmd/vntsd/vntsdvcc.c b/usr/src/cmd/vntsd/vntsdvcc.c new file mode 100644 index 0000000000..9facdf7c75 --- /dev/null +++ b/usr/src/cmd/vntsd/vntsdvcc.c @@ -0,0 +1,633 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Configuration and setup interface to vcc driver. + * At intialization time, vntsd opens vcc ctrl port and read initial + * configuratioa. It manages console groups, creates the listen thread, + * dynamically adds and removes virtual console within a group. + */ + + +#include <syslog.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <wait.h> +#include <time.h> +#include <synch.h> +#include <netinet/in.h> +#include <thread.h> +#include <signal.h> +#include "vntsd.h" + +/* signal all clients that console has been deleted */ +boolean_t +vntsd_notify_client_cons_del(vntsd_client_t *clientp) +{ + (void) mutex_lock(&clientp->lock); + clientp->status |= VNTSD_CLIENT_CONS_DELETED; + (void) thr_kill(clientp->cons_tid, SIGUSR1); + (void) mutex_unlock(&clientp->lock); + return (B_FALSE); +} + +/* free console structure */ +static void +free_cons(vntsd_cons_t *consp) +{ + assert(consp); + (void) mutex_destroy(&consp->lock); + (void) cond_destroy(&consp->cvp); + free(consp); +} + +/* + * all clients connected to a console must disconnect before + * removing a console. 
+ */ +static void +cleanup_cons(vntsd_cons_t *consp) +{ + vntsd_group_t *groupp; + timestruc_t to; + + assert(consp); + D1(stderr, "t@%d vntsd_disconn_clients@%d\n", thr_self(), + consp->cons_no); + + groupp = consp->group; + assert(groupp); + + + (void) mutex_lock(&consp->lock); + + /* wait for all clients disconnect from the console */ + while (consp->clientpq != NULL) { + consp->status |= VNTSD_CONS_SIG_WAIT; + + /* signal client to disconnect the console */ + (void) vntsd_que_walk(consp->clientpq, + (el_func_t)vntsd_notify_client_cons_del); + + (void) thr_kill(consp->wr_tid, SIGUSR1); + to.tv_sec = VNTSD_CV_WAIT_DELTIME; + to.tv_nsec = 0; + + /* wait for clients to disconnect */ + (void) cond_reltimedwait(&consp->cvp, &consp->lock, &to); + } + + (void) mutex_unlock(&consp->lock); + + free_cons(consp); +} + +/* search for a group whose console is being deleted */ +static boolean_t +find_clean_cons_group(vntsd_group_t *groupp) +{ + if (groupp->status & VNTSD_GROUP_CLEAN_CONS) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* search for a console that is being deleted */ +static boolean_t +find_clean_cons(vntsd_cons_t *consp) +{ + if (consp->status & VNTSD_CONS_DELETED) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* delete a console */ +void +vntsd_delete_cons(vntsd_t *vntsdp) +{ + vntsd_group_t *groupp; + vntsd_cons_t *consp; + + for (; ; ) { + /* get the group contains deleted console */ + (void) mutex_lock(&vntsdp->lock); + groupp = vntsd_que_walk(vntsdp->grouppq, + (el_func_t)find_clean_cons_group); + if (groupp == NULL) { + /* no more group has console deleted */ + (void) mutex_unlock(&vntsdp->lock); + return; + } + groupp->status &= ~VNTSD_GROUP_CLEAN_CONS; + (void) mutex_unlock(&vntsdp->lock); + + for (; ; ) { + /* get the console to be deleted */ + (void) mutex_lock(&groupp->lock); + assert(groupp->conspq); + consp = vntsd_que_walk(groupp->conspq, + (el_func_t)find_clean_cons); + if (consp == NULL) { + /* no more cons to 
delete */ + (void) mutex_unlock(&groupp->lock); + break; + } + + /* remove console from the group */ + (void) vntsd_que_rm(&groupp->conspq, consp); + groupp->num_cons--; + (void) mutex_unlock(&groupp->lock); + + /* clean up the console */ + cleanup_cons(consp); + + /* delete group? */ + if (groupp->num_cons == 0) { + /* no more console delete it */ + assert(groupp->vntsd); + + (void) mutex_lock(&groupp->vntsd->lock); + (void) vntsd_que_rm(&groupp->vntsd->grouppq, + groupp); + (void) mutex_unlock(&groupp->vntsd->lock); + + /* clean up the group */ + vntsd_clean_group(groupp); + break; + } + } + } +} + +/* clean up a group */ +void +vntsd_clean_group(vntsd_group_t *groupp) +{ + + timestruc_t to; + + D1(stderr, "t@%d clean_group() group=%s tcp=%lld\n", thr_self(), + groupp->group_name, groupp->tcp_port); + + (void) mutex_lock(&groupp->lock); + + /* prevent from reentry */ + if (groupp->status & VNTSD_GROUP_CLEANUP) { + (void) mutex_unlock(&groupp->lock); + return; + } + groupp->status |= VNTSD_GROUP_CLEANUP; + vntsd_free_que(&groupp->conspq, (clean_func_t)cleanup_cons); + (void) mutex_unlock(&groupp->lock); + + /* walk through no cons client queue */ + while (groupp->no_cons_clientpq != NULL) { + groupp->status |= VNTSD_GROUP_SIG_WAIT; + (void) vntsd_que_walk(groupp->no_cons_clientpq, + (el_func_t)vntsd_notify_client_cons_del); + to.tv_sec = VNTSD_CV_WAIT_DELTIME; + to.tv_nsec = 0; + (void) cond_reltimedwait(&groupp->cvp, &groupp->lock, &to); + } + + if (groupp->listen_tid == thr_self()) { + /* listen thread is exiting */ + (void) mutex_lock(&(groupp->vntsd->lock)); + (void) vntsd_que_rm(&groupp->vntsd->grouppq, groupp); + (void) mutex_unlock(&groupp->vntsd->lock); + + (void) cond_destroy(&groupp->cvp); + (void) mutex_unlock(&groupp->lock); + (void) mutex_destroy(&groupp->lock); + free(groupp); + return; + } + + /* signal listen thread to exit */ + groupp->status |= VNTSD_GROUP_SIG_WAIT; + + while (groupp->status & VNTSD_GROUP_SIG_WAIT) { + (void) 
thr_kill(groupp->listen_tid, SIGUSR1); + to.tv_sec = VNTSD_CV_WAIT_DELTIME; + to.tv_nsec = 0; + /* wait listen thread to exit */ + (void) cond_reltimedwait(&groupp->cvp, &groupp->lock, &to); + } + + (void) mutex_unlock(&groupp->lock); + (void) thr_join(groupp->listen_tid, NULL, NULL); + /* free group */ + (void) cond_destroy(&groupp->cvp); + (void) mutex_destroy(&groupp->lock); + free(groupp); +} + +/* allocate and initialize console structure */ +static vntsd_cons_t * +alloc_cons(vntsd_group_t *groupp, vcc_console_t *consolep) +{ + vntsd_cons_t *consp; + int rv; + + /* allocate console */ + consp = (vntsd_cons_t *)malloc(sizeof (vntsd_cons_t)); + if (consp == NULL) { + vntsd_log(VNTSD_ERR_NO_MEM, "alloc_cons"); + return (NULL); + } + + /* intialize console */ + bzero(consp, sizeof (vntsd_cons_t)); + + (void) mutex_init(&consp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL); + (void) cond_init(&consp->cvp, USYNC_THREAD, NULL); + + consp->cons_no = consolep->cons_no; + (void) strlcpy(consp->domain_name, consolep->domain_name, MAXPATHLEN); + (void) strlcpy(consp->dev_name, consolep->dev_name, MAXPATHLEN); + consp->wr_tid = (thread_t)-1; + consp->vcc_fd = (thread_t)-1; + + /* join the group */ + (void) mutex_lock(&groupp->lock); + + if ((rv = vntsd_que_append(&groupp->conspq, consp)) != + VNTSD_SUCCESS) { + (void) mutex_unlock(&groupp->lock); + vntsd_log(rv, "alloc_cons"); + free_cons(consp); + return (NULL); + } + groupp->num_cons++; + consp->group = groupp; + + (void) mutex_unlock(&groupp->lock); + + D1(stderr, "t@%d alloc_cons@%d %s %s\n", thr_self(), + consp->cons_no, consp->domain_name, consp->dev_name); + + return (consp); +} + +/* compare tcp with group->tcp */ +static boolean_t +grp_by_tcp(vntsd_group_t *groupp, uint64_t *tcp_port) +{ + assert(groupp); + assert(tcp_port); + return (groupp->tcp_port == *tcp_port); +} + +/* allocate and initialize group */ +static vntsd_group_t * +alloc_group(vntsd_t *vntsdp, char *group_name, uint64_t tcp_port) +{ + vntsd_group_t 
*groupp; + + /* allocate group */ + groupp = (vntsd_group_t *)malloc(sizeof (vntsd_group_t)); + if (groupp == NULL) { + vntsd_log(VNTSD_ERR_NO_MEM, "alloc_group"); + return (NULL); + } + + /* initialize group */ + bzero(groupp, sizeof (vntsd_group_t)); + + (void) mutex_init(&groupp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL); + (void) cond_init(&groupp->cvp, USYNC_THREAD, NULL); + + if (group_name != NULL) { + (void) memcpy(groupp->group_name, group_name, MAXPATHLEN); + } + + groupp->tcp_port = tcp_port; + groupp->listen_tid = (thread_t)-1; + groupp->sockfd = (thread_t)-1; + groupp->vntsd = vntsdp; + + D1(stderr, "t@%d alloc_group@%lld:%s\n", thr_self(), groupp->tcp_port, + groupp->group_name); + + return (groupp); +} + +/* + * Initialize a console, if console is associated with with a + * new group, intialize the group. + */ +static int +alloc_cons_with_group(vntsd_t *vntsdp, vcc_console_t *consp, + vntsd_group_t **new_groupp) +{ + vntsd_group_t *groupp = NULL; + int rv; + + *new_groupp = NULL; + + /* match group by tcp port */ + + + (void) mutex_lock(&vntsdp->lock); + groupp = vntsd_que_find(vntsdp->grouppq, + (compare_func_t)grp_by_tcp, (void *)&(consp->tcp_port)); + (void) mutex_unlock(&vntsdp->lock); + + if (groupp != NULL) { + /* group with same tcp port found */ + + if (strcmp(groupp->group_name, consp->group_name)) { + /* conflict group name */ + vntsd_log(VNTSD_ERR_VCC_GRP_NAME, + "group name is different from existing group"); + return (VNTSD_ERR_VCC_CTRL_DATA); + } + + } else { + /* new group */ + groupp = alloc_group(vntsdp, consp->group_name, + consp->tcp_port); + if (groupp == NULL) { + return (VNTSD_ERR_NO_MEM); + } + + assert(groupp->conspq == NULL); + /* queue group to vntsdp */ + (void) mutex_lock(&vntsdp->lock); + rv = vntsd_que_append(&vntsdp->grouppq, groupp); + (void) mutex_unlock(&vntsdp->lock); + + if (rv != VNTSD_SUCCESS) { + return (rv); + } + + *new_groupp = groupp; + } + + /* intialize console */ + if (alloc_cons(groupp, consp) == NULL) { 
+ /* no memory */ + if (new_groupp != NULL) { + /* clean up new group */ + (void) cond_destroy(&groupp->cvp); + (void) mutex_destroy(&groupp->lock); + free(groupp); + } + + return (VNTSD_ERR_NO_MEM); + } + + return (VNTSD_SUCCESS); + +} + + +/* create listen thread */ +static boolean_t +create_listen_thread(vntsd_group_t *groupp) +{ + + char err_msg[VNTSD_LINE_LEN]; + int rv; + + assert(groupp); + + (void) mutex_lock(&groupp->lock); + assert(groupp->num_cons); + + D1(stderr, "t@%d create_listen:%lld\n", thr_self(), groupp->tcp_port); + + if ((rv = thr_create(NULL, 0, (thr_func_t)vntsd_listen_thread, + (void *)groupp, THR_DETACHED, &groupp->listen_tid)) + != 0) { + (void) (void) snprintf(err_msg, sizeof (err_msg), + "Can not create listen thread for" + "group %s tcp %llx\n", groupp->group_name, + groupp->tcp_port); + vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, err_msg); + + /* clean up group queue */ + vntsd_free_que(&groupp->conspq, (clean_func_t)free_cons); + groupp->listen_tid = (thread_t)-1; + } + + (void) mutex_unlock(&groupp->lock); + + return (rv != 0); +} + +/* delete a console if the console exists in the vntsd */ +static void +delete_cons_before_add(vntsd_t *vntsdp, uint64_t tcp_port, uint_t cons_no) +{ + vntsd_group_t *groupp; + vntsd_cons_t *consp; + + /* group exists? */ + (void) mutex_lock(&vntsdp->lock); + groupp = vntsd_que_find(vntsdp->grouppq, (compare_func_t)grp_by_tcp, + (void *)&(tcp_port)); + (void) mutex_unlock(&vntsdp->lock); + + if (groupp == NULL) { + /* no such group */ + return; + } + + /* group exists, if console exists? 
*/ + (void) mutex_lock(&groupp->lock); + consp = vntsd_que_find(groupp->conspq, + (compare_func_t)vntsd_cons_by_consno, &cons_no); + + if (consp == NULL) { + /* no such console */ + (void) mutex_unlock(&groupp->lock); + return; + } + /* console exists - delete console */ + + (void) mutex_lock(&consp->lock); + + consp->status |= VNTSD_CONS_DELETED; + groupp->status |= VNTSD_GROUP_CLEAN_CONS; + + (void) mutex_unlock(&consp->lock); + + (void) mutex_unlock(&groupp->lock); + + vntsd_delete_cons(vntsdp); +} + +/* add a console */ +static void +do_add_cons(vntsd_t *vntsdp, int cons_no) +{ + vcc_console_t console; + vntsd_group_t *groupp; + int rv; + char err_msg[VNTSD_LINE_LEN]; + + + (void) snprintf(err_msg, sizeof (err_msg), + "do_add_cons():Can not add console=%d", cons_no); + + /* get console configuration from vcc */ + + if ((rv = vntsd_vcc_ioctl(VCC_CONS_INFO, cons_no, (void *)&console)) + != VNTSD_SUCCESS) { + vntsd_log(rv, err_msg); + return; + } + + /* clean up the console if console was deleted and added again */ + delete_cons_before_add(vntsdp, console.tcp_port, console.cons_no); + + /* initialize console */ + + if ((rv = alloc_cons_with_group(vntsdp, &console, &groupp)) != + VNTSD_SUCCESS) { + /* no memory to add this new console */ + vntsd_log(rv, err_msg); + return; + } + + if (groupp != NULL) { + /* new group */ + /* create listen thread for this console */ + if (create_listen_thread(groupp)) { + vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, err_msg); + (void) cond_destroy(&groupp->cvp); + (void) mutex_destroy(&groupp->lock); + free(groupp); + } + + } +} + +/* daemon wake up */ +void +vntsd_daemon_wakeup(vntsd_t *vntsdp) +{ + + vcc_response_t inq_data; + + /* reason to wake up */ + if (vntsd_vcc_ioctl(VCC_INQUIRY, 0, (void *)&inq_data) != + VNTSD_SUCCESS) { + vntsd_log(VNTSD_ERR_VCC_IOCTL, "vntsd_daemon_wakeup()"); + return; + } + + D1(stderr, "t@%d vntsd_daemon_wakup:msg %d port %x\n", thr_self(), + inq_data.reason, inq_data.cons_no); + + switch (inq_data.reason) 
{ + + case VCC_CONS_ADDED: + do_add_cons(vntsdp, inq_data.cons_no); + break; + + default: + DERR(stderr, "t@%d daemon_wakeup:ioctl_unknown %d\n", + thr_self(), inq_data.reason); + vntsd_log(VNTSD_ERR_UNKNOWN_CMD, "from vcc\n"); + break; + } +} + +/* initial console configuration */ +void +vntsd_get_config(vntsd_t *vntsdp) +{ + + int i; + int num_cons; + vcc_console_t *consp; + vntsd_group_t *groupp; + + /* num of consoles */ + num_cons = 0; + + if (vntsd_vcc_ioctl(VCC_NUM_CONSOLE, 0, (void *)&num_cons) != + VNTSD_SUCCESS) { + vntsd_log(VNTSD_ERR_VCC_IOCTL, "VCC_NUM_CONSOLE failed\n"); + return; + } + + D3(stderr, "get_config:num_cons=%d", num_cons); + + if (num_cons == 0) { + return; + } + + /* allocate memory for all consoles */ + consp = malloc(num_cons*sizeof (vcc_console_t)); + + if (consp == NULL) { + vntsd_log(VNTSD_ERR_NO_MEM, "for console table."); + return; + } + + /* get console table */ + if (vntsd_vcc_ioctl(VCC_CONS_TBL, 0, (void *)consp) != VNTSD_SUCCESS) { + vntsd_log(VNTSD_ERR_VCC_IOCTL, " VCC_CONS_TBL " + "for console table\n"); + return; + } + + /* intialize groups and consoles */ + for (i = 0; i < num_cons; i++) { + if (alloc_cons_with_group(vntsdp, &consp[i], &groupp) + != VNTSD_SUCCESS) { + vntsd_log(VNTSD_ERR_ADD_CONS_FAILED, "get_config"); + } + } + + /* create listen thread for each group */ + (void) mutex_lock(&vntsdp->lock); + + for (; ; ) { + groupp = vntsd_que_walk(vntsdp->grouppq, + (el_func_t)create_listen_thread); + if (groupp == NULL) { + break; + } + vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, "get config()"); + } + + (void) mutex_unlock(&vntsdp->lock); +} diff --git a/usr/src/cmd/vntsd/write.c b/usr/src/cmd/vntsd/write.c new file mode 100644 index 0000000000..16f07029c5 --- /dev/null +++ b/usr/src/cmd/vntsd/write.c @@ -0,0 +1,251 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * write thread - read from vcc console and write to tcp client. There are one + * writer and multiple readers per console. The first client who connects to + * a console get write access. + * Writer thread writes vcc data to all tcp clients that connected to + * the console. + */ + +#include <stdio.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <thread.h> +#include <synch.h> +#include <signal.h> +#include <assert.h> +#include <poll.h> +#include <syslog.h> +#include <libintl.h> +#include "vntsd.h" +#include "chars.h" + +/* + * check the state of write thread. exit if no more client connects to the + * console. 
+ */ +static void +write_chk_status(vntsd_cons_t *consp, int status) +{ + + if ((consp->status & VNTSD_CONS_DELETED) || (consp->clientpq == NULL)) { + thr_exit(0); + } + + switch (status) { + case VNTSD_STATUS_VCC_IO_ERR: + assert(consp->group != NULL); + if (vntsd_vcc_err(consp) != VNTSD_STATUS_CONTINUE) { + thr_exit(0); + } + break; + case VNTSD_STATUS_INTR: + thr_exit(0); + default: + break; + + } +} + +/* + * skip_terminal_null() + * scan terminal null character sequence (0x5e 0x40) + * return number of characters in the buf after skipping terminal null + * sequence. + */ +static int +skip_terminal_null(char *buf, int buf_sz, int sz) +{ + int i, j; + static int term_null_seq = 0; + + assert(sz >= 0); + + if (buf_sz < sz+1) { + return (-1); + } + + if (term_null_seq) { + /* skip 0x5e previously */ + term_null_seq = 0; + + if (buf[0] != 0x40) { + /* not terminal null sequence put 0x5e back */ + for (i = sz; i > 0; i--) { + buf[i] = buf[i-1]; + } + + buf[0] = 0x5e; + + sz++; + } else { + /* skip terminal null sequence */ + sz--; + + if (sz == 0) { + return (sz); + } + + for (i = 0; i < sz; i++) { + buf[i] = buf[i+1]; + } + } + } + + for (; ; ) { + for (i = 0; i < sz; i++) { + if (buf[i] == '\0') { + return (i); + } + + if (buf[i] == 0x5e) { + /* possible terminal null sequence */ + if (i == sz -1) { + /* last character in buffer */ + term_null_seq = 1; + sz--; + buf[i] = 0; + return (sz); + } + + if (buf[i+1] == 0x40) { + /* found terminal null sequence */ + sz -= 2; + for (j = i; j < sz -i; j++) { + buf[j] = buf[j+2]; + } + break; + } + + if (buf[i+1] == '\0') { + buf[i] = 0; + term_null_seq = 1; + return (i); + } + + } + } + + if (i == sz) { + /* end of scan */ + return (sz); + } + } +} + +/* read data from vcc */ +static int +read_vcc(vntsd_cons_t *consp, char *buf, ssize_t *sz) +{ + /* read from vcc */ + *sz = read(consp->vcc_fd, buf, VNTSD_MAX_BUF_SIZE); + + if (errno == EINTR) { + return (VNTSD_STATUS_INTR); + } + + if ((*sz > 0)) { + return (VNTSD_SUCCESS); 
+ } + return (VNTSD_STATUS_VCC_IO_ERR); +} + +static int s_sz; +/* write to a client */ +static boolean_t +write_all_clients(vntsd_client_t *clientp, char *buf) +{ + int rv; + + rv = vntsd_write_client(clientp, buf, s_sz); + if (rv != VNTSD_SUCCESS) { + (void) mutex_lock(&clientp->lock); + clientp->status |= VNTSD_CLIENT_IO_ERR; + assert(clientp->cons); + (void) thr_kill(clientp->cons_tid, NULL); + (void) mutex_unlock(&clientp->lock); + } + return (B_FALSE); + +} + +/* vntsd_write_thread() */ +void* +vntsd_write_thread(vntsd_cons_t *consp) +{ + char buf[VNTSD_MAX_BUF_SIZE+1]; + int sz; + int rv; + + D1(stderr, "t@%d vntsd_write@%d\n", thr_self(), consp->vcc_fd); + + assert(consp); + write_chk_status(consp, VNTSD_SUCCESS); + + for (; ; ) { + bzero(buf, VNTSD_MAX_BUF_SIZE +1); + + /* read data */ + rv = read_vcc(consp, buf, &sz); + + write_chk_status(consp, rv); + + if (sz <= 0) { + continue; + } + + /* has data */ + if ((s_sz = skip_terminal_null(buf, sz+1, sz)) == 0) { + /* terminal null sequence */ + continue; + } + + assert(s_sz > 0); + + /* + * output data to all clients connected + * to this console + */ + + (void) mutex_lock(&consp->lock); + (void) vntsd_que_find(consp->clientpq, + (compare_func_t)write_all_clients, buf); + (void) mutex_unlock(&consp->lock); + + write_chk_status(consp, VNTSD_SUCCESS); + + } + + /*NOTREACHED*/ + return (NULL); +} diff --git a/usr/src/common/mdesc/mdesc_diff.c b/usr/src/common/mdesc/mdesc_diff.c new file mode 100644 index 0000000000..28f55abc92 --- /dev/null +++ b/usr/src/common/mdesc/mdesc_diff.c @@ -0,0 +1,602 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#ifdef _KERNEL +#include <sys/systm.h> +#else /* _KERNEL */ +#include <string.h> +#include <strings.h> +#endif /* _KERNEL */ +#include <sys/note.h> +#include <sys/mdesc.h> +#include <sys/mdesc_impl.h> + +#define MDD_FREE_CHECK(mdp, ptr, sz) \ + do { \ + if (ptr) mdp->freep(ptr, sz); \ + _NOTE(CONSTCOND) } while (0) + +#define MD_DIFF_MAGIC 0x4D445F4449464621ull /* 'MD_DIFF!' 
*/ +#define MD_DIFF_NOMATCH (-1) +#define MD_DIFF_MATCH (1) + +typedef struct { + mde_cookie_t *mdep; + uint_t nelem; +} md_diff_t; + +typedef struct { + uint64_t mdd_magic; + md_diff_t added; + md_diff_t removed; + md_diff_t match1; + md_diff_t match2; + void *(*allocp)(size_t); + void (*freep)(void *, size_t); +} md_diff_impl_t; + +/* + * Internal utility functions + */ +static int mdd_scan_for_nodes(md_t *mdp, mde_cookie_t start, + char *compnodep, int *countp, mde_cookie_t **nodespp); + +static boolean_t mdd_any_dup_nodes(md_impl_t *mdp, md_prop_match_t *pmp, + int count, mde_cookie_t *nodesp); + +static int mdd_node_list_match(md_impl_t *md1, md_impl_t *md2, + md_element_t *match_nodep, mde_cookie_t *match_listp, + uint8_t *match_seenp, int start, int end, md_prop_match_t *match_elemsp); + +static int mdd_node_compare(md_impl_t *mdap, md_impl_t *mdbp, + md_element_t *nodeap, md_element_t *nodebp, md_prop_match_t *match_elemsp); + +/* + * Given two DAGs and information about how to uniquely identify + * the nodes of interest, determine which nodes have been added + * to the second MD, removed from the first MD, or exist in both + * MDs. This information is recorded and can be accessed using the + * opaque cookie returned to the caller. 
+ */ +md_diff_cookie_t +md_diff_init(md_t *md1p, mde_cookie_t start1, md_t *md2p, mde_cookie_t start2, + char *compnodep, md_prop_match_t *match_fieldsp) +{ + int idx; + md_impl_t *md1 = (md_impl_t *)md1p; + md_impl_t *md2 = (md_impl_t *)md2p; + mde_cookie_t *md1nodesp = NULL; + mde_cookie_t *md2nodesp = NULL; + int md1count = 0; + int md2count = 0; + uint8_t *seenp = NULL; + + /* variables used to gather results */ + md_diff_impl_t *diff_res; + mde_cookie_t *mde_add_scr; + mde_cookie_t *mde_rem_scr; + mde_cookie_t *mde_match1_scr; + mde_cookie_t *mde_match2_scr; + int nadd = 0; + int nrem = 0; + int nmatch = 0; + + /* sanity check params */ + if ((md1p == NULL) || (md2p == NULL)) + return (MD_INVAL_DIFF_COOKIE); + + if ((start1 == MDE_INVAL_ELEM_COOKIE) || + (start2 == MDE_INVAL_ELEM_COOKIE)) + return (MD_INVAL_DIFF_COOKIE); + + if ((compnodep == NULL) || (match_fieldsp == NULL)) + return (MD_INVAL_DIFF_COOKIE); + + /* + * Prepare an array of the matching nodes from the first MD. + */ + if (mdd_scan_for_nodes(md1p, + start1, compnodep, &md1count, &md1nodesp) == -1) + return (MD_INVAL_DIFF_COOKIE); + + /* sanity check that all nodes are unique */ + if (md1nodesp && + mdd_any_dup_nodes(md1, match_fieldsp, md1count, md1nodesp)) { + MDD_FREE_CHECK(md1, md1nodesp, sizeof (mde_cookie_t) * + md1count); + return (MD_INVAL_DIFF_COOKIE); + } + + + /* + * Prepare an array of the matching nodes from the second MD. 
+ */ + if (mdd_scan_for_nodes(md2p, + start2, compnodep, &md2count, &md2nodesp) == -1) + return (MD_INVAL_DIFF_COOKIE); + + /* sanity check that all nodes are unique */ + if (md2nodesp && + mdd_any_dup_nodes(md2, match_fieldsp, md2count, md2nodesp)) { + MDD_FREE_CHECK(md1, md1nodesp, sizeof (mde_cookie_t) * + md1count); + MDD_FREE_CHECK(md2, md2nodesp, sizeof (mde_cookie_t) * + md2count); + return (MD_INVAL_DIFF_COOKIE); + } + + /* setup our result structure */ + diff_res = md1->allocp(sizeof (md_diff_impl_t)); + bzero(diff_res, sizeof (md_diff_impl_t)); + diff_res->allocp = md1->allocp; + diff_res->freep = md1->freep; + diff_res->mdd_magic = MD_DIFF_MAGIC; + + /* + * Special cases for empty lists + */ + if ((md1count == 0) && (md2count != 0)) { + /* all the nodes found were added */ + diff_res->added.mdep = md2nodesp; + diff_res->added.nelem = md2count; + return ((mde_cookie_t)diff_res); + } + + if ((md1count != 0) && (md2count == 0)) { + /* all the nodes found were removed */ + diff_res->removed.mdep = md1nodesp; + diff_res->removed.nelem = md1count; + return ((mde_cookie_t)diff_res); + } + + if ((md1count == 0) && (md2count == 0)) + /* no nodes found */ + return ((mde_cookie_t)diff_res); + + /* + * Both lists have some elements. Allocate some scratch + * buffers to sort them into our three categories, added, + * removed, and matched pairs. + */ + mde_add_scr = diff_res->allocp(sizeof (mde_cookie_t) * md2count); + mde_rem_scr = diff_res->allocp(sizeof (mde_cookie_t) * md1count); + mde_match1_scr = diff_res->allocp(sizeof (mde_cookie_t) * md1count); + mde_match2_scr = diff_res->allocp(sizeof (mde_cookie_t) * md2count); + + /* array of seen flags only needed for md2 */ + seenp = (uint8_t *)diff_res->allocp(sizeof (uint8_t) * md2count); + bzero(seenp, sizeof (uint8_t) * md2count); + + /* + * Make a pass through the md1 node array. Make note of + * any nodes not in the md2 array, indicating that they + * have been removed. 
Also keep track of the nodes that + * are present in both arrays for the matched pair results. + */ + for (idx = 0; idx < md1count; idx++) { + + md_element_t *elem = &(md1->mdep[md1nodesp[idx]]); + + int match = mdd_node_list_match(md1, md2, elem, md2nodesp, + seenp, 0, md2count - 1, match_fieldsp); + + if (match == MD_DIFF_NOMATCH) + /* record deleted node */ + mde_rem_scr[nrem++] = md1nodesp[idx]; + else { + /* record matched node pair */ + mde_match1_scr[nmatch] = md1nodesp[idx]; + mde_match2_scr[nmatch] = md2nodesp[match]; + nmatch++; + + /* mark that this match has been recorded */ + seenp[match] = 1; + } + } + + /* + * Make a pass through the md2 array. Any nodes that have + * not been marked as seen have been added. + */ + for (idx = 0; idx < md2count; idx++) { + if (!seenp[idx]) + /* record added node */ + mde_add_scr[nadd++] = md2nodesp[idx]; + } + + /* fill in the added node list */ + if (nadd) { + int addsz = sizeof (mde_cookie_t) * nadd; + diff_res->added.mdep = (mde_cookie_t *)diff_res->allocp(addsz); + + bcopy(mde_add_scr, diff_res->added.mdep, addsz); + + diff_res->added.nelem = nadd; + } + + /* fill in the removed node list */ + if (nrem) { + int remsz = sizeof (mde_cookie_t) * nrem; + diff_res->removed.mdep = + (mde_cookie_t *)diff_res->allocp(remsz); + + bcopy(mde_rem_scr, diff_res->removed.mdep, remsz); + diff_res->removed.nelem = nrem; + } + + /* fill in the matching node lists */ + if (nmatch) { + int matchsz = sizeof (mde_cookie_t) * nmatch; + diff_res->match1.mdep = + (mde_cookie_t *)diff_res->allocp(matchsz); + diff_res->match2.mdep = + (mde_cookie_t *)diff_res->allocp(matchsz); + + bcopy(mde_match1_scr, diff_res->match1.mdep, matchsz); + bcopy(mde_match2_scr, diff_res->match2.mdep, matchsz); + diff_res->match1.nelem = nmatch; + diff_res->match2.nelem = nmatch; + } + + /* clean up */ + md1->freep(md1nodesp, sizeof (mde_cookie_t) * md1count); + md2->freep(md2nodesp, sizeof (mde_cookie_t) * md2count); + + diff_res->freep(mde_add_scr, sizeof 
(mde_cookie_t) * md2count); + diff_res->freep(mde_rem_scr, sizeof (mde_cookie_t) * md1count); + diff_res->freep(mde_match1_scr, sizeof (mde_cookie_t) * md1count); + diff_res->freep(mde_match2_scr, sizeof (mde_cookie_t) * md2count); + + diff_res->freep(seenp, sizeof (uint8_t) * md2count); + + return ((md_diff_cookie_t)diff_res); +} + +/* + * Returns an array of the nodes added to the second MD in a + * previous md_diff_init() call. Returns the number of elements + * in the returned array. If the value is zero, the pointer + * passed back will be NULL. + */ +int +md_diff_added(md_diff_cookie_t mdd, mde_cookie_t **mde_addedp) +{ + md_diff_impl_t *mddp = (md_diff_impl_t *)mdd; + + if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC)) + return (-1); + + *mde_addedp = mddp->added.mdep; + + return (mddp->added.nelem); +} + +/* + * Returns an array of the nodes removed from the first MD in a + * previous md_diff_init() call. Returns the number of elements + * in the returned array. If the value is zero, the pointer + * passed back will be NULL. + */ +int +md_diff_removed(md_diff_cookie_t mdd, mde_cookie_t **mde_removedp) +{ + md_diff_impl_t *mddp = (md_diff_impl_t *)mdd; + + if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC)) + return (-1); + + *mde_removedp = mddp->removed.mdep; + + return (mddp->removed.nelem); +} + +/* + * Returns a pair of parallel arrays that contain nodes that were + * considered matching based on the match criteria passed in to + * a previous md_diff_init() call. Returns the number of elements + * in the arrays. If the value is zero, both pointers passed back + * will be NULL. 
+ */ +int +md_diff_matched(md_diff_cookie_t mdd, mde_cookie_t **mde_match1p, + mde_cookie_t **mde_match2p) +{ + md_diff_impl_t *mddp = (md_diff_impl_t *)mdd; + + if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC)) + return (-1); + + *mde_match1p = mddp->match1.mdep; + *mde_match2p = mddp->match2.mdep; + + return (mddp->match1.nelem); +} + +/* + * Deallocate any storage used to store results of a previous + * md_diff_init() call. Returns 0 on success and -1 on failure. + */ +int +md_diff_fini(md_diff_cookie_t mdd) +{ + md_diff_impl_t *mddp = (md_diff_impl_t *)mdd; + + if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC)) + return (-1); + + mddp->mdd_magic = 0; + + MDD_FREE_CHECK(mddp, mddp->added.mdep, mddp->added.nelem * + sizeof (mde_cookie_t)); + + MDD_FREE_CHECK(mddp, mddp->removed.mdep, mddp->removed.nelem * + sizeof (mde_cookie_t)); + + MDD_FREE_CHECK(mddp, mddp->match1.mdep, mddp->match1.nelem * + sizeof (mde_cookie_t)); + + MDD_FREE_CHECK(mddp, mddp->match2.mdep, mddp->match2.nelem * + sizeof (mde_cookie_t)); + + mddp->freep(mddp, sizeof (md_diff_impl_t)); + + return (0); +} + +/* + * Walk the "fwd" DAG in an MD and return an array of nodes that are + * of the specified type. The start param is used to start the walk + * from an arbitrary location in the DAG. Returns an array of nodes + * as well as a count of the number of nodes in the array. If the + * count is zero, the node pointer will be passed back as NULL. 
+ * + * Returns: 0 success; -1 failure + */ +static int +mdd_scan_for_nodes(md_t *mdp, + mde_cookie_t start, char *compnodep, int *countp, mde_cookie_t **nodespp) +{ + mde_str_cookie_t cname; + mde_str_cookie_t aname; + md_impl_t *mdip = (md_impl_t *)mdp; + + if (mdip == NULL) + return (-1); + + cname = md_find_name(mdp, compnodep); + aname = md_find_name(mdp, "fwd"); + + /* get the number of nodes of interest in the DAG */ + *countp = md_scan_dag(mdp, start, cname, aname, NULL); + if (*countp == 0) { + *nodespp = NULL; + return (0); + } + + /* allocate the storage */ + *nodespp = mdip->allocp(sizeof (mde_cookie_t) * (*countp)); + + /* populate our array with the matching nodes */ + (void) md_scan_dag(mdp, start, cname, aname, *nodespp); + + return (0); +} + +/* + * Walk an array of nodes and check if there are any duplicate + * nodes. A duplicate is determined based on the specified match + * criteria. Returns B_TRUE if there are any duplicates and B_FALSE + * otherwise. + */ +static boolean_t +mdd_any_dup_nodes(md_impl_t *mdp, md_prop_match_t *pmp, int count, + mde_cookie_t *nodesp) +{ + int idx; + int match; + md_element_t *elem; + + ASSERT(count > 0 || nodesp == NULL); + + for (idx = 0; idx < count; idx++) { + elem = &(mdp->mdep[nodesp[idx]]); + + match = mdd_node_list_match(mdp, mdp, elem, nodesp, NULL, + idx + 1, count - 1, pmp); + + if (match != MD_DIFF_NOMATCH) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Given a node and a array of nodes, compare the node to all elements + * in the specified start-end range of the array. If the node matches + * one of the nodes in the array, return the index of that node. Otherwise + * return MD_DIFF_NOMATCH. + * + * The optional seen array parameter can be used to optimize repeated + * calls to this function. If the seen array indicates that an element + * has already been matched, the full comparison is not necessary. 
+ */ +static int +mdd_node_list_match(md_impl_t *md1, md_impl_t *md2, md_element_t *match_nodep, + mde_cookie_t *match_listp, uint8_t *match_seenp, int start, int end, + md_prop_match_t *match_elemsp) +{ + int match; + int idx; + md_element_t *elem; + + for (idx = start; idx <= end; idx++) { + + if ((match_seenp != NULL) && (match_seenp[idx])) + continue; + + elem = &(md2->mdep[match_listp[idx]]); + + match = mdd_node_compare(md1, md2, match_nodep, elem, + match_elemsp); + if (match == MD_DIFF_MATCH) + return (idx); + } + + return (MD_DIFF_NOMATCH); +} + +/* + * Given two nodes and a list of properties, compare the nodes. + * A match is concluded if both nodes have all of the specified + * properties and all the values of those properties are the + * same. Returns MD_DIFF_NOMATCH if the nodes do not match and + * MD_DIFF_MATCH otherwise. + */ +static int +mdd_node_compare(md_impl_t *mdap, md_impl_t *mdbp, md_element_t *nodeap, + md_element_t *nodebp, md_prop_match_t *match_elemsp) +{ + md_element_t *ap; + md_element_t *bp; + boolean_t nodea_interest; + boolean_t nodeb_interest; + int idx; + + /* make sure we are starting at the beginning of the nodes */ + if ((MDE_TAG(nodeap) != MDET_NODE) || (MDE_TAG(nodebp) != MDET_NODE)) + return (MD_DIFF_NOMATCH); + + for (idx = 0; match_elemsp[idx].type != MDET_LIST_END; idx++) { + + int type; + + nodea_interest = B_FALSE; + nodeb_interest = B_FALSE; + + type = match_elemsp[idx].type; + + /* + * Check node A for the property of interest + */ + for (ap = nodeap; MDE_TAG(ap) != MDET_NODE_END; ap++) { + char *elemname; + + if (MDE_TAG(ap) != type) + continue; + + elemname = mdap->namep + MDE_NAME(ap); + + if (strcmp(elemname, match_elemsp[idx].namep) == 0) { + /* found the property of interest */ + nodea_interest = B_TRUE; + break; + } + } + + /* node A is not of interest */ + if (!nodea_interest) + return (MD_DIFF_NOMATCH); + + /* + * Check node B for the property of interest + */ + for (bp = nodebp; MDE_TAG(bp) != 
MDET_NODE_END; bp++) { + char *elemname; + + if (MDE_TAG(bp) != type) + continue; + + elemname = mdbp->namep + MDE_NAME(bp); + + if (strcmp(elemname, match_elemsp[idx].namep) == 0) { + nodeb_interest = B_TRUE; + break; + } + } + + /* node B is not of interest */ + if (!nodeb_interest) + return (MD_DIFF_NOMATCH); + + /* + * Both nodes have the property of interest. The + * nodes are not a match unless the value of that + * property match + */ + switch (type) { + case MDET_PROP_VAL: + if (MDE_PROP_VALUE(ap) != MDE_PROP_VALUE(bp)) + return (MD_DIFF_NOMATCH); + break; + + case MDET_PROP_STR: { + char *stra = (char *)(mdap->datap + + MDE_PROP_DATA_OFFSET(ap)); + char *strb = (char *)(mdbp->datap + + MDE_PROP_DATA_OFFSET(bp)); + + if (strcmp(stra, strb) != 0) + return (MD_DIFF_NOMATCH); + break; + } + + case MDET_PROP_DAT: { + + caddr_t dataa; + caddr_t datab; + + if (MDE_PROP_DATA_LEN(ap) != MDE_PROP_DATA_LEN(bp)) + return (MD_DIFF_NOMATCH); + + dataa = (caddr_t)(mdap->datap + + MDE_PROP_DATA_OFFSET(ap)); + datab = (caddr_t)(mdbp->datap + + MDE_PROP_DATA_OFFSET(bp)); + + if (memcmp(dataa, datab, MDE_PROP_DATA_LEN(ap)) != 0) + return (MD_DIFF_NOMATCH); + + break; + } + + default: + /* unsupported prop type */ + return (MD_DIFF_NOMATCH); + } + } + + /* + * All the specified properties exist in both + * nodes and have the same value. The two nodes + * match. + */ + + return (MD_DIFF_MATCH); +} diff --git a/usr/src/common/mdesc/mdesc_fini.c b/usr/src/common/mdesc/mdesc_fini.c index f0b010b386..70340e0b8e 100644 --- a/usr/src/common/mdesc/mdesc_fini.c +++ b/usr/src/common/mdesc/mdesc_fini.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,10 @@ #include <sys/mdesc.h> #include <sys/mdesc_impl.h> +/* + * Cleanup the internal MD structure. Does not + * deallocate the buffer holding the MD. + */ int md_fini(md_t *ptr) { diff --git a/usr/src/common/mdesc/mdesc_getbinsize.c b/usr/src/common/mdesc/mdesc_getbinsize.c new file mode 100644 index 0000000000..e672806b1f --- /dev/null +++ b/usr/src/common/mdesc/mdesc_getbinsize.c @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mdesc.h> +#include <sys/mdesc_impl.h> + +size_t +md_get_bin_size(md_t *ptr) +{ + md_impl_t *mdp; + + mdp = (md_impl_t *)ptr; + + if (mdp == NULL) + return (0); + + return (mdp->size); +} diff --git a/usr/src/common/mdesc/mdesc_getgen.c b/usr/src/common/mdesc/mdesc_getgen.c new file mode 100644 index 0000000000..691343d772 --- /dev/null +++ b/usr/src/common/mdesc/mdesc_getgen.c @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mdesc.h> +#include <sys/mdesc_impl.h> + +uint64_t +md_get_gen(md_t *ptr) +{ + md_impl_t *mdp; + + mdp = (md_impl_t *)ptr; + + if (mdp == NULL) + return (MDESC_INVAL_GEN); + + return (mdp->gen); +} diff --git a/usr/src/common/mdesc/mdesc_init_intern.c b/usr/src/common/mdesc/mdesc_init_intern.c index d4a9226e9e..c2bad6def2 100644 --- a/usr/src/common/mdesc/mdesc_init_intern.c +++ b/usr/src/common/mdesc/mdesc_init_intern.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,23 +32,25 @@ #include <sys/mdesc_impl.h> md_t * -md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), - void (*freep)(void *, size_t)) +md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), + void (*freep)(void *, size_t)) { md_impl_t *mdp; int idx; int count; int done; + uint64_t gen; mde_str_cookie_t root_name; /* * Very basic checkup for alignment to avoid * bus error issues. 
*/ - if ((((uintptr_t)ptr)&7) != 0) + if ((((uintptr_t)ptr) & 7) != 0) return (NULL); mdp = (md_impl_t *)allocp(sizeof (md_impl_t)); + if (mdp == NULL) return (NULL); @@ -60,6 +62,7 @@ md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), /* * setup internal structures */ + mdp->headerp = (md_header_t *)mdp->caddr; if (mdtoh32(mdp->headerp->transport_version) != MD_TRANSPORT_VERSION) { @@ -70,13 +73,13 @@ md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), mdp->name_blk_size = mdtoh32(mdp->headerp->name_blk_sz); mdp->data_blk_size = mdtoh32(mdp->headerp->data_blk_sz); - mdp->size = MD_HEADER_SIZE+mdp->node_blk_size+ - mdp->name_blk_size+mdp->data_blk_size; + mdp->size = MD_HEADER_SIZE + mdp->node_blk_size + + mdp->name_blk_size + mdp->data_blk_size; - mdp->mdep = (md_element_t *)(mdp->caddr+MD_HEADER_SIZE); - mdp->namep = (char *)(mdp->caddr+MD_HEADER_SIZE+mdp->node_blk_size); - mdp->datap = (uint8_t *)(mdp->caddr+MD_HEADER_SIZE+mdp->name_blk_size+ - mdp->node_blk_size); + mdp->mdep = (md_element_t *)(mdp->caddr + MD_HEADER_SIZE); + mdp->namep = (char *)(mdp->caddr + MD_HEADER_SIZE + mdp->node_blk_size); + mdp->datap = (uint8_t *)(mdp->caddr + MD_HEADER_SIZE + + mdp->name_blk_size + mdp->node_blk_size); mdp->root_node = MDE_INVAL_ELEM_COOKIE; @@ -123,7 +126,7 @@ md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), mdp->root_node = (mde_cookie_t)idx; } idx = MDE_PROP_INDEX(np); - count ++; + count++; break; default: @@ -142,25 +145,35 @@ md_init_intern(uint64_t *ptr, void *(*allocp)(size_t), * Register the counts */ - mdp->element_count = idx+1; /* include LIST_END */ + mdp->element_count = idx + 1; /* include LIST_END */ mdp->node_count = count; /* * Final sanity check that everything adds up */ - if (mdp->element_count != (mdp->node_blk_size/MD_ELEMENT_SIZE)) + if (mdp->element_count != (mdp->node_blk_size / MD_ELEMENT_SIZE)) goto cleanup; mdp->md_magic = LIBMD_MAGIC; + /* + * Setup MD generation + */ + if (md_get_prop_val((md_t *)mdp, mdp->root_node, + 
"md-generation#", &gen) != 0) + mdp->gen = MDESC_INVAL_GEN; + else + mdp->gen = gen; + return ((md_t *)mdp); -cleanup:; +cleanup: /* * Clean up here - including a name hash if * we build one. */ -cleanup_nohash:; + +cleanup_nohash: mdp->freep(mdp, sizeof (md_impl_t)); return (NULL); } diff --git a/usr/src/common/mdesc/mdesc_rootnode.c b/usr/src/common/mdesc/mdesc_rootnode.c index 551130fc72..364c5004e2 100644 --- a/usr/src/common/mdesc/mdesc_rootnode.c +++ b/usr/src/common/mdesc/mdesc_rootnode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,7 +38,7 @@ md_root_node(md_t *ptr) mdp = (md_impl_t *)ptr; if (mdp->md_magic != LIBMD_MAGIC) - return (-1); + return (MDE_INVAL_ELEM_COOKIE); return (mdp->root_node); } diff --git a/usr/src/common/mdesc/mdesc_scandag.c b/usr/src/common/mdesc/mdesc_scandag.c index 11b4e24ab2..ad1c74c9c2 100644 --- a/usr/src/common/mdesc/mdesc_scandag.c +++ b/usr/src/common/mdesc/mdesc_scandag.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -121,7 +121,8 @@ md_scan_dag(md_t *ptr, -static int mdl_scan_dag(md_impl_t *mdp, +static int +mdl_scan_dag(md_impl_t *mdp, int nodeidx, mde_str_cookie_t node_name_cookie, mde_str_cookie_t arc_name_cookie, diff --git a/usr/src/lib/libpcp/common/libpcp.c b/usr/src/lib/libpcp/common/libpcp.c index 9d32387ff0..6c9eb09263 100644 --- a/usr/src/lib/libpcp/common/libpcp.c +++ b/usr/src/lib/libpcp/common/libpcp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -48,6 +47,8 @@ #include <sys/types.h> #include <sys/stat.h> #include <sys/glvc.h> +#include <sys/vldc.h> +#include <sys/ldc.h> #include <netinet/in.h> #include "libpcp.h" @@ -81,6 +82,11 @@ static int check_magic_byte_presence(int byte_cnt, uint8_t *byte_val, static uint16_t checksum(uint16_t *addr, int32_t count); static int pcp_cleanup(int channel_fd); +static int vldc_read(int fd, uint8_t *bufp, int size); +static int vldc_write(int fd, uint8_t *bufp, int size); +static int pcp_update_read_area(int byte_cnt); +static int pcp_vldc_frame_error_handle(void); + /* * local channel (glvc) file descriptor set by pcp_send_recv() */ @@ -156,6 +162,19 @@ static struct sigaction glvc_act; /* To restore old SIGALRM signal handler */ static struct sigaction old_act; +/* + * Variables to support vldc based streaming transport + */ +typedef enum { + GLVC_NON_STREAM, + VLDC_STREAMING +} xport_t; + +static int xport_type = GLVC_NON_STREAM; +#define CHANNEL_DEV "channel-devices" + +#define VLDC_MTU_SIZE (2048) + static void glvc_timeout_handler(void) { @@ -178,6 +197,7 @@ pcp_init(char *channel_name) if (channel_name == NULL) return (PCPL_INVALID_ARGS); + /* * Open virtual channel name. 
*/ @@ -186,12 +206,33 @@ pcp_init(char *channel_name) } /* - * Get the Channel MTU size + * Check if the channel-name points to a vldc node + * or a glvc node */ + if (strstr(channel_name, CHANNEL_DEV) != NULL) { + vldc_opt_op_t op; + + xport_type = VLDC_STREAMING; + mtu_size = VLDC_MTU_SIZE; + + op.op_sel = VLDC_OP_SET; + op.opt_sel = VLDC_OPT_MODE; + op.opt_val = LDC_MODE_STREAM; + if (ioctl(channel_fd, VLDC_IOCTL_OPT_OP, &op) != 0) { + (void) close(channel_fd); + return (PCPL_GLVC_ERROR); + } + } else { + xport_type = GLVC_NON_STREAM; + /* + * Get the Channel MTU size + */ - if (pcp_get_prop(channel_fd, GLVC_XPORT_OPT_MTU_SZ, &mtu_size) != 0) { - (void) close(channel_fd); - return (PCPL_GLVC_ERROR); + if (pcp_get_prop(channel_fd, GLVC_XPORT_OPT_MTU_SZ, + &mtu_size) != 0) { + (void) close(channel_fd); + return (PCPL_GLVC_ERROR); + } } /* @@ -233,7 +274,8 @@ pcp_close(int channel_fd) { if (channel_fd >= 0) { - (void) pcp_cleanup(channel_fd); + if (xport_type == GLVC_NON_STREAM) + (void) pcp_cleanup(channel_fd); (void) close(channel_fd); } else { return (-1); @@ -631,7 +673,6 @@ pcp_peek(uint8_t *buf, int bytes_cnt) (void) memcpy(buf, peek_area, m); return (m); - } /* @@ -648,13 +689,19 @@ pcp_write(uint8_t *buf, int byte_cnt) return (PCPL_INVALID_ARGS); } - (void) alarm(glvc_timeout); + if (xport_type == GLVC_NON_STREAM) { + (void) alarm(glvc_timeout); - if ((ret = write(chnl_fd, buf, byte_cnt)) < 0) { + if ((ret = write(chnl_fd, buf, byte_cnt)) < 0) { + (void) alarm(0); + return (ret); + } (void) alarm(0); - return (ret); + } else { + if ((ret = vldc_write(chnl_fd, buf, byte_cnt)) <= 0) { + return (ret); + } } - (void) alarm(0); return (ret); } @@ -718,17 +765,28 @@ pcp_read(uint8_t *buf, int byte_cnt) * do a peek to see how much data is available and read complete data. 
*/ - if ((m = pcp_peek(read_tail, mtu_size)) < 0) { - return (m); - } + if (xport_type == GLVC_NON_STREAM) { + if ((m = pcp_peek(read_tail, mtu_size)) < 0) { + return (m); + } + + (void) alarm(glvc_timeout); + if ((ret = read(chnl_fd, read_tail, m)) < 0) { + (void) alarm(0); + return (ret); + } - (void) alarm(glvc_timeout); - if ((ret = read(chnl_fd, read_tail, m)) < 0) { (void) alarm(0); - return (ret); + } else { + /* + * Read the extra number of bytes + */ + m = byte_cnt - (read_tail - read_head); + if ((ret = vldc_read(chnl_fd, + read_tail, m)) <= 0) { + return (ret); + } } - - (void) alarm(0); read_tail += ret; /* @@ -743,6 +801,69 @@ pcp_read(uint8_t *buf, int byte_cnt) } /* + * Issue read from the driver until byet_cnt number + * of bytes are present in read buffer. Do not + * move the read head. + */ +static int +pcp_update_read_area(int byte_cnt) +{ + int ret; + int n, i; + + if (byte_cnt < 0 || byte_cnt > mtu_size) { + return (PCPL_INVALID_ARGS); + } + + /* + * initialization of local read buffer + * from which the stream read requests are serviced. + */ + if (read_area == NULL) { + read_area = (uint8_t *)umem_zalloc(READ_AREA_SIZE, + UMEM_DEFAULT); + if (read_area == NULL) { + return (PCPL_MALLOC_FAIL); + } + read_head = read_area; + read_tail = read_area; + } + + /* + * if we already have sufficient data in the buffer, + * just return + */ + if (byte_cnt <= (read_tail - read_head)) { + return (byte_cnt); + } + + /* + * if the request is not satisfied from the buffered data, then move the + * remaining data to front of the buffer and read new data. 
+ */ + for (i = 0; i < (read_tail - read_head); ++i) { + read_area[i] = read_head[i]; + } + read_head = read_area; + read_tail = read_head + i; + + n = byte_cnt - (read_tail - read_head); + + if ((ret = vldc_read(chnl_fd, + read_tail, n)) <= 0) { + return (ret); + } + read_tail += ret; + + /* + * Return the number of bytes we could read + */ + n = MIN(byte_cnt, (read_tail - read_head)); + + return (n); +} + +/* * This function is slight different from pcp_peek. The peek requests are first * serviced from local read buffer, if data is available. If the peek request * is not serviceble from local read buffer, then the data is peeked from @@ -798,7 +919,6 @@ pcp_peek_read(uint8_t *buf, int byte_cnt) if ((m = pcp_peek(peek_read_tail, mtu_size)) < 0) { return (m); } - peek_read_tail += m; /* @@ -874,7 +994,12 @@ pcp_recv_resp_msg_hdr(pcp_resp_msg_hdr_t *resp_hdr) * (magic seq) or if an error happens while reading data from * channel. */ - if ((ret = pcp_frame_error_handle()) != 0) + if (xport_type == GLVC_NON_STREAM) + ret = pcp_frame_error_handle(); + else + ret = pcp_vldc_frame_error_handle(); + + if (ret != 0) return (PCPL_FRAME_ERROR); /* read magic number first */ @@ -1059,6 +1184,55 @@ pcp_frame_error_handle(void) } /* + * This function handles channel framing errors. It waits until proper + * frame with starting sequence as magic numder (0xAFBCAFA0) + * is arrived. It removes unexpected data (before the magic number sequence) + * on the channel. It returns when proper magic number sequence is seen + * or when any failure happens while reading/peeking the channel. 
+ */ +static int +pcp_vldc_frame_error_handle(void) +{ + uint8_t magic_num_buf[4]; + uint32_t net_magic_num; /* magic byte in network byte order */ + uint32_t host_magic_num = PCP_MAGIC_NUM; + int found_magic = 0; + + net_magic_num = htonl(host_magic_num); + (void) memcpy(magic_num_buf, (uint8_t *)&net_magic_num, 4); + + /* + * For vldc, we need to read whatever data is available and + * advance the read pointer one byte at a time until we get + * the magic word. When this function is invoked, we do not + * have any byte in the read buffer. + */ + + /* + * Keep reading until we find the matching magic number + */ + while (!found_magic) { + while ((read_tail - read_head) < sizeof (host_magic_num)) { + if (pcp_update_read_area(sizeof (host_magic_num)) < 0) + return (-1); + } + + /* + * We should have at least 4 bytes in read buffer. Check + * if the magic number can be matched + */ + if (memcmp(read_head, magic_num_buf, + sizeof (host_magic_num))) { + read_head += 1; + } else { + found_magic = 1; + } + } + + return (0); +} + +/* * checks whether certain byte sequence is present in the data stream. 
*/ static int @@ -1188,3 +1362,81 @@ pcp_cleanup(int channel_fd) umem_free(buf, mtu_size); return (ret); } + +static int +vldc_write(int fd, uint8_t *bufp, int size) +{ + int res; + int left = size; + pollfd_t pollfd; + + pollfd.events = POLLOUT; + pollfd.revents = 0; + pollfd.fd = fd; + + /* + * Poll for the vldc channel to be ready + */ + if (poll(&pollfd, 1, glvc_timeout * MILLISEC) <= 0) { + return (-1); + } + + do { + if ((res = write(fd, bufp, left)) <= 0) { + if (errno != EWOULDBLOCK) { + return (res); + } + } else { + bufp += res; + left -= res; + } + } while (left > 0); + + /* + * Return number of bytes actually written + */ + return (size - left); +} + +/* + * Keep reading until we get the specified number of bytes + */ +static int +vldc_read(int fd, uint8_t *bufp, int size) +{ + int res; + int left = size; + + struct pollfd fds[1]; + + fds[0].events = POLLIN | POLLPRI; + fds[0].revents = 0; + fds[0].fd = fd; + + if (poll(fds, 1, glvc_timeout * MILLISEC) <= 0) { + return (-1); + } + + while (left > 0) { + res = read(fd, bufp, left); + /* return on error or short read */ + if ((res == 0) || ((res < 0) && + (errno == EAGAIN))) { + /* poll until the read is unblocked */ + if ((poll(fds, 1, glvc_timeout * MILLISEC)) < 0) + return (-1); + + continue; + } else + if (res < 0) { + /* unrecoverable error */ + + return (-1); + } else { + bufp += res; + left -= res; + } + } + + return (size - left); +} diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile index a8d94762ad..868a18bdfd 100644 --- a/usr/src/pkgdefs/Makefile +++ b/usr/src/pkgdefs/Makefile @@ -69,6 +69,8 @@ sparc_SUBDIRS= \ SUNWkvm.u \ SUNWkvm.v \ SUNWkvmt200.v \ + SUNWldomr.v \ + SUNWldomu.v \ SUNWluxd.u \ SUNWluxl \ SUNWonmtst.u \ diff --git a/usr/src/pkgdefs/SUNWldomr.v/Makefile b/usr/src/pkgdefs/SUNWldomr.v/Makefile new file mode 100644 index 0000000000..fbabdf4e9e --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/Makefile @@ -0,0 +1,38 @@ +# +# CDDL HEADER START +# +# The contents of this 
file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +DATAFILES += depend + +.KEEP_STATE: + +all: $(FILES) + +install: all pkg + +include ../Makefile.targ diff --git a/usr/src/pkgdefs/SUNWldomr.v/i.manifest b/usr/src/pkgdefs/SUNWldomr.v/i.manifest new file mode 100644 index 0000000000..262b987697 --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/i.manifest @@ -0,0 +1,76 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# i.manifest - smf(5) service manifest install class action script +# + +repfile=$PKG_INSTALL_ROOT/etc/svc/repository.db +export repfile + +# +# If the repository does not yet exist, create it from the appropriate seed. If +# for some reason the seeds do not exist, svccfg(1M) will create the repository +# automatically. +# +if [ ! -f $repfile ]; then + if [ -n "$SUNW_PKG_INSTALL_ZONENAME" -a \ + "$SUNW_PKG_INSTALL_ZONENAME" != "global" ]; then + [ -f $PKG_INSTALL_ROOT/lib/svc/seed/nonglobal.db ] && \ + /usr/bin/cp $PKG_INSTALL_ROOT/lib/svc/seed/nonglobal.db \ + $repfile + else + [ -f $PKG_INSTALL_ROOT/lib/svc/seed/global.db ] && \ + /usr/bin/cp $PKG_INSTALL_ROOT/lib/svc/seed/global.db \ + $repfile + fi + /usr/bin/chmod 0600 $repfile + /usr/bin/chown root:sys $repfile +fi + +if [ ! -r $PKG_INSTALL_ROOT/etc/svc/volatile/repository_door ]; then + # + # smf(5) is not presently running for the destination environment. + # Since we presently cannot refresh without a running svc.startd(1M), we + # cannot consistently handle dependent placement. Defer to next boot. + # + while read src dst; do + /usr/bin/cp -p $src $dst + done +else + # + # Local package install. 
+ # + while read src dst; do + /usr/bin/cp -p $src $dst + + [ "$PKG_INSTALL_ROOT" = "" -o "$PKG_INSTALL_ROOT" = "/" ] && \ + SVCCFG_CHECKHASH=1 /usr/sbin/svccfg import $dst + done +fi + +exit 0 diff --git a/usr/src/pkgdefs/SUNWldomr.v/pkginfo.tmpl b/usr/src/pkgdefs/SUNWldomr.v/pkginfo.tmpl new file mode 100644 index 0000000000..c728c3e77e --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/pkginfo.tmpl @@ -0,0 +1,55 @@ +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file describes characteristics of the +# package, such as package abbreviation, full package name, package version, +# and package architecture. +# +PKG="SUNWldomr" +NAME="Solaris Logical Domains (Root)" +ARCH="sparc.sun4v" +VERSION="ONVERS,REV=0.0.0" +SUNW_PRODNAME="SunOS" +SUNW_PRODVERS="RELEASE/VERSION" +SUNW_PKGTYPE="root" +MAXINST="1000" +CATEGORY="system" +DESC="Solaris Logical Domains Configuration Files" +VENDOR="Sun Microsystems, Inc." 
+HOTLINE="Please contact your local service provider" +EMAIL="" +CLASSES="none" +BASEDIR=/ +SUNW_PKGVERS="1.0" +SUNW_PKG_ALLZONES="true" +SUNW_PKG_HOLLOW="true" +SUNW_PKG_THISZONE="false" +#VSTOCK="<reserved by Release Engineering for package part #>" +#ISTATES="<developer defined>" +#RSTATES='<developer defined>' +#ULIMIT="<developer defined>" +#ORDER="<developer defined>" +#PSTAMP="<developer defined>" +#INTONLY="<developer defined>" diff --git a/usr/src/pkgdefs/SUNWldomr.v/postinstall b/usr/src/pkgdefs/SUNWldomr.v/postinstall new file mode 100644 index 0000000000..0c81ca9f6d --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/postinstall @@ -0,0 +1,136 @@ +#!/sbin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +# Function: check_add_drv() +# +# This function will check if the module has an entry in etc/name_to_major +# If not simply calls add_drv with the arguments given. If there is +# such an entry in name_to_major file, it adds entries in driver_aliases +# driver_classes and minor_perm if necessary. +# The syntax of this function is the same as add_drv. 
+ +check_add_drv() +{ + if [ "$BASEDIR" = "" ] + then + BASEDIR=/ + fi + alias="" + class="" + ADD_ALIAS=0 + ADD_CLASS=0 + ADD_MINOR=0 + OPTIND=1 + IS_NET_DRIVER=0 + + cmd="add_drv" + + NO_CMD= + while getopts i:b:m:c:N opt + do + case $opt in + N ) NO_CMD=1;; + i ) ADD_ALIAS=1 + alias=$OPTARG + cmd=$cmd" -i '$alias'" + ;; + m ) ADD_MINOR=1 + minor=$OPTARG + cmd=$cmd" -m '$minor'" + ;; + c) ADD_CLASS=1 + class=$OPTARG + cmd=$cmd" -c $class" + ;; + b) BASEDIR=$OPTARG + cmd=$cmd" -b $BASEDIR" + ;; + \?) echo "check_add_drv can not handle this option" + return + ;; + esac + done + shift `/usr/bin/expr $OPTIND - 1` + + drvname=$1 + + cmd=$cmd" "$drvname + + drvname=`echo $drvname | /usr/bin/sed 's;.*/;;g'` + + /usr/bin/grep "^$drvname[ ]" $BASEDIR/etc/name_to_major > /dev/null 2>&1 + + if [ "$NO_CMD" = "" -a $? -ne 0 ] + then + eval $cmd + else + # entry already in name_to_major, add alias, class, minorperm + # if necessary + if [ $ADD_ALIAS = 1 ] + then + for i in $alias + do + /usr/bin/egrep "^$drvname[ ]+$i" $BASEDIR/etc/driver_aliases>/dev/null 2>&1 + if [ $? -ne 0 ] + then + echo "$drvname $i" >> $BASEDIR/etc/driver_aliases + fi + done + fi + + if [ $ADD_CLASS = 1 ] + then + /usr/bin/egrep "^$drvname[ ]+$class( | |$)" $BASEDIR/etc/driver_classes > /dev/null 2>&1 + if [ $? -ne 0 ] + then + echo "$drvname\t$class" >> $BASEDIR/etc/driver_classes + fi + fi + + if [ $ADD_MINOR = 1 ] + then + /usr/bin/grep "^$drvname:" $BASEDIR/etc/minor_perm > /dev/null 2>&1 + if [ $? 
-ne 0 ] + then + minorentry="$drvname:$minor" + echo $minorentry >> $BASEDIR/etc/minor_perm + fi + fi + + fi + + +} + +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-channel-devices"' cnex +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-console-concentrator"' vcc +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-disk"' vdc +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-disk-server"' vds +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-channel"' vldc +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-network"' vnet +check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-network-switch"' vsw diff --git a/usr/src/pkgdefs/SUNWldomr.v/preremove b/usr/src/pkgdefs/SUNWldomr.v/preremove new file mode 100644 index 0000000000..b350bce8f1 --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/preremove @@ -0,0 +1,58 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +PATH=/usr/bin:/usr/sbin:${PATH} +export PATH + +EXIT=0 + +not_installed() +{ + driver=$1 + + grep "^${driver} " ${BASEDIR}/etc/name_to_major > /dev/null 2>&1 + + if [ "$?" 
-eq 0 ]; then + return 1 + else + return 0 + fi +} + +# +# Unload and remove drivers +# +not_installed cnex || rem_drv -b "${BASEDIR}" cnex || EXIT=1 +not_installed vcc || rem_drv -b "${BASEDIR}" vcc || EXIT=1 +not_installed vdc || rem_drv -b "${BASEDIR}" vdc || EXIT=1 +not_installed vds || rem_drv -b "${BASEDIR}" vds || EXIT=1 +not_installed vldc || rem_drv -b "${BASEDIR}" vldc || EXIT=1 +not_installed vnet || rem_drv -b "${BASEDIR}" vnet || EXIT=1 +not_installed vsw || rem_drv -b "${BASEDIR}" vsw || EXIT=1 + +exit ${EXIT} diff --git a/usr/src/pkgdefs/SUNWldomr.v/prototype_com b/usr/src/pkgdefs/SUNWldomr.v/prototype_com new file mode 100644 index 0000000000..ef552c2cd2 --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/prototype_com @@ -0,0 +1,52 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file contains a list of package contents. +# The 'pkgmk' command uses this file to identify the contents of a package +# and their location on the development machine when building the package. 
+# Can be created via a text editor or through use of the 'pkgproto' command. +# + +#!search <pathname pathname ...> # where to find pkg objects +#!include <filename> # include another 'prototype' file +#!default <mode> <owner> <group> # default used if not specified on entry +#!<param>=<value> # puts parameter in pkg environment + +# packaging files +i pkginfo +i copyright +i depend +i postinstall +i preremove +i i.manifest +i r.manifest + +# +# source locations relative to the prototype file +# +# SUNWldomr.v +# diff --git a/usr/src/pkgdefs/SUNWldomr.v/prototype_sparc b/usr/src/pkgdefs/SUNWldomr.v/prototype_sparc new file mode 100644 index 0000000000..2ff55f802f --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/prototype_sparc @@ -0,0 +1,79 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file contains a list of package contents. +# The 'pkgmk' command uses this file to identify the contents of a package +# and their location on the development machine when building the package. 
+# Can be created via a text editor or through use of the 'pkgproto' command. +# + +#!search <pathname pathname ...> # where to find pkg objects +#!include <filename> # include another 'prototype' file +#!default <mode> <owner> <group> # default used if not specified on entry +#!<param>=<value> # puts parameter in pkg environment + +# +# Include ISA independent files (prototype_com) +# +!include prototype_com + +# +# List files which are SPARC specific here +# +# source locations relative to the prototype file +# +# SUNWldomr.v +# +d none lib 755 root bin +d none lib/svc 0755 root bin +d none lib/svc/method 0755 root bin +f none lib/svc/method/svc-vntsd 0555 root bin +d none platform 755 root sys +d none platform/sun4v 755 root sys +d none platform/sun4v/kernel 755 root sys +d none platform/sun4v/kernel/drv 755 root sys +d none platform/sun4v/kernel/drv/sparcv9 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/cnex 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vcc 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vdc 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vds 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vldc 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vnet 755 root sys +f none platform/sun4v/kernel/drv/sparcv9/vsw 755 root sys +d none platform/sun4v/kernel/misc 755 root sys +d none platform/sun4v/kernel/misc/sparcv9 755 root sys +f none platform/sun4v/kernel/misc/sparcv9/dr_cpu 755 root sys +f none platform/sun4v/kernel/misc/sparcv9/ds 755 root sys +f none platform/sun4v/kernel/misc/sparcv9/fault_iso 755 root sys +f none platform/sun4v/kernel/misc/sparcv9/ldc 755 root sys +f none platform/sun4v/kernel/misc/sparcv9/platsvc 755 root sys +d none var 755 root sys +d none var/svc 755 root sys +d none var/svc/manifest 755 root sys +d none var/svc/manifest/platform 755 root sys +d none var/svc/manifest/platform/sun4v 755 root sys +f manifest var/svc/manifest/platform/sun4v/vntsd.xml 0444 root sys diff --git 
a/usr/src/pkgdefs/SUNWldomr.v/r.manifest b/usr/src/pkgdefs/SUNWldomr.v/r.manifest new file mode 100644 index 0000000000..e4690e7e5f --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomr.v/r.manifest @@ -0,0 +1,83 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# r.manifest - smf(5) manifest remove class action script +# + +if [ "$PKG_INSTALL_ROOT" != "" -a "$PKG_INSTALL_ROOT" != "/" ]; then + # + # We can't safely disable the service in this case. + # + smf_alive=no +else + # + # We can verify if the service is disabled prior to + # removal. + # + if [ -r /etc/svc/volatile/repository_door ]; then + smf_alive=yes + fi +fi + +MFSTSCAN=/lib/svc/bin/mfstscan +SVCCFG=/usr/sbin/svccfg +SVCPROP=/usr/bin/svcprop + +while read mfst; do + if [ "$smf_alive" = "yes" ]; then + ENTITIES=`$SVCCFG inventory $mfst` + + for fmri in $ENTITIES; do + # + # Determine whether any of our instances are + # enabled. 
+ # + en_p=`$SVCPROP -C -p general/enabled $fmri 2>/dev/null` + en_o=`$SVCPROP -C -p general_ovr/enabled $fmri 2>/dev/null` + + if [ "$en_p" = "true" -o "$en_o" = "true" ]; then + echo "$fmri remains enabled; aborting" + exit 1 + fi + + $SVCCFG delete $fmri + done + + # + # Delete the manifest hash value. + # + pg_name=`$MFSTSCAN -t $mfst` + if $SVCPROP -q -p $pg_name smf/manifest; then + $SVCCFG -s smf/manifest delpg $pg_name + fi + fi + + /usr/bin/rm $mfst +done + +exit 0 diff --git a/usr/src/pkgdefs/SUNWldomu.v/Makefile b/usr/src/pkgdefs/SUNWldomu.v/Makefile new file mode 100644 index 0000000000..41d0832757 --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomu.v/Makefile @@ -0,0 +1,38 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +DATAFILES += i.manifest r.manifest + +.KEEP_STATE: + +all: $(FILES) + +install: all pkg + +include ../Makefile.targ diff --git a/usr/src/pkgdefs/SUNWldomu.v/depend b/usr/src/pkgdefs/SUNWldomu.v/depend new file mode 100644 index 0000000000..e7ae45116d --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomu.v/depend @@ -0,0 +1,56 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +# +# This package information file defines software dependencies associated +# with the pkg. You can define three types of pkg dependencies with this file: +# P indicates a prerequisite for installation +# I indicates an incompatible package +# R indicates a reverse dependency +# <pkg.abbr> see pkginfo(4), PKG parameter +# <name> see pkginfo(4), NAME parameter +# <version> see pkginfo(4), VERSION parameter +# <arch> see pkginfo(4), ARCH parameter +# <type> <pkg.abbr> <name> +# (<arch>)<version> +# (<arch>)<version> +# ... +# <type> <pkg.abbr> <name> +# ... 
+# + +P SUNWcar Core Architecture, (Root) +P SUNWcakr Core Solaris Kernel Architecture (Root) +P SUNWkvm Core Architecture, (Kvm) +P SUNWcsr Core Solaris, (Root) +P SUNWckr Core Solaris Kernel (Root) +P SUNWcnetr Core Solaris Network Infrastructure (Root) +P SUNWcsu Core Solaris, (Usr) +P SUNWcsd Core Solaris Devices +P SUNWcsl Core Solaris Libraries +P SUNWldomr Solaris Logical Domains (Root) diff --git a/usr/src/pkgdefs/SUNWldomu.v/pkginfo.tmpl b/usr/src/pkgdefs/SUNWldomu.v/pkginfo.tmpl new file mode 100644 index 0000000000..20f2cac7dd --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomu.v/pkginfo.tmpl @@ -0,0 +1,55 @@ +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file describes characteristics of the +# package, such as package abbreviation, full package name, package version, +# and package architecture. 
+# +PKG="SUNWldomu" +NAME="Solaris Logical Domains (Usr)" +ARCH="sparc.sun4v" +VERSION="ONVERS,REV=0.0.0" +SUNW_PRODNAME="SunOS" +SUNW_PRODVERS="RELEASE/VERSION" +SUNW_PKGTYPE="usr" +MAXINST="1000" +CATEGORY="system" +DESC="Solaris Logical Domains Configuration and Administration" +VENDOR="Sun Microsystems, Inc." +HOTLINE="Please contact your local service provider" +EMAIL="" +CLASSES="none" +BASEDIR=/ +SUNW_PKGVERS="1.0" +SUNW_PKG_ALLZONES="true" +SUNW_PKG_HOLLOW="true" +SUNW_PKG_THISZONE="false" +#VSTOCK="<reserved by Release Engineering for package part #>" +#ISTATES="<developer defined>" +#RSTATES='<developer defined>' +#ULIMIT="<developer defined>" +#ORDER="<developer defined>" +#PSTAMP="<developer defined>" +#INTONLY="<developer defined>" diff --git a/usr/src/pkgdefs/SUNWldomu.v/prototype_com b/usr/src/pkgdefs/SUNWldomu.v/prototype_com new file mode 100644 index 0000000000..a493d36d3f --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomu.v/prototype_com @@ -0,0 +1,48 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file contains a list of package contents. +# The 'pkgmk' command uses this file to identify the contents of a package +# and their location on the development machine when building the package. +# Can be created via a text editor or through use of the 'pkgproto' command. +# + +#!search <pathname pathname ...> # where to find pkg objects +#!include <filename> # include another 'prototype' file +#!default <mode> <owner> <group> # default used if not specified on entry +#!<param>=<value> # puts parameter in pkg environment + +# packaging files +i pkginfo +i copyright +i depend + +# +# source locations relative to the prototype file +# +# SUNWldomu.v +# diff --git a/usr/src/pkgdefs/SUNWldomu.v/prototype_sparc b/usr/src/pkgdefs/SUNWldomu.v/prototype_sparc new file mode 100644 index 0000000000..860533427d --- /dev/null +++ b/usr/src/pkgdefs/SUNWldomu.v/prototype_sparc @@ -0,0 +1,54 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# +# This required package information file contains a list of package contents. +# The 'pkgmk' command uses this file to identify the contents of a package +# and their location on the development machine when building the package. +# Can be created via a text editor or through use of the 'pkgproto' command. +# + +#!search <pathname pathname ...> # where to find pkg objects +#!include <filename> # include another 'prototype' file +#!default <mode> <owner> <group> # default used if not specified on entry +#!<param>=<value> # puts parameter in pkg environment + +# +# Include ISA independent files (prototype_com) +# +!include prototype_com + +# +# List files which are SPARC specific here +# +# source locations relative to the prototype file +# +# SUNWldomu.v +# +d none usr 755 root sys +d none usr/lib 755 root bin +d none usr/lib/ldoms 755 root bin +f none usr/lib/ldoms/vntsd 555 root bin diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc index 0a24c3f4b3..a5b29115a6 100644 --- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc @@ -90,3 +90,4 @@ d none usr/platform/sun4v/lib/mdb 755 root sys d none usr/platform/sun4v/lib/mdb/kvm 755 root sys d none usr/platform/sun4v/lib/mdb/kvm/sparcv9 755 root sys f none usr/platform/sun4v/lib/mdb/kvm/sparcv9/unix.so 555 root sys +f none usr/platform/sun4v/lib/mdb/kvm/sparcv9/vdsk.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc index d08979e697..154c47733f 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc @@ -75,3 +75,4 @@ d none platform/sun4v/kernel 755 root sys d none platform/sun4v/kernel/kmdb 755 root sys d none platform/sun4v/kernel/kmdb/sparcv9 755 root sys f none platform/sun4v/kernel/kmdb/sparcv9/unix 555 root sys +f none platform/sun4v/kernel/kmdb/sparcv9/vdsk 555 root sys diff --git 
a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index 532972b732..47b57dcb09 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -642,6 +642,7 @@ usr/include/librestart.h i386 usr/include/librestart_priv.h i386 usr/include/libcontract_priv.h i386 var/svc/manifest/platform/sun4u i386 +var/svc/manifest/platform/sun4v i386 var/svc/profile/platform_SUNW,SPARC-Enterprise.xml i386 var/svc/profile/platform_SUNW,Sun-Fire.xml i386 var/svc/profile/platform_SUNW,Sun-Fire-880.xml i386 diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index a1e6559bea..5056ba6dfb 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -325,6 +325,7 @@ superfluous_local_zone_files=" lib/svc/method/svc-scheduler lib/svc/method/svc-sckmd lib/svc/method/svc-syseventd + lib/svc/method/svc-vntsd lib/svc/method/svc-zones platform/*/kernel platform/SUNW,Sun-Fire-15000/lib/cvcd @@ -357,6 +358,7 @@ superfluous_local_zone_files=" usr/include/netinet/ipl.h usr/include/sys/dcam usr/lib/devfsadm/linkmod/SUNW_dcam1394_link.so + usr/lib/ldoms usr/platform/SUNW,SPARC-Enterprise/lib/dscp.ppp.options usr/platform/SUNW,SPARC-Enterprise/lib/libdscp.so usr/platform/SUNW,SPARC-Enterprise/lib/libdscp.so.1 @@ -376,6 +378,7 @@ superfluous_local_zone_files=" var/svc/manifest/platform/sun4u/efdaemon.xml var/svc/manifest/platform/sun4u/sckmd.xml var/svc/manifest/platform/sun4u/sf880drd.xml + var/svc/manifest/platform/sun4v/vntsd.xml var/svc/manifest/system/cvc.xml var/svc/manifest/system/dumpadm.xml var/svc/manifest/system/fmd.xml diff --git a/usr/src/uts/common/sys/mdesc.h b/usr/src/uts/common/sys/mdesc.h index e05374f60e..4bd335c38f 100644 --- a/usr/src/uts/common/sys/mdesc.h +++ b/usr/src/uts/common/sys/mdesc.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the 
"License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,22 +87,39 @@ extern "C" { #ifndef _ASM /* { */ -typedef uint64_t mde_cookie_t; +/* + * Opaque handles for use in external interfaces + */ + +typedef void *md_t; + +typedef uint64_t mde_cookie_t; #define MDE_INVAL_ELEM_COOKIE ((mde_cookie_t)-1) typedef uint32_t mde_str_cookie_t; #define MDE_INVAL_STR_COOKIE ((mde_str_cookie_t)-1) +typedef uint64_t md_diff_cookie_t; +#define MD_INVAL_DIFF_COOKIE ((md_diff_cookie_t)-1) - /* Opaque structure for handling in functions */ -typedef void * md_t; +#define MDESC_INVAL_GEN (0) +/* + * External structure for MD diff interface + */ +typedef struct { + uint8_t type; /* property type */ + char *namep; /* property name */ +} md_prop_match_t; +/* + * External Interface + */ -extern md_t *md_init(void *); -extern md_t *md_init_intern(uint64_t *, void*(*)(size_t), - void (*)(void*, size_t)); +extern md_t *md_init_intern(uint64_t *, + void *(*allocp)(size_t), + void (*freep)(void *, size_t)); extern int md_fini(md_t *); @@ -112,6 +129,10 @@ extern mde_str_cookie_t md_find_name(md_t *, char *namep); extern mde_cookie_t md_root_node(md_t *); +extern uint64_t md_get_gen(md_t *); + +extern size_t md_get_bin_size(md_t *); + extern int md_scan_dag(md_t *, mde_cookie_t, mde_str_cookie_t, @@ -134,6 +155,24 @@ extern int md_get_prop_data(md_t *, uint8_t **, int *); +extern md_diff_cookie_t md_diff_init(md_t *, + mde_cookie_t, + md_t *, + mde_cookie_t, + char *, + md_prop_match_t *); + +extern int 
md_diff_added(md_diff_cookie_t, + mde_cookie_t **); + +extern int md_diff_removed(md_diff_cookie_t, + mde_cookie_t **); + +extern int md_diff_matched(md_diff_cookie_t, + mde_cookie_t **, + mde_cookie_t **); + +extern int md_diff_fini(md_diff_cookie_t); #endif /* } _ASM */ @@ -150,7 +189,6 @@ extern int md_get_prop_data(md_t *, #define MDESCIOCSSZ (MDESCIOC | 2) /* Set new quote buffer size */ #define MDESCIOCDISCARD (MDESCIOC | 3) /* Discard quotes and reset */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/mdesc_impl.h b/usr/src/uts/common/sys/mdesc_impl.h index e0f2ced499..391a646b45 100644 --- a/usr/src/uts/common/sys/mdesc_impl.h +++ b/usr/src/uts/common/sys/mdesc_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -115,7 +115,7 @@ struct MACHINE_DESCRIPTION { caddr_t caddr; void *(*allocp)(size_t); - void (*freep)(void*, size_t); + void (*freep)(void *, size_t); md_header_t *headerp; md_element_t *mdep; @@ -132,6 +132,7 @@ struct MACHINE_DESCRIPTION { mde_cookie_t root_node; int size; + uint64_t gen; uint64_t md_magic; }; @@ -152,7 +153,6 @@ extern mde_cookie_t md_find_node_prop(md_impl_t *, mde_cookie_t, mde_str_cookie_t, int); - #endif /* _ASM */ #ifdef __cplusplus diff --git a/usr/src/uts/sfmmu/ml/sfmmu_kdi.s b/usr/src/uts/sfmmu/ml/sfmmu_kdi.s index e968c1d90f..53b5f9f938 100644 --- a/usr/src/uts/sfmmu/ml/sfmmu_kdi.s +++ b/usr/src/uts/sfmmu/ml/sfmmu_kdi.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -303,8 +303,11 @@ kdi_trap_vatotte(void) ldx [%g2], %g2 /* VA %g1, sfmmup %g2 */ mov 1, %g3 /* VA %g1, sfmmup %g2, idx %g3 */ -1: mov HBLK_RANGE_SHIFT, %g4 - mulx %g3, 3, %g4 + mov HBLK_RANGE_SHIFT, %g4 + ba 3f + nop + +1: mulx %g3, 3, %g4 /* 3: see TTE_BSZS_SHIFT */ add %g4, MMU_PAGESHIFT, %g4 3: KDI_HME_HASH_FUNCTION /* %g1, %g2, %g4 => hash in %g4 */ @@ -321,11 +324,9 @@ kdi_trap_vatotte(void) 4: ba,a 6f 5: add %g3, 1, %g3 -#ifdef sun4v - cmp %g3, MAX_HASHCNT -#else - cmp %g3, DEFAULT_MAX_HASHCNT /* no 32/256M kernel pages */ -#endif + set mmu_hashcnt, %g4 + lduw [%g4], %g4 + cmp %g3, %g4 ble 1b nop diff --git a/usr/src/uts/sun4/io/trapstat.c b/usr/src/uts/sun4/io/trapstat.c index bdaac735fe..9aa25eca4f 100644 --- a/usr/src/uts/sun4/io/trapstat.c +++ b/usr/src/uts/sun4/io/trapstat.c @@ -1712,8 +1712,25 @@ trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu) break; case CPU_UNCONFIG: - if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) + if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) { tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED; +#ifdef sun4v + /* + * A power-off, causes the cpu mondo queues to be + * unconfigured on sun4v. Since we can't teardown + * trapstat's mappings on the cpu that is going away, + * we simply mark it as not allocated. This will + * prevent a teardown on a cpu with the same cpu id + * that might have been added while trapstat is running. 
+ */ + if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) { + tcpu->tcpu_pfn = NULL; + tcpu->tcpu_instr = NULL; + tcpu->tcpu_data = NULL; + tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED; + } +#endif + } break; default: diff --git a/usr/src/uts/sun4/os/ddi_impl.c b/usr/src/uts/sun4/os/ddi_impl.c index 517f109fed..343d3391b5 100644 --- a/usr/src/uts/sun4/os/ddi_impl.c +++ b/usr/src/uts/sun4/os/ddi_impl.c @@ -53,6 +53,7 @@ #include <sys/fs/dv_node.h> #include <sys/fs/snode.h> #include <sys/ddi_isa.h> +#include <sys/modhash.h> dev_info_t *get_intr_parent(dev_info_t *, dev_info_t *, ddi_intr_handle_impl_t *); @@ -1968,3 +1969,831 @@ peekpoke_mem(ddi_ctl_enum_t cmd, peekpoke_ctlops_t *in_args) return (err); } + +/* + * Platform independent DR routines + */ + +static int +ndi2errno(int n) +{ + int err = 0; + + switch (n) { + case NDI_NOMEM: + err = ENOMEM; + break; + case NDI_BUSY: + err = EBUSY; + break; + case NDI_FAULT: + err = EFAULT; + break; + case NDI_FAILURE: + err = EIO; + break; + case NDI_SUCCESS: + break; + case NDI_BADHANDLE: + default: + err = EINVAL; + break; + } + return (err); +} + +/* + * Prom tree node list + */ +struct ptnode { + pnode_t nodeid; + struct ptnode *next; +}; + +/* + * Prom tree walk arg + */ +struct pta { + dev_info_t *pdip; + devi_branch_t *bp; + uint_t flags; + dev_info_t *fdip; + struct ptnode *head; +}; + +static void +visit_node(pnode_t nodeid, struct pta *ap) +{ + struct ptnode **nextp; + int (*select)(pnode_t, void *, uint_t); + + ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE); + + select = ap->bp->create.prom_branch_select; + + ASSERT(select); + + if (select(nodeid, ap->bp->arg, 0) == DDI_SUCCESS) { + + for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next) + ; + + *nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP); + + (*nextp)->nodeid = nodeid; + } + + if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD) + return; + + nodeid = prom_childnode(nodeid); + while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { + 
visit_node(nodeid, ap); + nodeid = prom_nextnode(nodeid); + } +} + +/*ARGSUSED*/ +static int +set_dip_offline(dev_info_t *dip, void *arg) +{ + ASSERT(dip); + + mutex_enter(&(DEVI(dip)->devi_lock)); + if (!DEVI_IS_DEVICE_OFFLINE(dip)) + DEVI_SET_DEVICE_OFFLINE(dip); + mutex_exit(&(DEVI(dip)->devi_lock)); + + return (DDI_WALK_CONTINUE); +} + +/*ARGSUSED*/ +static int +create_prom_branch(void *arg, int has_changed) +{ + int circ, c; + int exists, rv; + pnode_t nodeid; + struct ptnode *tnp; + dev_info_t *dip; + struct pta *ap = arg; + devi_branch_t *bp; + + ASSERT(ap); + ASSERT(ap->fdip == NULL); + ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip)); + + bp = ap->bp; + + nodeid = ddi_get_nodeid(ap->pdip); + if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) { + cmn_err(CE_WARN, "create_prom_branch: invalid " + "nodeid: 0x%x", nodeid); + return (EINVAL); + } + + ap->head = NULL; + + nodeid = prom_childnode(nodeid); + while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { + visit_node(nodeid, ap); + nodeid = prom_nextnode(nodeid); + } + + if (ap->head == NULL) + return (ENODEV); + + rv = 0; + while ((tnp = ap->head) != NULL) { + ap->head = tnp->next; + + ndi_devi_enter(ap->pdip, &circ); + + /* + * Check if the branch already exists. 
+ */ + exists = 0; + dip = e_ddi_nodeid_to_dip(tnp->nodeid); + if (dip != NULL) { + exists = 1; + + /* Parent is held busy, so release hold */ + ndi_rele_devi(dip); +#ifdef DEBUG + cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists" + " for nodeid 0x%x", (void *)dip, tnp->nodeid); +#endif + } else { + dip = i_ddi_create_branch(ap->pdip, tnp->nodeid); + } + + kmem_free(tnp, sizeof (struct ptnode)); + + if (dip == NULL) { + ndi_devi_exit(ap->pdip, circ); + rv = EIO; + continue; + } + + ASSERT(ddi_get_parent(dip) == ap->pdip); + + /* + * Hold the branch if it is not already held + */ + if (!exists) + e_ddi_branch_hold(dip); + + ASSERT(e_ddi_branch_held(dip)); + + /* + * Set all dips in the branch offline so that + * only a "configure" operation can attach + * the branch + */ + (void) set_dip_offline(dip, NULL); + + ndi_devi_enter(dip, &c); + ddi_walk_devs(ddi_get_child(dip), set_dip_offline, NULL); + ndi_devi_exit(dip, c); + + ndi_devi_exit(ap->pdip, circ); + + if (ap->flags & DEVI_BRANCH_CONFIGURE) { + int error = e_ddi_branch_configure(dip, &ap->fdip, 0); + if (error && rv == 0) + rv = error; + } + + /* + * Invoke devi_branch_callback() (if it exists) only for + * newly created branches + */ + if (bp->devi_branch_callback && !exists) + bp->devi_branch_callback(dip, bp->arg, 0); + } + + return (rv); +} + +static int +sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp) +{ + int rv, circ, len; + int i, flags; + dev_info_t *dip; + char *nbuf; + static const char *noname = "<none>"; + + ASSERT(pdip); + ASSERT(DEVI_BUSY_OWNED(pdip)); + + flags = 0; + + /* + * Creating the root of a branch ? 
+ */ + if (rdipp) { + *rdipp = NULL; + flags = DEVI_BRANCH_ROOT; + } + + ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip); + rv = bp->create.sid_branch_create(dip, bp->arg, flags); + + nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP); + + if (rv == DDI_WALK_ERROR) { + cmn_err(CE_WARN, "e_ddi_branch_create: Error setting" + " properties on devinfo node %p", (void *)dip); + goto fail; + } + + len = OBP_MAXDRVNAME; + if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len) + != DDI_PROP_SUCCESS) { + cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has" + "no name property", (void *)dip); + goto fail; + } + + ASSERT(i_ddi_node_state(dip) == DS_PROTO); + if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) { + cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)" + " for devinfo node %p", nbuf, (void *)dip); + goto fail; + } + + kmem_free(nbuf, OBP_MAXDRVNAME); + + /* + * Ignore bind failures just like boot does + */ + (void) ndi_devi_bind_driver(dip, 0); + + switch (rv) { + case DDI_WALK_CONTINUE: + case DDI_WALK_PRUNESIB: + ndi_devi_enter(dip, &circ); + + i = DDI_WALK_CONTINUE; + for (; i == DDI_WALK_CONTINUE; ) { + i = sid_node_create(dip, bp, NULL); + } + + ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB); + if (i == DDI_WALK_ERROR) + rv = i; + /* + * If PRUNESIB stop creating siblings + * of dip's child. Subsequent walk behavior + * is determined by rv returned by dip. + */ + + ndi_devi_exit(dip, circ); + break; + case DDI_WALK_TERMINATE: + /* + * Don't create children and ask our parent + * to not create siblings either. + */ + rv = DDI_WALK_PRUNESIB; + break; + case DDI_WALK_PRUNECHILD: + /* + * Don't create children, but ask parent to continue + * with siblings. 
+ */ + rv = DDI_WALK_CONTINUE; + break; + default: + ASSERT(0); + break; + } + + if (rdipp) + *rdipp = dip; + + /* + * Set device offline - only the "configure" op should cause an attach + */ + (void) set_dip_offline(dip, NULL); + + return (rv); +fail: + (void) ndi_devi_free(dip); + kmem_free(nbuf, OBP_MAXDRVNAME); + return (DDI_WALK_ERROR); +} + +static int +create_sid_branch( + dev_info_t *pdip, + devi_branch_t *bp, + dev_info_t **dipp, + uint_t flags) +{ + int rv = 0, state = DDI_WALK_CONTINUE; + dev_info_t *rdip; + + while (state == DDI_WALK_CONTINUE) { + int circ; + + ndi_devi_enter(pdip, &circ); + + state = sid_node_create(pdip, bp, &rdip); + if (rdip == NULL) { + ndi_devi_exit(pdip, circ); + ASSERT(state == DDI_WALK_ERROR); + break; + } + + e_ddi_branch_hold(rdip); + + ndi_devi_exit(pdip, circ); + + if (flags & DEVI_BRANCH_CONFIGURE) { + int error = e_ddi_branch_configure(rdip, dipp, 0); + if (error && rv == 0) + rv = error; + } + + /* + * devi_branch_callback() is optional + */ + if (bp->devi_branch_callback) + bp->devi_branch_callback(rdip, bp->arg, 0); + } + + ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB); + + return (state == DDI_WALK_ERROR ? EIO : rv); +} + +int +e_ddi_branch_create( + dev_info_t *pdip, + devi_branch_t *bp, + dev_info_t **dipp, + uint_t flags) +{ + int prom_devi, sid_devi, error; + + if (pdip == NULL || bp == NULL || bp->type == 0) + return (EINVAL); + + prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0; + sid_devi = (bp->type == DEVI_BRANCH_SID) ? 
1 : 0; + + if (prom_devi && bp->create.prom_branch_select == NULL) + return (EINVAL); + else if (sid_devi && bp->create.sid_branch_create == NULL) + return (EINVAL); + else if (!prom_devi && !sid_devi) + return (EINVAL); + + if (flags & DEVI_BRANCH_EVENT) + return (EINVAL); + + if (prom_devi) { + struct pta pta = {0}; + + pta.pdip = pdip; + pta.bp = bp; + pta.flags = flags; + + error = prom_tree_access(create_prom_branch, &pta, NULL); + + if (dipp) + *dipp = pta.fdip; + else if (pta.fdip) + ndi_rele_devi(pta.fdip); + } else { + error = create_sid_branch(pdip, bp, dipp, flags); + } + + return (error); +} + +int +e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags) +{ + int circ, rv; + char *devnm; + dev_info_t *pdip; + + if (dipp) + *dipp = NULL; + + if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT)) + return (EINVAL); + + pdip = ddi_get_parent(rdip); + + ndi_devi_enter(pdip, &circ); + + if (!e_ddi_branch_held(rdip)) { + ndi_devi_exit(pdip, circ); + cmn_err(CE_WARN, "e_ddi_branch_configure: " + "dip(%p) not held", (void *)rdip); + return (EINVAL); + } + + if (i_ddi_node_state(rdip) < DS_INITIALIZED) { + /* + * First attempt to bind a driver. 
If we fail, return + * success (On some platforms, dips for some device + * types (CPUs) may not have a driver) + */ + if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) { + ndi_devi_exit(pdip, circ); + return (0); + } + + if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) { + rv = NDI_FAILURE; + goto out; + } + } + + ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED); + + devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); + + (void) ddi_deviname(rdip, devnm); + + if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip, + NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) { + /* release hold from ndi_devi_config_one() */ + ndi_rele_devi(rdip); + } + + kmem_free(devnm, MAXNAMELEN + 1); +out: + if (rv != NDI_SUCCESS && dipp) { + ndi_hold_devi(rdip); + *dipp = rdip; + } + ndi_devi_exit(pdip, circ); + return (ndi2errno(rv)); +} + +void +e_ddi_branch_hold(dev_info_t *rdip) +{ + if (e_ddi_branch_held(rdip)) { + cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held"); + return; + } + + mutex_enter(&DEVI(rdip)->devi_lock); + if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) { + DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD; + DEVI(rdip)->devi_ref++; + } + ASSERT(DEVI(rdip)->devi_ref > 0); + mutex_exit(&DEVI(rdip)->devi_lock); +} + +int +e_ddi_branch_held(dev_info_t *rdip) +{ + int rv = 0; + + mutex_enter(&DEVI(rdip)->devi_lock); + if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) && + DEVI(rdip)->devi_ref > 0) { + rv = 1; + } + mutex_exit(&DEVI(rdip)->devi_lock); + + return (rv); +} +void +e_ddi_branch_rele(dev_info_t *rdip) +{ + mutex_enter(&DEVI(rdip)->devi_lock); + DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD; + DEVI(rdip)->devi_ref--; + mutex_exit(&DEVI(rdip)->devi_lock); +} + +int +e_ddi_branch_unconfigure( + dev_info_t *rdip, + dev_info_t **dipp, + uint_t flags) +{ + int circ, rv; + int destroy; + char *devnm; + uint_t nflags; + dev_info_t *pdip; + + if (dipp) + *dipp = NULL; + + if (rdip == NULL) + return (EINVAL); + + pdip = ddi_get_parent(rdip); + + ASSERT(pdip); + + /* + * Check if 
caller holds pdip busy - can cause deadlocks during + * devfs_clean() + */ + if (DEVI_BUSY_OWNED(pdip)) { + cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent" + " devinfo node(%p) is busy held", (void *)pdip); + return (EINVAL); + } + + destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0; + + devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); + + ndi_devi_enter(pdip, &circ); + (void) ddi_deviname(rdip, devnm); + ndi_devi_exit(pdip, circ); + + /* + * ddi_deviname() returns a component name with / prepended. + */ + rv = devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE); + if (rv) { + kmem_free(devnm, MAXNAMELEN + 1); + return (rv); + } + + ndi_devi_enter(pdip, &circ); + + /* + * Recreate device name as it may have changed state (init/uninit) + * when parent busy lock was dropped for devfs_clean() + */ + (void) ddi_deviname(rdip, devnm); + + if (!e_ddi_branch_held(rdip)) { + kmem_free(devnm, MAXNAMELEN + 1); + ndi_devi_exit(pdip, circ); + cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held", + destroy ? "destroy" : "unconfigure", (void *)rdip); + return (EINVAL); + } + + /* + * Release hold on the branch. This is ok since we are holding the + * parent busy. If rdip is not removed, we must do a hold on the + * branch before returning. 
+ */ + e_ddi_branch_rele(rdip); + + nflags = NDI_DEVI_OFFLINE; + if (destroy || (flags & DEVI_BRANCH_DESTROY)) { + nflags |= NDI_DEVI_REMOVE; + destroy = 1; + } else { + nflags |= NDI_UNCONFIG; /* uninit but don't remove */ + } + + if (flags & DEVI_BRANCH_EVENT) + nflags |= NDI_POST_EVENT; + + if (i_ddi_devi_attached(pdip) && + (i_ddi_node_state(rdip) >= DS_INITIALIZED)) { + rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags); + } else { + rv = e_ddi_devi_unconfig(rdip, dipp, nflags); + if (rv == NDI_SUCCESS) { + ASSERT(!destroy || ddi_get_child(rdip) == NULL); + rv = ndi_devi_offline(rdip, nflags); + } + } + + if (!destroy || rv != NDI_SUCCESS) { + /* The dip still exists, so do a hold */ + e_ddi_branch_hold(rdip); + } +out: + kmem_free(devnm, MAXNAMELEN + 1); + ndi_devi_exit(pdip, circ); + return (ndi2errno(rv)); +} + +int +e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag) +{ + return (e_ddi_branch_unconfigure(rdip, dipp, + flag|DEVI_BRANCH_DESTROY)); +} + +/* + * Number of chains for hash table + */ +#define NUMCHAINS 17 + +/* + * Devinfo busy arg + */ +struct devi_busy { + int dv_total; + int s_total; + mod_hash_t *dv_hash; + mod_hash_t *s_hash; + int (*callback)(dev_info_t *, void *, uint_t); + void *arg; +}; + +static int +visit_dip(dev_info_t *dip, void *arg) +{ + uintptr_t sbusy, dvbusy, ref; + struct devi_busy *bsp = arg; + + ASSERT(bsp->callback); + + /* + * A dip cannot be busy if its reference count is 0 + */ + if ((ref = e_ddi_devi_holdcnt(dip)) == 0) { + return (bsp->callback(dip, bsp->arg, 0)); + } + + if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy)) + dvbusy = 0; + + /* + * To catch device opens currently maintained on specfs common snodes. 
+ */ + if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) + sbusy = 0; + +#ifdef DEBUG + if (ref < sbusy || ref < dvbusy) { + cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu " + "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref); + } +#endif + + dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy; + + return (bsp->callback(dip, bsp->arg, dvbusy)); +} + +static int +visit_snode(struct snode *sp, void *arg) +{ + uintptr_t sbusy; + dev_info_t *dip; + int count; + struct devi_busy *bsp = arg; + + ASSERT(sp); + + /* + * The stable lock is held. This prevents + * the snode and its associated dip from + * going away. + */ + dip = NULL; + count = spec_devi_open_count(sp, &dip); + + if (count <= 0) + return (DDI_WALK_CONTINUE); + + ASSERT(dip); + + if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) + sbusy = count; + else + sbusy += count; + + if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) { + cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, " + "sbusy = %lu", "e_ddi_branch_referenced", + (void *)dip, sbusy); + } + + bsp->s_total += count; + + return (DDI_WALK_CONTINUE); +} + +static void +visit_dvnode(struct dv_node *dv, void *arg) +{ + uintptr_t dvbusy; + uint_t count; + struct vnode *vp; + struct devi_busy *bsp = arg; + + ASSERT(dv && dv->dv_devi); + + vp = DVTOV(dv); + + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + + if (!count) + return; + + if (mod_hash_remove(bsp->dv_hash, dv->dv_devi, + (mod_hash_val_t *)&dvbusy)) + dvbusy = count; + else + dvbusy += count; + + if (mod_hash_insert(bsp->dv_hash, dv->dv_devi, + (mod_hash_val_t)dvbusy)) { + cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, " + "dvbusy=%lu", "e_ddi_branch_referenced", + (void *)dv->dv_devi, dvbusy); + } + + bsp->dv_total += count; +} + +/* + * Returns reference count on success or -1 on failure. 
+ */ +int +e_ddi_branch_referenced( + dev_info_t *rdip, + int (*callback)(dev_info_t *dip, void *arg, uint_t ref), + void *arg) +{ + int circ; + char *path; + dev_info_t *pdip; + struct devi_busy bsa = {0}; + + ASSERT(rdip); + + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + ndi_hold_devi(rdip); + + pdip = ddi_get_parent(rdip); + + ASSERT(pdip); + + /* + * Check if caller holds pdip busy - can cause deadlocks during + * devfs_walk() + */ + if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) { + cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: " + "devinfo branch(%p) not held or parent busy held", + (void *)rdip); + ndi_rele_devi(rdip); + kmem_free(path, MAXPATHLEN); + return (-1); + } + + ndi_devi_enter(pdip, &circ); + (void) ddi_pathname(rdip, path); + ndi_devi_exit(pdip, circ); + + bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS, + mod_hash_null_valdtor, sizeof (struct dev_info)); + + bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS, + mod_hash_null_valdtor, sizeof (struct snode)); + + if (devfs_walk(path, visit_dvnode, &bsa)) { + cmn_err(CE_WARN, "e_ddi_branch_referenced: " + "devfs walk failed for: %s", path); + kmem_free(path, MAXPATHLEN); + bsa.s_total = bsa.dv_total = -1; + goto out; + } + + kmem_free(path, MAXPATHLEN); + + /* + * Walk the snode table to detect device opens, which are currently + * maintained on specfs common snodes. + */ + spec_snode_walk(visit_snode, &bsa); + + if (callback == NULL) + goto out; + + bsa.callback = callback; + bsa.arg = arg; + + if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) { + ndi_devi_enter(rdip, &circ); + ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa); + ndi_devi_exit(rdip, circ); + } + +out: + ndi_rele_devi(rdip); + mod_hash_destroy_ptrhash(bsa.s_hash); + mod_hash_destroy_ptrhash(bsa.dv_hash); + return (bsa.s_total > bsa.dv_total ? 
bsa.s_total : bsa.dv_total); +} diff --git a/usr/src/uts/sun4/os/mlsetup.c b/usr/src/uts/sun4/os/mlsetup.c index 7095e6551d..53e4812dae 100644 --- a/usr/src/uts/sun4/os/mlsetup.c +++ b/usr/src/uts/sun4/os/mlsetup.c @@ -71,6 +71,7 @@ */ extern void map_wellknown_devices(void); extern void hsvc_setup(void); +extern void mach_descrip_startup_init(void); int dcache_size; int dcache_linesize; @@ -242,6 +243,13 @@ mlsetup(struct regs *rp, void *cif, kfpu_t *fp) ctlp->d.limit = TRAP_TSIZE; /* XXX dynamic someday */ ctlp->d.paddr_base = va_to_pa(trap_tr0); #endif /* TRAPTRACE */ + + /* + * Initialize the Machine Description kernel framework + */ + + mach_descrip_startup_init(); + /* * initialize HV trap trace buffer for the boot cpu */ diff --git a/usr/src/uts/sun4/os/mp_startup.c b/usr/src/uts/sun4/os/mp_startup.c index ed3477597d..0139e6a5f0 100644 --- a/usr/src/uts/sun4/os/mp_startup.c +++ b/usr/src/uts/sun4/os/mp_startup.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -45,6 +46,7 @@ #include <sys/cpu_sgnblk_defs.h> extern void cpu_intrq_setup(struct cpu *); +extern void cpu_intrq_cleanup(struct cpu *); extern void cpu_intrq_register(struct cpu *); struct cpu *cpus; /* pointer to other cpus; dynamically allocate */ @@ -469,6 +471,11 @@ cleanup_cpu_common(int cpuid) cleanup_intr_pool(cp); /* + * Clean any machine specific interrupt states. + */ + cpu_intrq_cleanup(cp); + + /* * At this point, the only threads bound to this CPU should be * special per-cpu threads: it's idle thread, it's pause thread, * and it's interrupt threads. Clean these up. 
diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c index 8e2ce99fc8..3a0506263c 100644 --- a/usr/src/uts/sun4/os/startup.c +++ b/usr/src/uts/sun4/os/startup.c @@ -71,16 +71,21 @@ extern void cpu_intrq_register(struct cpu *); extern void contig_mem_init(void); extern void mach_dump_buffer_init(void); extern void mach_descrip_init(void); +extern void mach_descrip_startup_fini(void); extern void mach_memscrub(void); extern void mach_fpras(void); extern void mach_cpu_halt_idle(void); extern void mach_hw_copy_limit(void); +extern void load_mach_drivers(void); extern void load_tod_module(void); #pragma weak load_tod_module extern int ndata_alloc_mmfsa(struct memlist *ndata); #pragma weak ndata_alloc_mmfsa +extern void cif_init(void); +#pragma weak cif_init + extern void parse_idprom(void); extern void add_vx_handler(char *, int, void (*)(cell_t *)); extern void mem_config_init(void); @@ -1748,6 +1753,13 @@ startup_bop_gone(void) extern int bop_io_quiesced; /* + * Destroy the MD initialized at startup + * The startup initializes the MD framework + * using prom and BOP alloc free it now. + */ + mach_descrip_startup_fini(); + + /* * Call back into boot and release boots resources. 
*/ BOP_QUIESCE_IO(bootops); @@ -2198,6 +2210,10 @@ post_startup(void) */ (void) modload("fs", "procfs"); + /* load machine class specific drivers */ + load_mach_drivers(); + + /* load platform specific drivers */ if (&load_platform_drivers) load_platform_drivers(); @@ -2214,6 +2230,9 @@ post_startup(void) #ifdef PTL1_PANIC_DEBUG init_ptl1_thread(); #endif /* PTL1_PANIC_DEBUG */ + + if (&cif_init) + cif_init(); } #ifdef PTL1_PANIC_DEBUG diff --git a/usr/src/uts/sun4u/os/mach_ddi_impl.c b/usr/src/uts/sun4u/os/mach_ddi_impl.c index 195d86520b..6f01f3408e 100644 --- a/usr/src/uts/sun4u/os/mach_ddi_impl.c +++ b/usr/src/uts/sun4u/os/mach_ddi_impl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
@@ -35,7 +35,6 @@ #include <sys/ethernet.h> #include <sys/idprom.h> #include <sys/machsystm.h> -#include <sys/modhash.h> #include <sys/promif.h> #include <sys/prom_plat.h> #include <sys/sunndi.h> @@ -397,831 +396,3 @@ dip_to_cpu_id(dev_info_t *dip, processorid_t *cpu_id) } return (DDI_FAILURE); } - -/* - * Platform independent DR routines - */ - -static int -ndi2errno(int n) -{ - int err = 0; - - switch (n) { - case NDI_NOMEM: - err = ENOMEM; - break; - case NDI_BUSY: - err = EBUSY; - break; - case NDI_FAULT: - err = EFAULT; - break; - case NDI_FAILURE: - err = EIO; - break; - case NDI_SUCCESS: - break; - case NDI_BADHANDLE: - default: - err = EINVAL; - break; - } - return (err); -} - -/* - * Prom tree node list - */ -struct ptnode { - pnode_t nodeid; - struct ptnode *next; -}; - -/* - * Prom tree walk arg - */ -struct pta { - dev_info_t *pdip; - devi_branch_t *bp; - uint_t flags; - dev_info_t *fdip; - struct ptnode *head; -}; - -static void -visit_node(pnode_t nodeid, struct pta *ap) -{ - struct ptnode **nextp; - int (*select)(pnode_t, void *, uint_t); - - ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE); - - select = ap->bp->create.prom_branch_select; - - ASSERT(select); - - if (select(nodeid, ap->bp->arg, 0) == DDI_SUCCESS) { - - for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next) - ; - - *nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP); - - (*nextp)->nodeid = nodeid; - } - - if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD) - return; - - nodeid = prom_childnode(nodeid); - while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { - visit_node(nodeid, ap); - nodeid = prom_nextnode(nodeid); - } -} - -/*ARGSUSED*/ -static int -set_dip_offline(dev_info_t *dip, void *arg) -{ - ASSERT(dip); - - mutex_enter(&(DEVI(dip)->devi_lock)); - if (!DEVI_IS_DEVICE_OFFLINE(dip)) - DEVI_SET_DEVICE_OFFLINE(dip); - mutex_exit(&(DEVI(dip)->devi_lock)); - - return (DDI_WALK_CONTINUE); -} - -/*ARGSUSED*/ -static int -create_prom_branch(void *arg, int has_changed) 
-{ - int circ, c; - int exists, rv; - pnode_t nodeid; - struct ptnode *tnp; - dev_info_t *dip; - struct pta *ap = arg; - devi_branch_t *bp; - - ASSERT(ap); - ASSERT(ap->fdip == NULL); - ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip)); - - bp = ap->bp; - - nodeid = ddi_get_nodeid(ap->pdip); - if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) { - cmn_err(CE_WARN, "create_prom_branch: invalid " - "nodeid: 0x%x", nodeid); - return (EINVAL); - } - - ap->head = NULL; - - nodeid = prom_childnode(nodeid); - while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) { - visit_node(nodeid, ap); - nodeid = prom_nextnode(nodeid); - } - - if (ap->head == NULL) - return (ENODEV); - - rv = 0; - while ((tnp = ap->head) != NULL) { - ap->head = tnp->next; - - ndi_devi_enter(ap->pdip, &circ); - - /* - * Check if the branch already exists. - */ - exists = 0; - dip = e_ddi_nodeid_to_dip(tnp->nodeid); - if (dip != NULL) { - exists = 1; - - /* Parent is held busy, so release hold */ - ndi_rele_devi(dip); -#ifdef DEBUG - cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists" - " for nodeid 0x%x", (void *)dip, tnp->nodeid); -#endif - } else { - dip = i_ddi_create_branch(ap->pdip, tnp->nodeid); - } - - kmem_free(tnp, sizeof (struct ptnode)); - - if (dip == NULL) { - ndi_devi_exit(ap->pdip, circ); - rv = EIO; - continue; - } - - ASSERT(ddi_get_parent(dip) == ap->pdip); - - /* - * Hold the branch if it is not already held - */ - if (!exists) - e_ddi_branch_hold(dip); - - ASSERT(e_ddi_branch_held(dip)); - - /* - * Set all dips in the branch offline so that - * only a "configure" operation can attach - * the branch - */ - (void) set_dip_offline(dip, NULL); - - ndi_devi_enter(dip, &c); - ddi_walk_devs(ddi_get_child(dip), set_dip_offline, NULL); - ndi_devi_exit(dip, c); - - ndi_devi_exit(ap->pdip, circ); - - if (ap->flags & DEVI_BRANCH_CONFIGURE) { - int error = e_ddi_branch_configure(dip, &ap->fdip, 0); - if (error && rv == 0) - rv = error; - } - - /* - * Invoke devi_branch_callback() (if it 
exists) only for - * newly created branches - */ - if (bp->devi_branch_callback && !exists) - bp->devi_branch_callback(dip, bp->arg, 0); - } - - return (rv); -} - -static int -sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp) -{ - int rv, circ, len; - int i, flags; - dev_info_t *dip; - char *nbuf; - static const char *noname = "<none>"; - - ASSERT(pdip); - ASSERT(DEVI_BUSY_OWNED(pdip)); - - flags = 0; - - /* - * Creating the root of a branch ? - */ - if (rdipp) { - *rdipp = NULL; - flags = DEVI_BRANCH_ROOT; - } - - ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip); - rv = bp->create.sid_branch_create(dip, bp->arg, flags); - - nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP); - - if (rv == DDI_WALK_ERROR) { - cmn_err(CE_WARN, "e_ddi_branch_create: Error setting" - " properties on devinfo node %p", (void *)dip); - goto fail; - } - - len = OBP_MAXDRVNAME; - if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip, - DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len) - != DDI_PROP_SUCCESS) { - cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has" - "no name property", (void *)dip); - goto fail; - } - - ASSERT(i_ddi_node_state(dip) == DS_PROTO); - if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) { - cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)" - " for devinfo node %p", nbuf, (void *)dip); - goto fail; - } - - kmem_free(nbuf, OBP_MAXDRVNAME); - - /* - * Ignore bind failures just like boot does - */ - (void) ndi_devi_bind_driver(dip, 0); - - switch (rv) { - case DDI_WALK_CONTINUE: - case DDI_WALK_PRUNESIB: - ndi_devi_enter(dip, &circ); - - i = DDI_WALK_CONTINUE; - for (; i == DDI_WALK_CONTINUE; ) { - i = sid_node_create(dip, bp, NULL); - } - - ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB); - if (i == DDI_WALK_ERROR) - rv = i; - /* - * If PRUNESIB stop creating siblings - * of dip's child. Subsequent walk behavior - * is determined by rv returned by dip. 
- */ - - ndi_devi_exit(dip, circ); - break; - case DDI_WALK_TERMINATE: - /* - * Don't create children and ask our parent - * to not create siblings either. - */ - rv = DDI_WALK_PRUNESIB; - break; - case DDI_WALK_PRUNECHILD: - /* - * Don't create children, but ask parent to continue - * with siblings. - */ - rv = DDI_WALK_CONTINUE; - break; - default: - ASSERT(0); - break; - } - - if (rdipp) - *rdipp = dip; - - /* - * Set device offline - only the "configure" op should cause an attach - */ - (void) set_dip_offline(dip, NULL); - - return (rv); -fail: - (void) ndi_devi_free(dip); - kmem_free(nbuf, OBP_MAXDRVNAME); - return (DDI_WALK_ERROR); -} - -static int -create_sid_branch( - dev_info_t *pdip, - devi_branch_t *bp, - dev_info_t **dipp, - uint_t flags) -{ - int rv = 0, state = DDI_WALK_CONTINUE; - dev_info_t *rdip; - - while (state == DDI_WALK_CONTINUE) { - int circ; - - ndi_devi_enter(pdip, &circ); - - state = sid_node_create(pdip, bp, &rdip); - if (rdip == NULL) { - ndi_devi_exit(pdip, circ); - ASSERT(state == DDI_WALK_ERROR); - break; - } - - e_ddi_branch_hold(rdip); - - ndi_devi_exit(pdip, circ); - - if (flags & DEVI_BRANCH_CONFIGURE) { - int error = e_ddi_branch_configure(rdip, dipp, 0); - if (error && rv == 0) - rv = error; - } - - /* - * devi_branch_callback() is optional - */ - if (bp->devi_branch_callback) - bp->devi_branch_callback(rdip, bp->arg, 0); - } - - ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB); - - return (state == DDI_WALK_ERROR ? EIO : rv); -} - -int -e_ddi_branch_create( - dev_info_t *pdip, - devi_branch_t *bp, - dev_info_t **dipp, - uint_t flags) -{ - int prom_devi, sid_devi, error; - - if (pdip == NULL || bp == NULL || bp->type == 0) - return (EINVAL); - - prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0; - sid_devi = (bp->type == DEVI_BRANCH_SID) ? 
1 : 0; - - if (prom_devi && bp->create.prom_branch_select == NULL) - return (EINVAL); - else if (sid_devi && bp->create.sid_branch_create == NULL) - return (EINVAL); - else if (!prom_devi && !sid_devi) - return (EINVAL); - - if (flags & DEVI_BRANCH_EVENT) - return (EINVAL); - - if (prom_devi) { - struct pta pta = {0}; - - pta.pdip = pdip; - pta.bp = bp; - pta.flags = flags; - - error = prom_tree_access(create_prom_branch, &pta, NULL); - - if (dipp) - *dipp = pta.fdip; - else if (pta.fdip) - ndi_rele_devi(pta.fdip); - } else { - error = create_sid_branch(pdip, bp, dipp, flags); - } - - return (error); -} - -int -e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags) -{ - int circ, rv; - char *devnm; - dev_info_t *pdip; - - if (dipp) - *dipp = NULL; - - if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT)) - return (EINVAL); - - pdip = ddi_get_parent(rdip); - - ndi_devi_enter(pdip, &circ); - - if (!e_ddi_branch_held(rdip)) { - ndi_devi_exit(pdip, circ); - cmn_err(CE_WARN, "e_ddi_branch_configure: " - "dip(%p) not held", (void *)rdip); - return (EINVAL); - } - - if (i_ddi_node_state(rdip) < DS_INITIALIZED) { - /* - * First attempt to bind a driver. 
If we fail, return - * success (On some platforms, dips for some device - * types (CPUs) may not have a driver) - */ - if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) { - ndi_devi_exit(pdip, circ); - return (0); - } - - if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) { - rv = NDI_FAILURE; - goto out; - } - } - - ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED); - - devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); - - (void) ddi_deviname(rdip, devnm); - - if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip, - NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) { - /* release hold from ndi_devi_config_one() */ - ndi_rele_devi(rdip); - } - - kmem_free(devnm, MAXNAMELEN + 1); -out: - if (rv != NDI_SUCCESS && dipp) { - ndi_hold_devi(rdip); - *dipp = rdip; - } - ndi_devi_exit(pdip, circ); - return (ndi2errno(rv)); -} - -void -e_ddi_branch_hold(dev_info_t *rdip) -{ - if (e_ddi_branch_held(rdip)) { - cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held"); - return; - } - - mutex_enter(&DEVI(rdip)->devi_lock); - if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) { - DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD; - DEVI(rdip)->devi_ref++; - } - ASSERT(DEVI(rdip)->devi_ref > 0); - mutex_exit(&DEVI(rdip)->devi_lock); -} - -int -e_ddi_branch_held(dev_info_t *rdip) -{ - int rv = 0; - - mutex_enter(&DEVI(rdip)->devi_lock); - if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) && - DEVI(rdip)->devi_ref > 0) { - rv = 1; - } - mutex_exit(&DEVI(rdip)->devi_lock); - - return (rv); -} -void -e_ddi_branch_rele(dev_info_t *rdip) -{ - mutex_enter(&DEVI(rdip)->devi_lock); - DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD; - DEVI(rdip)->devi_ref--; - mutex_exit(&DEVI(rdip)->devi_lock); -} - -int -e_ddi_branch_unconfigure( - dev_info_t *rdip, - dev_info_t **dipp, - uint_t flags) -{ - int circ, rv; - int destroy; - char *devnm; - uint_t nflags; - dev_info_t *pdip; - - if (dipp) - *dipp = NULL; - - if (rdip == NULL) - return (EINVAL); - - pdip = ddi_get_parent(rdip); - - ASSERT(pdip); - - /* - * Check if 
caller holds pdip busy - can cause deadlocks during - * devfs_clean() - */ - if (DEVI_BUSY_OWNED(pdip)) { - cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent" - " devinfo node(%p) is busy held", (void *)pdip); - return (EINVAL); - } - - destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0; - - devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP); - - ndi_devi_enter(pdip, &circ); - (void) ddi_deviname(rdip, devnm); - ndi_devi_exit(pdip, circ); - - /* - * ddi_deviname() returns a component name with / prepended. - */ - rv = devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE); - if (rv) { - kmem_free(devnm, MAXNAMELEN + 1); - return (rv); - } - - ndi_devi_enter(pdip, &circ); - - /* - * Recreate device name as it may have changed state (init/uninit) - * when parent busy lock was dropped for devfs_clean() - */ - (void) ddi_deviname(rdip, devnm); - - if (!e_ddi_branch_held(rdip)) { - kmem_free(devnm, MAXNAMELEN + 1); - ndi_devi_exit(pdip, circ); - cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held", - destroy ? "destroy" : "unconfigure", (void *)rdip); - return (EINVAL); - } - - /* - * Release hold on the branch. This is ok since we are holding the - * parent busy. If rdip is not removed, we must do a hold on the - * branch before returning. 
- */ - e_ddi_branch_rele(rdip); - - nflags = NDI_DEVI_OFFLINE; - if (destroy || (flags & DEVI_BRANCH_DESTROY)) { - nflags |= NDI_DEVI_REMOVE; - destroy = 1; - } else { - nflags |= NDI_UNCONFIG; /* uninit but don't remove */ - } - - if (flags & DEVI_BRANCH_EVENT) - nflags |= NDI_POST_EVENT; - - if (i_ddi_devi_attached(pdip) && - (i_ddi_node_state(rdip) >= DS_INITIALIZED)) { - rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags); - } else { - rv = e_ddi_devi_unconfig(rdip, dipp, nflags); - if (rv == NDI_SUCCESS) { - ASSERT(!destroy || ddi_get_child(rdip) == NULL); - rv = ndi_devi_offline(rdip, nflags); - } - } - - if (!destroy || rv != NDI_SUCCESS) { - /* The dip still exists, so do a hold */ - e_ddi_branch_hold(rdip); - } -out: - kmem_free(devnm, MAXNAMELEN + 1); - ndi_devi_exit(pdip, circ); - return (ndi2errno(rv)); -} - -int -e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag) -{ - return (e_ddi_branch_unconfigure(rdip, dipp, - flag|DEVI_BRANCH_DESTROY)); -} - -/* - * Number of chains for hash table - */ -#define NUMCHAINS 17 - -/* - * Devinfo busy arg - */ -struct devi_busy { - int dv_total; - int s_total; - mod_hash_t *dv_hash; - mod_hash_t *s_hash; - int (*callback)(dev_info_t *, void *, uint_t); - void *arg; -}; - -static int -visit_dip(dev_info_t *dip, void *arg) -{ - uintptr_t sbusy, dvbusy, ref; - struct devi_busy *bsp = arg; - - ASSERT(bsp->callback); - - /* - * A dip cannot be busy if its reference count is 0 - */ - if ((ref = e_ddi_devi_holdcnt(dip)) == 0) { - return (bsp->callback(dip, bsp->arg, 0)); - } - - if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy)) - dvbusy = 0; - - /* - * To catch device opens currently maintained on specfs common snodes. 
- */ - if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) - sbusy = 0; - -#ifdef DEBUG - if (ref < sbusy || ref < dvbusy) { - cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu " - "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref); - } -#endif - - dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy; - - return (bsp->callback(dip, bsp->arg, dvbusy)); -} - -static int -visit_snode(struct snode *sp, void *arg) -{ - uintptr_t sbusy; - dev_info_t *dip; - int count; - struct devi_busy *bsp = arg; - - ASSERT(sp); - - /* - * The stable lock is held. This prevents - * the snode and its associated dip from - * going away. - */ - dip = NULL; - count = spec_devi_open_count(sp, &dip); - - if (count <= 0) - return (DDI_WALK_CONTINUE); - - ASSERT(dip); - - if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy)) - sbusy = count; - else - sbusy += count; - - if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) { - cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, " - "sbusy = %lu", "e_ddi_branch_referenced", - (void *)dip, sbusy); - } - - bsp->s_total += count; - - return (DDI_WALK_CONTINUE); -} - -static void -visit_dvnode(struct dv_node *dv, void *arg) -{ - uintptr_t dvbusy; - uint_t count; - struct vnode *vp; - struct devi_busy *bsp = arg; - - ASSERT(dv && dv->dv_devi); - - vp = DVTOV(dv); - - mutex_enter(&vp->v_lock); - count = vp->v_count; - mutex_exit(&vp->v_lock); - - if (!count) - return; - - if (mod_hash_remove(bsp->dv_hash, dv->dv_devi, - (mod_hash_val_t *)&dvbusy)) - dvbusy = count; - else - dvbusy += count; - - if (mod_hash_insert(bsp->dv_hash, dv->dv_devi, - (mod_hash_val_t)dvbusy)) { - cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, " - "dvbusy=%lu", "e_ddi_branch_referenced", - (void *)dv->dv_devi, dvbusy); - } - - bsp->dv_total += count; -} - -/* - * Returns reference count on success or -1 on failure. 
- */ -int -e_ddi_branch_referenced( - dev_info_t *rdip, - int (*callback)(dev_info_t *dip, void *arg, uint_t ref), - void *arg) -{ - int circ; - char *path; - dev_info_t *pdip; - struct devi_busy bsa = {0}; - - ASSERT(rdip); - - path = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - ndi_hold_devi(rdip); - - pdip = ddi_get_parent(rdip); - - ASSERT(pdip); - - /* - * Check if caller holds pdip busy - can cause deadlocks during - * devfs_walk() - */ - if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) { - cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: " - "devinfo branch(%p) not held or parent busy held", - (void *)rdip); - ndi_rele_devi(rdip); - kmem_free(path, MAXPATHLEN); - return (-1); - } - - ndi_devi_enter(pdip, &circ); - (void) ddi_pathname(rdip, path); - ndi_devi_exit(pdip, circ); - - bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS, - mod_hash_null_valdtor, sizeof (struct dev_info)); - - bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS, - mod_hash_null_valdtor, sizeof (struct snode)); - - if (devfs_walk(path, visit_dvnode, &bsa)) { - cmn_err(CE_WARN, "e_ddi_branch_referenced: " - "devfs walk failed for: %s", path); - kmem_free(path, MAXPATHLEN); - bsa.s_total = bsa.dv_total = -1; - goto out; - } - - kmem_free(path, MAXPATHLEN); - - /* - * Walk the snode table to detect device opens, which are currently - * maintained on specfs common snodes. - */ - spec_snode_walk(visit_snode, &bsa); - - if (callback == NULL) - goto out; - - bsa.callback = callback; - bsa.arg = arg; - - if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) { - ndi_devi_enter(rdip, &circ); - ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa); - ndi_devi_exit(rdip, circ); - } - -out: - ndi_rele_devi(rdip); - mod_hash_destroy_ptrhash(bsa.s_hash); - mod_hash_destroy_ptrhash(bsa.dv_hash); - return (bsa.s_total > bsa.dv_total ? 
bsa.s_total : bsa.dv_total); -} diff --git a/usr/src/uts/sun4u/os/mach_startup.c b/usr/src/uts/sun4u/os/mach_startup.c index 5a8366da64..78639b9a80 100644 --- a/usr/src/uts/sun4u/os/mach_startup.c +++ b/usr/src/uts/sun4u/os/mach_startup.c @@ -402,6 +402,13 @@ cpu_intrq_setup(struct cpu *cp) /*ARGSUSED*/ void +cpu_intrq_cleanup(struct cpu *cp) +{ + /* Interrupt mondo queues not applicable to sun4u */ +} + +/*ARGSUSED*/ +void cpu_intrq_register(struct cpu *cp) { /* Interrupt/error queues not applicable to sun4u */ @@ -429,9 +436,29 @@ mach_htraptrace_cleanup(int cpuid) } void +mach_descrip_startup_init(void) +{ + /* + * Only for sun4v. + * Initialize Machine description framework during startup. + */ +} +void +mach_descrip_startup_fini(void) +{ + /* + * Only for sun4v. + * Clean up Machine Description framework during startup. + */ +} + +void mach_descrip_init(void) { - /* Obtain Machine description - only for sun4v */ + /* + * Only for sun4v. + * Initialize Machine description framework. + */ } void @@ -440,6 +467,12 @@ hsvc_setup(void) /* Setup hypervisor services, not applicable to sun4u */ } +void +load_mach_drivers(void) +{ + /* Currently no machine class (sun4u) specific drivers to load */ +} + /* * Return true if the machine we're running on is a Positron. * (Positron is an unsupported developers platform.) 
diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files index 8fd0f3f1af..adfda1129f 100644 --- a/usr/src/uts/sun4v/Makefile.files +++ b/usr/src/uts/sun4v/Makefile.files @@ -34,6 +34,7 @@ # object lists # CORE_OBJS += bootops.o +CORE_OBJS += prom_alloc.o CORE_OBJS += cmp.o CORE_OBJS += cpc_hwreg.o CORE_OBJS += cpc_subr.o @@ -44,11 +45,13 @@ CORE_OBJS += hardclk.o CORE_OBJS += hat_sfmmu.o CORE_OBJS += hat_kdi.o CORE_OBJS += hsvc.o +CORE_OBJS += lpad.o CORE_OBJS += mach_cpu_states.o CORE_OBJS += mach_ddi_impl.o -CORE_OBJS += mach_descrip.o +CORE_OBJS += mach_descrip.o CORE_OBJS += mach_mp_startup.o CORE_OBJS += mach_mp_states.o +CORE_OBJS += mach_proc_init.o CORE_OBJS += mach_sfmmu.o CORE_OBJS += mach_startup.o CORE_OBJS += mach_subr_asm.o @@ -59,13 +62,30 @@ CORE_OBJS += mem_cage.o CORE_OBJS += mem_config.o CORE_OBJS += memlist_new.o CORE_OBJS += ppage.o +CORE_OBJS += promif_asr.o +CORE_OBJS += promif_cpu.o +CORE_OBJS += promif_emul.o +CORE_OBJS += promif_mon.o +CORE_OBJS += promif_io.o +CORE_OBJS += promif_interp.o +CORE_OBJS += promif_key.o +CORE_OBJS += promif_power_off.o +CORE_OBJS += promif_prop.o +CORE_OBJS += promif_node.o +CORE_OBJS += promif_reboot.o +CORE_OBJS += promif_stree.o +CORE_OBJS += promif_test.o +CORE_OBJS += promif_version.o CORE_OBJS += sfmmu_kdi.o CORE_OBJS += swtch.o CORE_OBJS += xhat_sfmmu.o +CORE_OBJS += mdesc_diff.o CORE_OBJS += mdesc_findname.o CORE_OBJS += mdesc_findnodeprop.o CORE_OBJS += mdesc_fini.o +CORE_OBJS += mdesc_getbinsize.o +CORE_OBJS += mdesc_getgen.o CORE_OBJS += mdesc_getpropdata.o CORE_OBJS += mdesc_getpropstr.o CORE_OBJS += mdesc_getpropval.o @@ -109,14 +129,26 @@ MEMTEST_OBJS += memtest.o memtest_asm.o \ # QCN_OBJS = qcn.o VNEX_OBJS = vnex.o +CNEX_OBJS = cnex.o GLVC_OBJS = glvc.o glvc_hcall.o MDESC_OBJS = mdesc.o +LDC_OBJS = ldc.o +VLDC_OBJS = vldc.o +VCC_OBJS = vcc.o +VNET_OBJS = vnet.o vnet_gen.o +VSW_OBJS = vsw.o +VDC_OBJS = vdc.o +VDS_OBJS = vds.o # # Misc modules # -OBPSYM_OBJS += obpsym.o 
obpsym_1275.o BOOTDEV_OBJS += bootdev.o +DR_CPU_OBJS += dr_cpu.o dr_util.o +DS_OBJS = ds.o +FAULT_ISO_OBJS = fault_iso.o +OBPSYM_OBJS += obpsym.o obpsym_1275.o +PLATSVC_OBJS = platsvc.o mdeg.o # # Performance Counter BackEnd (PCBE) Modules @@ -163,4 +195,3 @@ ASSYM_DEPS += mach_sfmmu_asm.o sfmmu_asm.o # ARCFOUR_OBJS += arcfour.o arcfour_crypt.o - diff --git a/usr/src/uts/sun4v/Makefile.rules b/usr/src/uts/sun4v/Makefile.rules index 8b49649880..6afc3c0da0 100644 --- a/usr/src/uts/sun4v/Makefile.rules +++ b/usr/src/uts/sun4v/Makefile.rules @@ -62,6 +62,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/sun4v/pcbe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/sun4v/promif/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/sun4v/io/px/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -163,6 +167,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/sun4v/os/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/sun4v/pcbe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/sun4v/promif/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/sun4v/vm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/sun4v/Makefile.sun4v.shared b/usr/src/uts/sun4v/Makefile.sun4v.shared index f5c55b3d47..1d6a5ad798 100644 --- a/usr/src/uts/sun4v/Makefile.sun4v.shared +++ b/usr/src/uts/sun4v/Makefile.sun4v.shared @@ -23,7 +23,7 @@ # Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" +#ident "%Z%%M% %I% %E% SMI" # # This makefile contains the common definitions for the sun4v unix # and all sun4v implementation architecture dependent modules. 
@@ -309,19 +309,26 @@ MACH_NOT_YET_KMODS = $(AUTOCONF_OBJS) # # Machine Specific Driver Modules (/kernel/drv): # -DRV_KMODS += vnex -DRV_KMODS += qcn -DRV_KMODS += dma +DRV_KMODS += bge +DRV_KMODS += cnex DRV_KMODS += cpc -DRV_KMODS += rootnex -DRV_KMODS += trapstat -DRV_KMODS += px +DRV_KMODS += dma +DRV_KMODS += ebus DRV_KMODS += fpc DRV_KMODS += glvc -DRV_KMODS += bge DRV_KMODS += mdesc -DRV_KMODS += ebus -DRV_KMODS += su +DRV_KMODS += px +DRV_KMODS += qcn +DRV_KMODS += rootnex +DRV_KMODS += su +DRV_KMODS += trapstat +DRV_KMODS += vcc +DRV_KMODS += vdc +DRV_KMODS += vds +DRV_KMODS += vldc +DRV_KMODS += vnet +DRV_KMODS += vnex +DRV_KMODS += vsw $(CLOSED_BUILD)CLOSED_DRV_KMODS += memtest $(CLOSED_BUILD)CLOSED_DRV_KMODS += ncp @@ -354,8 +361,16 @@ SYS_KMODS += # # 'User' Modules (/kernel/misc): # -MISC_KMODS += obpsym bootdev vis platmod - +MISC_KMODS += bootdev +MISC_KMODS += dr_cpu +MISC_KMODS += ds +MISC_KMODS += fault_iso +MISC_KMODS += ldc +MISC_KMODS += obpsym +MISC_KMODS += platmod +MISC_KMODS += platsvc +MISC_KMODS += vis + # md5 optimized for Niagara # MISC_KMODS += md5 diff --git a/usr/src/uts/sun4v/cnex/Makefile b/usr/src/uts/sun4v/cnex/Makefile new file mode 100644 index 0000000000..8f520908a5 --- /dev/null +++ b/usr/src/uts/sun4v/cnex/Makefile @@ -0,0 +1,99 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/sun4v/cnex/Makefile +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the cnex driver kernel module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cnex +OBJECTS = $(CNEX_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CNEX_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# Module dependencies +# +LDFLAGS += -dy -Nmisc/ldc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/cpu/common_asm.s b/usr/src/uts/sun4v/cpu/common_asm.s index 63ff1e77c6..8de96b3bed 100644 --- a/usr/src/uts/sun4v/cpu/common_asm.s +++ b/usr/src/uts/sun4v/cpu/common_asm.s @@ -1050,8 +1050,16 @@ flush_instr_mem(caddr_t vaddr, size_t len) ta FAST_TRAP brz,pt %o0, 1f nop - ba ptl1_panic + mov PTL1_BAD_HCALL, %g1 + + cmp %o0, H_ENOMAP + move %xcc, PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP, %g1 + + cmp %o0, H_EINVAL + move %xcc, PTL1_BAD_HCALL_UNMAP_PERM_EINVAL, %g1 + + ba,a ptl1_panic 1: mov %g6, %o5 mov %g5, %o2 diff --git a/usr/src/uts/sun4v/cpu/generic.c b/usr/src/uts/sun4v/cpu/generic.c index e753000a99..0a6d9394f1 100644 --- a/usr/src/uts/sun4v/cpu/generic.c +++ b/usr/src/uts/sun4v/cpu/generic.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,140 +53,77 @@ #include <sys/panic.h> #include <sys/dtrace.h> #include <vm/seg_spt.h> +#include <sys/simulate.h> +#include <sys/fault.h> -#define S_VAC_SIZE MMU_PAGESIZE /* XXXQ? 
*/ - -/* - * Maximum number of contexts - */ -#define MAX_NCTXS (1 << 13) uint_t root_phys_addr_lo_mask = 0xffffffffU; void cpu_setup(void) { - extern int at_flags; - extern int disable_delay_tlb_flush, delay_tlb_flush; extern int mmu_exported_pagesize_mask; - extern int get_cpu_pagesizes(void); - - cache |= (CACHE_PTAG | CACHE_IOCOHERENT); - - at_flags = EF_SPARC_32PLUS | EF_SPARC_SUN_US1; /* XXXQ */ + char *generic_isa_set[] = { + "sparcv9+vis", + "sparcv8plus+vis", + NULL + }; /* - * Use the maximum number of contexts available for Spitfire unless - * it has been tuned for debugging. - * We are checking against 0 here since this value can be patched - * while booting. It can not be patched via /etc/system since it - * will be patched too late and thus cause the system to panic. + * The setup common to all CPU modules is done in cpu_setup_common + * routine. */ - if (nctxs == 0) - nctxs = MAX_NCTXS; + cpu_setup_common(generic_isa_set); - if (use_page_coloring) { - do_pg_coloring = 1; - if (use_virtual_coloring) - do_virtual_coloring = 1; - } - /* - * Initalize supported page sizes information before the PD. - * If no information is available, then initialize the - * mmu_exported_pagesize_mask to a reasonable value for that processor. - */ - mmu_exported_pagesize_mask = get_cpu_pagesizes(); - if (mmu_exported_pagesize_mask <= 0) { - mmu_exported_pagesize_mask = (1 << TTE8K) | (1 << TTE64K) | - (1 << TTE4M); - } - - /* - * Tune pp_slots to use up to 1/8th of the tlb entries. - */ - pp_slots = MIN(8, MAXPP_SLOTS); - - /* - * Block stores invalidate all pages of the d$ so pagecopy - * et. al. do not need virtual translations with virtual - * coloring taken into consideration. - */ - pp_consistent_coloring = 0; - isa_list = - "sparcv9+vis sparcv9 " - "sparcv8plus+vis sparcv8plus " - "sparcv8 sparcv8-fsmuld sparcv7 sparc"; - - /* - * On Spitfire, there's a hole in the address space - * that we must never map (the hardware only support 44-bits of - * virtual address). 
Later CPUs are expected to have wider - * supported address ranges. - * - * See address map on p23 of the UltraSPARC 1 user's manual. - */ -/* XXXQ get from machine description */ - hole_start = (caddr_t)0x80000000000ull; - hole_end = (caddr_t)0xfffff80000000000ull; - - /* - * The kpm mapping window. - * kpm_size: - * The size of a single kpm range. - * The overall size will be: kpm_size * vac_colors. - * kpm_vbase: - * The virtual start address of the kpm range within the kernel - * virtual address space. kpm_vbase has to be kpm_size aligned. - */ - kpm_size = (size_t)(2ull * 1024 * 1024 * 1024 * 1024); /* 2TB */ - kpm_size_shift = 41; - kpm_vbase = (caddr_t)0xfffffa0000000000ull; /* 16EB - 6TB */ + cache |= (CACHE_PTAG | CACHE_IOCOHERENT); - /* - * The traptrace code uses either %tick or %stick for - * timestamping. We have %stick so we can use it. - */ - traptrace_use_stick = 1; + if (broken_md_flag) { + /* + * Turn on the missing bits supported by sun4v architecture in + * MMU pagesize mask returned by MD. + */ + mmu_exported_pagesize_mask |= DEFAULT_SUN4V_MMU_PAGESIZE_MASK; + } else { + /* + * According to sun4v architecture each processor must + * support 8K, 64K and 4M page sizes. If any of the page + * size is missing from page size mask, then panic. + */ + if ((mmu_exported_pagesize_mask & + DEFAULT_SUN4V_MMU_PAGESIZE_MASK) != + DEFAULT_SUN4V_MMU_PAGESIZE_MASK) + cmn_err(CE_PANIC, "machine description" + " does not have required sun4v page sizes" + " 8K, 64K and 4M: MD mask is 0x%x", + mmu_exported_pagesize_mask); + } /* - * sun4v provides demap_all + * If processor supports the subset of full 64-bit virtual + * address space, then set VA hole accordingly. */ - if (!disable_delay_tlb_flush) - delay_tlb_flush = 1; + if (va_bits < VA_ADDRESS_SPACE_BITS) { + hole_start = (caddr_t)(1ull << (va_bits - 1)); + hole_end = (caddr_t)(0ull - (1ull << (va_bits - 1))); + } else { + hole_start = hole_end = 0; + } } -/* - * Set the magic constants of the implementation. 
- */ void cpu_fiximp(struct cpu_node *cpunode) { - extern int vac_size, vac_shift; - extern uint_t vac_mask; - int i, a; - - /* - * The assumption here is that fillsysinfo will eventually - * have code to fill this info in from the PD. - * We hard code this for now. - * Once the PD access library is done this code - * might need to be changed to get the info from the PD - */ /* - * Page Coloring defaults for sun4v + * The Cache node is optional in MD. Therefore in case "Cache" + * does not exists in MD, set the default L2 cache associativity, + * size, linesize for generic CPU module. */ - ecache_setsize = 0x100000; - ecache_alignsize = 64; - cpunode->ecache_setsize = 0x100000; - - vac_size = S_VAC_SIZE; - vac_mask = MMU_PAGEMASK & (vac_size - 1); - i = 0; a = vac_size; - while (a >>= 1) - ++i; - vac_shift = i; - shm_alignment = vac_size; - vac = 0; + if (cpunode->ecache_size == 0) + cpunode->ecache_size = 0x100000; + if (cpunode->ecache_linesize == 0) + cpunode->ecache_linesize = 64; + if (cpunode->ecache_associativity == 0) + cpunode->ecache_associativity = 1; } void @@ -220,7 +156,9 @@ cpu_init_private(struct cpu *cp) * unit sharing information from the Machine Description table. * It defaults to the CPU id in the absence of such information. */ - cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id); + cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping; + if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND) + cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id); } void @@ -246,6 +184,96 @@ cpu_inv_tsb(caddr_t tsb_base, uint_t tsb_bytes) } /* + * Sun4v kernel must emulate code a generic sun4v processor may not support + * i.e. VIS1 and VIS2. 
+ */ +#define IS_FLOAT(i) (((i) & 0x1000000) != 0) +#define IS_IBIT_SET(x) (x & 0x2000) +#define IS_VIS1(op, op3)(op == 2 && op3 == 0x36) +#define IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi) \ + (op == 3 && (op3 == IOP_V8_LDDFA || \ + op3 == IOP_V8_STDFA) && asi > ASI_SNFL) +int +vis1_partial_support(struct regs *rp, k_siginfo_t *siginfo, uint_t *fault) +{ + char *badaddr; + int instr; + uint_t optype, op3, asi; + uint_t rd, ignor; + + if (!USERMODE(rp->r_tstate)) + return (-1); + + instr = fetch_user_instr((caddr_t)rp->r_pc); + + rd = (instr >> 25) & 0x1f; + optype = (instr >> 30) & 0x3; + op3 = (instr >> 19) & 0x3f; + ignor = (instr >> 5) & 0xff; + if (IS_IBIT_SET(instr)) { + asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) & + TSTATE_ASI_MASK); + } else { + asi = ignor; + } + + if (!IS_VIS1(optype, op3) && + !IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi)) { + return (-1); + } + switch (simulate_unimp(rp, &badaddr)) { + case SIMU_RETRY: + break; /* regs are already set up */ + /*NOTREACHED*/ + + case SIMU_SUCCESS: + /* + * skip the successfully + * simulated instruction + */ + rp->r_pc = rp->r_npc; + rp->r_npc += 4; + break; + /*NOTREACHED*/ + + case SIMU_FAULT: + siginfo->si_signo = SIGSEGV; + siginfo->si_code = SEGV_MAPERR; + siginfo->si_addr = badaddr; + *fault = FLTBOUNDS; + break; + + case SIMU_DZERO: + siginfo->si_signo = SIGFPE; + siginfo->si_code = FPE_INTDIV; + siginfo->si_addr = (caddr_t)rp->r_pc; + *fault = FLTIZDIV; + break; + + case SIMU_UNALIGN: + siginfo->si_signo = SIGBUS; + siginfo->si_code = BUS_ADRALN; + siginfo->si_addr = badaddr; + *fault = FLTACCESS; + break; + + case SIMU_ILLEGAL: + default: + siginfo->si_signo = SIGILL; + op3 = (instr >> 19) & 0x3F; + if ((IS_FLOAT(instr) && (op3 == IOP_V8_STQFA) || + (op3 == IOP_V8_STDFA))) + siginfo->si_code = ILL_ILLADR; + else + siginfo->si_code = ILL_ILLOPC; + siginfo->si_addr = (caddr_t)rp->r_pc; + *fault = FLTILL; + break; + } + return (0); +} + +/* * Trapstat support for generic sun4v 
processor */ int diff --git a/usr/src/uts/sun4v/cpu/niagara.c b/usr/src/uts/sun4v/cpu/niagara.c index d2413f773e..125ca8e224 100644 --- a/usr/src/uts/sun4v/cpu/niagara.c +++ b/usr/src/uts/sun4v/cpu/niagara.c @@ -58,12 +58,8 @@ #include <sys/trapstat.h> #include <sys/hsvc.h> -#define S_VAC_SIZE MMU_PAGESIZE /* XXXQ? */ - -/* - * Maximum number of contexts - */ -#define MAX_NCTXS (1 << 13) +#define NI_MMU_PAGESIZE_MASK ((1 << TTE8K) | (1 << TTE64K) | (1 << TTE4M) \ + | (1 << TTE256M)) uint_t root_phys_addr_lo_mask = 0xffffffffU; static niagara_mmustat_t *cpu_tstat_va; /* VA of mmustat buffer */ @@ -82,12 +78,16 @@ static hsvc_info_t niagara_hsvc = { void cpu_setup(void) { - extern int at_flags; - extern int disable_delay_tlb_flush, delay_tlb_flush; extern int mmu_exported_pagesize_mask; - extern int get_cpu_pagesizes(void); extern int cpc_has_overflow_intr; int status; + char *ni_isa_set[] = { + "sparcv9+vis", + "sparcv9+vis2", + "sparcv8plus+vis", + "sparcv8plus+vis2", + NULL + }; /* * Negotiate the API version for Niagara specific hypervisor @@ -102,49 +102,29 @@ cpu_setup(void) niagara_hsvc_available = B_FALSE; } - cache |= (CACHE_PTAG | CACHE_IOCOHERENT); - at_flags = EF_SPARC_SUN_US3 | EF_SPARC_32PLUS | EF_SPARC_SUN_US1; - /* - * Use the maximum number of contexts available for Spitfire unless - * it has been tuned for debugging. - * We are checking against 0 here since this value can be patched - * while booting. It can not be patched via /etc/system since it - * will be patched too late and thus cause the system to panic. + * The setup common to all CPU modules is done in cpu_setup_common + * routine. */ - if (nctxs == 0) - nctxs = MAX_NCTXS; + cpu_setup_common(ni_isa_set); - if (use_page_coloring) { - do_pg_coloring = 1; - if (use_virtual_coloring) - do_virtual_coloring = 1; - } - /* - * Initalize supported page sizes information before the PD. 
- * If no information is available, then initialize the - * mmu_exported_pagesize_mask to a reasonable value for that processor. - */ - mmu_exported_pagesize_mask = get_cpu_pagesizes(); - if (mmu_exported_pagesize_mask <= 0) { - mmu_exported_pagesize_mask = (1 << TTE8K) | (1 << TTE64K) | - (1 << TTE4M) | (1 << TTE256M); - } - - /* - * Tune pp_slots to use up to 1/8th of the tlb entries. - */ - pp_slots = MIN(8, MAXPP_SLOTS); + cache |= (CACHE_PTAG | CACHE_IOCOHERENT); - /* - * Block stores invalidate all pages of the d$ so pagecopy - * et. al. do not need virtual translations with virtual - * coloring taken into consideration. - */ - pp_consistent_coloring = 0; - isa_list = - "sparcv9 sparcv8plus sparcv8 sparcv8-fsmuld sparcv7 " - "sparc sparcv9+vis sparcv9+vis2 sparcv8plus+vis sparcv8plus+vis2"; + if (broken_md_flag) { + /* + * Turn on the missing bits supported by Niagara CPU in + * MMU pagesize mask returned by MD. + */ + mmu_exported_pagesize_mask |= NI_MMU_PAGESIZE_MASK; + } else { + if ((mmu_exported_pagesize_mask & + DEFAULT_SUN4V_MMU_PAGESIZE_MASK) != + DEFAULT_SUN4V_MMU_PAGESIZE_MASK) + cmn_err(CE_PANIC, "machine description" + " does not have required sun4v page sizes" + " 8K, 64K and 4M: MD mask is 0x%x", + mmu_exported_pagesize_mask); + } cpu_hwcap_flags |= AV_SPARC_ASI_BLK_INIT; @@ -155,84 +135,34 @@ cpu_setup(void) * and must never be mapped. In addition, software must not use * pages within 4GB of the VA hole as instruction pages to * avoid problems with prefetching into the VA hole. - * - * VA hole information should be obtained from the machine - * description. - */ - hole_start = (caddr_t)(0x800000000000ul - (1ul << 32)); - hole_end = (caddr_t)(0xffff800000000000ul + (1ul << 32)); - - /* - * The kpm mapping window. - * kpm_size: - * The size of a single kpm range. - * The overall size will be: kpm_size * vac_colors. - * kpm_vbase: - * The virtual start address of the kpm range within the kernel - * virtual address space. 
kpm_vbase has to be kpm_size aligned. */ - kpm_size = (size_t)(2ull * 1024 * 1024 * 1024 * 1024); /* 2TB */ - kpm_size_shift = 41; - kpm_vbase = (caddr_t)0xfffffa0000000000ull; /* 16EB - 6TB */ + hole_start = (caddr_t)((1ull << (va_bits - 1)) - (1ull << 32)); + hole_end = (caddr_t)((0ull - (1ull << (va_bits - 1))) + (1ull << 32)); /* - * The traptrace code uses either %tick or %stick for - * timestamping. We have %stick so we can use it. - */ - traptrace_use_stick = 1; - - /* - * sun4v provides demap_all - */ - if (!disable_delay_tlb_flush) - delay_tlb_flush = 1; - /* * Niagara has a performance counter overflow interrupt */ cpc_has_overflow_intr = 1; } -#define MB * 1024 * 1024 +#define MB(n) ((n) * 1024 * 1024) /* * Set the magic constants of the implementation. */ void cpu_fiximp(struct cpu_node *cpunode) { - extern int vac_size, vac_shift; - extern uint_t vac_mask; - int i, a; - /* - * The assumption here is that fillsysinfo will eventually - * have code to fill this info in from the PD. - * We hard code this for niagara now. - * Once the PD access library is done this code - * might need to be changed to get the info from the PD + * The Cache node is optional in MD. Therefore in case "Cache" + * node does not exists in MD, set the default L2 cache associativity, + * size, linesize. 
*/ if (cpunode->ecache_size == 0) - cpunode->ecache_size = 3 MB; + cpunode->ecache_size = MB(3); if (cpunode->ecache_linesize == 0) cpunode->ecache_linesize = 64; if (cpunode->ecache_associativity == 0) cpunode->ecache_associativity = 12; - - cpunode->ecache_setsize = - cpunode->ecache_size / cpunode->ecache_associativity; - - if (ecache_setsize == 0) - ecache_setsize = cpunode->ecache_setsize; - if (ecache_alignsize == 0) - ecache_alignsize = cpunode->ecache_linesize; - - vac_size = S_VAC_SIZE; - vac_mask = MMU_PAGEMASK & (vac_size - 1); - i = 0; a = vac_size; - while (a >>= 1) - ++i; - vac_shift = i; - shm_alignment = vac_size; - vac = 0; } static int niagara_cpucnt; @@ -243,13 +173,13 @@ cpu_init_private(struct cpu *cp) extern int niagara_kstat_init(void); /* - * This code change assumes that the virtual cpu ids are identical - * to the physical cpu ids which is true for ontario but not for - * niagara in general. - * This is a temporary fix which will later be modified to obtain - * the execution unit sharing information from MD table. + * The cpu_ipipe field is initialized based on the execution + * unit sharing information from the MD. It defaults to the + * virtual CPU id in the absence of such information. */ - cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id / 4); + cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping; + if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND) + cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id); ASSERT(MUTEX_HELD(&cpu_lock)); if (niagara_cpucnt++ == 0 && niagara_hsvc_available == B_TRUE) { diff --git a/usr/src/uts/sun4v/dr_cpu/Makefile b/usr/src/uts/sun4v/dr_cpu/Makefile new file mode 100644 index 0000000000..828679baa9 --- /dev/null +++ b/usr/src/uts/sun4v/dr_cpu/Makefile @@ -0,0 +1,93 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = dr_cpu +OBJECTS = $(DR_CPU_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DR_CPU_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += -v + +# +# Turn on doubleword alignment for 64 bit registers +# +CFLAGS += -dalign + +# +# Module Dependencies +# +LDFLAGS += -dy -Nmisc/ds + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/ds/Makefile b/usr/src/uts/sun4v/ds/Makefile new file mode 100644 index 0000000000..41c8351a4a --- /dev/null +++ b/usr/src/uts/sun4v/ds/Makefile @@ -0,0 +1,97 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the zp kernel module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ds +OBJECTS = $(DS_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(DS_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. 
+# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += -v + +# +# Turn on doubleword alignment for 64 bit registers +# +CFLAGS += -dalign + +# +# Module Dependencies +# +LDFLAGS += -dy -Nmisc/ldc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/fault_iso/Makefile b/usr/src/uts/sun4v/fault_iso/Makefile new file mode 100644 index 0000000000..37188fcfff --- /dev/null +++ b/usr/src/uts/sun4v/fault_iso/Makefile @@ -0,0 +1,97 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the fault_iso kernel module. 
+# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = fault_iso +OBJECTS = $(FAULT_ISO_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(FAULT_ISO_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += -v + +# +# Turn on doubleword alignment for 64 bit registers +# +CFLAGS += -dalign + +# +# Module Dependencies +# +LDFLAGS += -dy -Nmisc/ds + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/io/cnex.c b/usr/src/uts/sun4v/io/cnex.c new file mode 100644 index 0000000000..08a70cc810 --- /dev/null +++ b/usr/src/uts/sun4v/io/cnex.c @@ -0,0 +1,1133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Logical domain channel devices are devices implemented entirely + * in software; cnex is the nexus for channel-devices. They use + * the HV channel interfaces via the LDC transport module to send + * and receive data and to register callbacks. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/devops.h> +#include <sys/instance.h> +#include <sys/modctl.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/systm.h> +#include <sys/mkdev.h> +#include <sys/machsystm.h> +#include <sys/intr.h> +#include <sys/ddi_intr_impl.h> +#include <sys/ivintr.h> +#include <sys/hypervisor_api.h> +#include <sys/ldc.h> +#include <sys/cnex.h> +#include <sys/mach_descrip.h> + +/* + * Internal functions/information + */ +static struct cnex_pil_map cnex_class_to_pil[] = { + {LDC_DEV_GENERIC, PIL_3}, + {LDC_DEV_BLK, PIL_4}, + {LDC_DEV_BLK_SVC, PIL_3}, + {LDC_DEV_NT, PIL_6}, + {LDC_DEV_NT_SVC, PIL_4}, + {LDC_DEV_SERIAL, PIL_6} +}; +#define CNEX_MAX_DEVS (sizeof (cnex_class_to_pil) / \ + sizeof (cnex_class_to_pil[0])) + +#define SUN4V_REG_SPEC2CFG_HDL(x) ((x >> 32) & ~(0xfull << 28)) + +static hrtime_t cnex_pending_tmout = 2ull * NANOSEC; /* 2 secs in nsecs */ +static void *cnex_state; + +static void cnex_intr_redist(void *arg); +static uint_t cnex_intr_wrapper(caddr_t arg); + +/* + * Debug info + */ +#ifdef DEBUG + +/* + * Print debug messages + * + * set cnexdbg to 0xf for enabling all msgs + * 0x8 - Errors + * 0x4 - Warnings + * 0x2 - All debug 
messages + * 0x1 - Minimal debug messages + */ + +int cnexdbg = 0x8; + +static void +cnexdebug(const char *fmt, ...) +{ + char buf[512]; + va_list ap; + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + cmn_err(CE_CONT, "%s\n", buf); +} + +#define D1 \ +if (cnexdbg & 0x01) \ + cnexdebug + +#define D2 \ +if (cnexdbg & 0x02) \ + cnexdebug + +#define DWARN \ +if (cnexdbg & 0x04) \ + cnexdebug + +#define DERR \ +if (cnexdbg & 0x08) \ + cnexdebug + +#else + +#define D1 +#define D2 +#define DWARN +#define DERR + +#endif + +/* + * Config information + */ +static int cnex_attach(dev_info_t *, ddi_attach_cmd_t); +static int cnex_detach(dev_info_t *, ddi_detach_cmd_t); +static int cnex_open(dev_t *, int, int, cred_t *); +static int cnex_close(dev_t, int, int, cred_t *); +static int cnex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int cnex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, + void *); + +static struct bus_ops cnex_bus_ops = { + BUSO_REV, + nullbusmap, /* bus_map */ + NULL, /* bus_get_intrspec */ + NULL, /* bus_add_intrspec */ + NULL, /* bus_remove_intrspec */ + i_ddi_map_fault, /* bus_map_fault */ + ddi_no_dma_map, /* bus_dma_map */ + ddi_no_dma_allochdl, /* bus_dma_allochdl */ + NULL, /* bus_dma_freehdl */ + NULL, /* bus_dma_bindhdl */ + NULL, /* bus_dma_unbindhdl */ + NULL, /* bus_dma_flush */ + NULL, /* bus_dma_win */ + NULL, /* bus_dma_ctl */ + cnex_ctl, /* bus_ctl */ + ddi_bus_prop_op, /* bus_prop_op */ + 0, /* bus_get_eventcookie */ + 0, /* bus_add_eventcall */ + 0, /* bus_remove_eventcall */ + 0, /* bus_post_event */ + NULL, /* bus_intr_ctl */ + NULL, /* bus_config */ + NULL, /* bus_unconfig */ + NULL, /* bus_fm_init */ + NULL, /* bus_fm_fini */ + NULL, /* bus_fm_access_enter */ + NULL, /* bus_fm_access_exit */ + NULL, /* bus_power */ + NULL /* bus_intr_op */ +}; + +static struct cb_ops cnex_cb_ops = { + cnex_open, /* open */ + cnex_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump 
*/ + nodev, /* read */ + nodev, /* write */ + cnex_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_MP | D_NEW | D_HOTPLUG /* Driver compatibility flag */ +}; + +static struct dev_ops cnex_ops = { + DEVO_REV, /* devo_rev, */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + cnex_attach, /* attach */ + cnex_detach, /* detach */ + nodev, /* reset */ + &cnex_cb_ops, /* driver operations */ + &cnex_bus_ops, /* bus operations */ + nulldev /* power */ +}; + +/* + * Module linkage information for the kernel. + */ +static struct modldrv modldrv = { + &mod_driverops, + "sun4v channel-devices nexus driver v%I%", + &cnex_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modldrv, NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&cnex_state, + sizeof (cnex_soft_state_t), 0)) != 0) { + return (err); + } + if ((err = mod_install(&modlinkage)) != 0) { + ddi_soft_state_fini(&cnex_state); + return (err); + } + return (0); +} + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) + return (err); + ddi_soft_state_fini(&cnex_state); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Callback function invoked by the interrupt redistribution + * framework. This will redirect interrupts at CPUs that are + * currently available in the system. + */ +static void +cnex_intr_redist(void *arg) +{ + cnex_ldc_t *cldcp; + cnex_soft_state_t *cnex_ssp = arg; + int intr_state; + hrtime_t start; + uint64_t cpuid; + int rv; + + ASSERT(cnex_ssp != NULL); + mutex_enter(&cnex_ssp->clist_lock); + + cldcp = cnex_ssp->clist; + while (cldcp != NULL) { + + mutex_enter(&cldcp->lock); + + if (cldcp->tx.hdlr) { + /* + * Don't do anything for disabled interrupts. 
+ */ + rv = hvldc_intr_getvalid(cnex_ssp->cfghdl, + cldcp->tx.ino, &intr_state); + if (rv) { + DWARN("cnex_intr_redist: tx ino=0x%llx, " + "can't get valid\n", cldcp->tx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + if (intr_state == HV_INTR_NOTVALID) { + cldcp = cldcp->next; + continue; + } + + cpuid = intr_dist_cpuid(); + + /* disable interrupts */ + rv = hvldc_intr_setvalid(cnex_ssp->cfghdl, + cldcp->tx.ino, HV_INTR_NOTVALID); + if (rv) { + DWARN("cnex_intr_redist: tx ino=0x%llx, " + "can't set valid\n", cldcp->tx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + + /* + * Make a best effort to wait for pending interrupts + * to finish. There is not much we can do if we timeout. + */ + start = gethrtime(); + + do { + rv = hvldc_intr_getstate(cnex_ssp->cfghdl, + cldcp->tx.ino, &intr_state); + if (rv) { + DWARN("cnex_intr_redist: tx ino=0x%llx," + "can't get state\n", cldcp->tx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + + if ((gethrtime() - start) > cnex_pending_tmout) + break; + + } while (!panicstr && + intr_state == HV_INTR_DELIVERED_STATE); + + (void) hvldc_intr_settarget(cnex_ssp->cfghdl, + cldcp->tx.ino, cpuid); + (void) hvldc_intr_setvalid(cnex_ssp->cfghdl, + cldcp->tx.ino, HV_INTR_VALID); + } + + if (cldcp->rx.hdlr) { + /* + * Don't do anything for disabled interrupts. 
+ */ + rv = hvldc_intr_getvalid(cnex_ssp->cfghdl, + cldcp->rx.ino, &intr_state); + if (rv) { + DWARN("cnex_intr_redist: rx ino=0x%llx, " + "can't get valid\n", cldcp->rx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + if (intr_state == HV_INTR_NOTVALID) { + cldcp = cldcp->next; + continue; + } + + cpuid = intr_dist_cpuid(); + + /* disable interrupts */ + rv = hvldc_intr_setvalid(cnex_ssp->cfghdl, + cldcp->rx.ino, HV_INTR_NOTVALID); + if (rv) { + DWARN("cnex_intr_redist: rx ino=0x%llx, " + "can't set valid\n", cldcp->rx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + + /* + * Make a best effort to wait for pending interrupts + * to finish. There is not much we can do if we timeout. + */ + start = gethrtime(); + + do { + rv = hvldc_intr_getstate(cnex_ssp->cfghdl, + cldcp->rx.ino, &intr_state); + if (rv) { + DWARN("cnex_intr_redist: rx ino=0x%llx," + "can't set state\n", cldcp->rx.ino); + mutex_exit(&cldcp->lock); + mutex_exit(&cnex_ssp->clist_lock); + return; + } + + if ((gethrtime() - start) > cnex_pending_tmout) + break; + + } while (!panicstr && + intr_state == HV_INTR_DELIVERED_STATE); + + (void) hvldc_intr_settarget(cnex_ssp->cfghdl, + cldcp->rx.ino, cpuid); + (void) hvldc_intr_setvalid(cnex_ssp->cfghdl, + cldcp->rx.ino, HV_INTR_VALID); + } + + mutex_exit(&cldcp->lock); + + /* next channel */ + cldcp = cldcp->next; + } + + mutex_exit(&cnex_ssp->clist_lock); +} + +/* + * Exported interface to register a LDC endpoint with + * the channel nexus + */ +static int +cnex_reg_chan(dev_info_t *dip, uint64_t id, ldc_dev_t devclass) +{ + int idx; + cnex_ldc_t *cldcp; + int listsz, num_nodes, num_channels; + md_t *mdp = NULL; + mde_cookie_t rootnode, *listp = NULL; + uint64_t tmp_id, rxino, txino; + cnex_soft_state_t *cnex_ssp; + int status, instance; + + /* Get device instance and structure */ + instance = ddi_get_instance(dip); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* Check 
to see if channel is already registered */ + mutex_enter(&cnex_ssp->clist_lock); + cldcp = cnex_ssp->clist; + while (cldcp) { + if (cldcp->id == id) { + DWARN("cnex_reg_chan: channel 0x%llx exists\n", id); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + cldcp = cldcp->next; + } + + /* Get the Tx/Rx inos from the MD */ + if ((mdp = md_get_handle()) == NULL) { + DWARN("cnex_reg_chan: cannot init MD\n"); + mutex_exit(&cnex_ssp->clist_lock); + return (ENXIO); + } + num_nodes = md_node_count(mdp); + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP); + + rootnode = md_root_node(mdp); + + /* search for all channel_endpoint nodes */ + num_channels = md_scan_dag(mdp, rootnode, + md_find_name(mdp, "channel-endpoint"), + md_find_name(mdp, "fwd"), listp); + if (num_channels <= 0) { + DWARN("cnex_reg_chan: invalid channel id\n"); + kmem_free(listp, listsz); + (void) md_fini_handle(mdp); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + + for (idx = 0; idx < num_channels; idx++) { + + /* Get the channel ID */ + status = md_get_prop_val(mdp, listp[idx], "id", &tmp_id); + if (status) { + DWARN("cnex_reg_chan: cannot read LDC ID\n"); + kmem_free(listp, listsz); + (void) md_fini_handle(mdp); + mutex_exit(&cnex_ssp->clist_lock); + return (ENXIO); + } + if (tmp_id != id) + continue; + + /* Get the Tx and Rx ino */ + status = md_get_prop_val(mdp, listp[idx], "tx-ino", &txino); + if (status) { + DWARN("cnex_reg_chan: cannot read Tx ino\n"); + kmem_free(listp, listsz); + (void) md_fini_handle(mdp); + mutex_exit(&cnex_ssp->clist_lock); + return (ENXIO); + } + status = md_get_prop_val(mdp, listp[idx], "rx-ino", &rxino); + if (status) { + DWARN("cnex_reg_chan: cannot read Rx ino\n"); + kmem_free(listp, listsz); + (void) md_fini_handle(mdp); + mutex_exit(&cnex_ssp->clist_lock); + return (ENXIO); + } + } + kmem_free(listp, listsz); + (void) md_fini_handle(mdp); + + /* Allocate a new channel 
structure */ + cldcp = kmem_zalloc(sizeof (*cldcp), KM_SLEEP); + + /* Initialize the channel */ + mutex_init(&cldcp->lock, NULL, MUTEX_DRIVER, NULL); + + cldcp->id = id; + cldcp->tx.ino = txino; + cldcp->rx.ino = rxino; + cldcp->devclass = devclass; + + /* add channel to nexus channel list */ + cldcp->next = cnex_ssp->clist; + cnex_ssp->clist = cldcp; + + mutex_exit(&cnex_ssp->clist_lock); + + return (0); +} + +/* + * Add Tx/Rx interrupt handler for the channel + */ +static int +cnex_add_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype, + uint_t (*hdlr)(), caddr_t arg1, caddr_t arg2) +{ + int rv, idx, pil; + cnex_ldc_t *cldcp; + cnex_intr_t *iinfo; + uint64_t cpuid; + cnex_soft_state_t *cnex_ssp; + int instance; + + /* Get device instance and structure */ + instance = ddi_get_instance(dip); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* get channel info */ + mutex_enter(&cnex_ssp->clist_lock); + cldcp = cnex_ssp->clist; + while (cldcp) { + if (cldcp->id == id) + break; + cldcp = cldcp->next; + } + if (cldcp == NULL) { + DWARN("cnex_add_intr: channel 0x%llx does not exist\n", id); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + mutex_exit(&cnex_ssp->clist_lock); + + /* get channel lock */ + mutex_enter(&cldcp->lock); + + /* get interrupt type */ + if (itype == CNEX_TX_INTR) { + iinfo = &(cldcp->tx); + } else if (itype == CNEX_RX_INTR) { + iinfo = &(cldcp->rx); + } else { + DWARN("cnex_add_intr: invalid interrupt type\n", id); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + /* check if a handler is already added */ + if (iinfo->hdlr != 0) { + DWARN("cnex_add_intr: interrupt handler exists\n"); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + /* save interrupt handler info */ + iinfo->hdlr = hdlr; + iinfo->arg1 = arg1; + iinfo->arg2 = arg2; + + iinfo->ssp = cnex_ssp; + + /* + * FIXME - generate the interrupt cookie + * using the interrupt registry + */ + iinfo->icookie = cnex_ssp->cfghdl | iinfo->ino; + + 
D1("cnex_add_intr: add hdlr, cfghdl=0x%llx, ino=0x%llx, " + "cookie=0x%llx\n", cnex_ssp->cfghdl, iinfo->ino, iinfo->icookie); + + /* Pick a PIL on the basis of the channel's devclass */ + for (idx = 0, pil = PIL_3; idx < CNEX_MAX_DEVS; idx++) { + if (cldcp->devclass == cnex_class_to_pil[idx].devclass) { + pil = cnex_class_to_pil[idx].pil; + break; + } + } + + /* add interrupt to solaris ivec table */ + VERIFY(add_ivintr(iinfo->icookie, pil, cnex_intr_wrapper, + (caddr_t)iinfo, NULL) == 0); + + /* set the cookie in the HV */ + rv = hvldc_intr_setcookie(cnex_ssp->cfghdl, iinfo->ino, iinfo->icookie); + + /* pick next CPU in the domain for this channel */ + cpuid = intr_dist_cpuid(); + + /* set the target CPU and then enable interrupts */ + rv = hvldc_intr_settarget(cnex_ssp->cfghdl, iinfo->ino, cpuid); + if (rv) { + DWARN("cnex_add_intr: ino=0x%llx, cannot set target cpu\n", + iinfo->ino); + goto hv_error; + } + rv = hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino, + HV_INTR_IDLE_STATE); + if (rv) { + DWARN("cnex_add_intr: ino=0x%llx, cannot set state\n", + iinfo->ino); + goto hv_error; + } + rv = hvldc_intr_setvalid(cnex_ssp->cfghdl, iinfo->ino, HV_INTR_VALID); + if (rv) { + DWARN("cnex_add_intr: ino=0x%llx, cannot set valid\n", + iinfo->ino); + goto hv_error; + } + + mutex_exit(&cldcp->lock); + return (0); + +hv_error: + (void) rem_ivintr(iinfo->icookie, NULL); + mutex_exit(&cldcp->lock); + return (ENXIO); +} + + +/* + * Exported interface to unregister a LDC endpoint with + * the channel nexus + */ +static int +cnex_unreg_chan(dev_info_t *dip, uint64_t id) +{ + cnex_ldc_t *cldcp, *prev_cldcp; + cnex_soft_state_t *cnex_ssp; + int instance; + + /* Get device instance and structure */ + instance = ddi_get_instance(dip); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* find and remove channel from list */ + mutex_enter(&cnex_ssp->clist_lock); + prev_cldcp = NULL; + cldcp = cnex_ssp->clist; + while (cldcp) { + if (cldcp->id == id) + break; + prev_cldcp = 
cldcp; + cldcp = cldcp->next; + } + + if (cldcp == 0) { + DWARN("cnex_unreg_chan: invalid channel %d\n", id); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + + if (cldcp->tx.hdlr || cldcp->rx.hdlr) { + DWARN("cnex_unreg_chan: handlers still exist\n"); + mutex_exit(&cnex_ssp->clist_lock); + return (ENXIO); + } + + if (prev_cldcp) + prev_cldcp->next = cldcp->next; + else + cnex_ssp->clist = cldcp->next; + + mutex_exit(&cnex_ssp->clist_lock); + + /* destroy mutex */ + mutex_destroy(&cldcp->lock); + + /* free channel */ + kmem_free(cldcp, sizeof (*cldcp)); + + return (0); +} + +/* + * Remove Tx/Rx interrupt handler for the channel + */ +static int +cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype) +{ + int rv; + cnex_ldc_t *cldcp; + cnex_intr_t *iinfo; + cnex_soft_state_t *cnex_ssp; + hrtime_t start; + int instance, istate; + + /* Get device instance and structure */ + instance = ddi_get_instance(dip); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* get channel info */ + mutex_enter(&cnex_ssp->clist_lock); + cldcp = cnex_ssp->clist; + while (cldcp) { + if (cldcp->id == id) + break; + cldcp = cldcp->next; + } + if (cldcp == NULL) { + DWARN("cnex_rem_intr: channel 0x%llx does not exist\n", id); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + mutex_exit(&cnex_ssp->clist_lock); + + /* get rid of the channel intr handler */ + mutex_enter(&cldcp->lock); + + /* get interrupt type */ + if (itype == CNEX_TX_INTR) { + iinfo = &(cldcp->tx); + } else if (itype == CNEX_RX_INTR) { + iinfo = &(cldcp->rx); + } else { + DWARN("cnex_rem_intr: invalid interrupt type\n"); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + D1("cnex_rem_intr: interrupt ino=0x%x\n", iinfo->ino); + + /* check if a handler is already added */ + if (iinfo->hdlr == 0) { + DWARN("cnex_rem_intr: interrupt handler does not exist\n"); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + D1("cnex_rem_intr: set intr to invalid ino=0x%x\n", iinfo->ino); + 
rv = hvldc_intr_setvalid(cnex_ssp->cfghdl, + iinfo->ino, HV_INTR_NOTVALID); + if (rv) { + DWARN("cnex_rem_intr: cannot set valid ino=%x\n", iinfo->ino); + mutex_exit(&cldcp->lock); + return (ENXIO); + } + + /* + * Make a best effort to wait for pending interrupts + * to finish. There is not much we can do if we timeout. + */ + start = gethrtime(); + do { + rv = hvldc_intr_getstate(cnex_ssp->cfghdl, iinfo->ino, &istate); + if (rv) { + DWARN("cnex_rem_intr: ino=0x%llx, cannot get state\n", + iinfo->ino); + } + + if (rv || ((gethrtime() - start) > cnex_pending_tmout)) + break; + + } while (!panicstr && istate == HV_INTR_DELIVERED_STATE); + + /* if interrupts are still pending print warning */ + if (istate != HV_INTR_IDLE_STATE) { + DWARN("cnex_rem_intr: cannot remove intr busy ino=%x\n", + iinfo->ino); + /* clear interrupt state */ + (void) hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino, + HV_INTR_IDLE_STATE); + } + + /* remove interrupt */ + rem_ivintr(iinfo->icookie, NULL); + + /* clear interrupt info */ + bzero(iinfo, sizeof (*iinfo)); + + mutex_exit(&cldcp->lock); + + return (0); +} + + +/* + * Clear pending Tx/Rx interrupt + */ +static int +cnex_clr_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype) +{ + int rv; + cnex_ldc_t *cldcp; + cnex_intr_t *iinfo; + cnex_soft_state_t *cnex_ssp; + int instance; + + /* Get device instance and structure */ + instance = ddi_get_instance(dip); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* get channel info */ + mutex_enter(&cnex_ssp->clist_lock); + cldcp = cnex_ssp->clist; + while (cldcp) { + if (cldcp->id == id) + break; + cldcp = cldcp->next; + } + if (cldcp == NULL) { + DWARN("cnex_clr_intr: channel 0x%llx does not exist\n", id); + mutex_exit(&cnex_ssp->clist_lock); + return (EINVAL); + } + mutex_exit(&cnex_ssp->clist_lock); + + mutex_enter(&cldcp->lock); + + /* get interrupt type */ + if (itype == CNEX_TX_INTR) { + iinfo = &(cldcp->tx); + } else if (itype == CNEX_RX_INTR) { + iinfo = &(cldcp->rx); + 
} else { + DWARN("cnex_rem_intr: invalid interrupt type\n"); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + D1("cnex_rem_intr: interrupt ino=0x%x\n", iinfo->ino); + + /* check if a handler is already added */ + if (iinfo->hdlr == 0) { + DWARN("cnex_clr_intr: interrupt handler does not exist\n"); + mutex_exit(&cldcp->lock); + return (EINVAL); + } + + rv = hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino, + HV_INTR_IDLE_STATE); + if (rv) { + DWARN("cnex_intr_wrapper: cannot clear interrupt state\n"); + } + + mutex_exit(&cldcp->lock); + + return (0); +} + +/* + * Channel nexus interrupt handler wrapper + */ +static uint_t +cnex_intr_wrapper(caddr_t arg) +{ + int res; + uint_t (*handler)(); + caddr_t handler_arg1; + caddr_t handler_arg2; + cnex_intr_t *iinfo = (cnex_intr_t *)arg; + + ASSERT(iinfo != NULL); + + handler = iinfo->hdlr; + handler_arg1 = iinfo->arg1; + handler_arg2 = iinfo->arg2; + + D1("cnex_intr_wrapper: ino=0x%llx invoke client handler\n", iinfo->ino); + res = (*handler)(handler_arg1, handler_arg2); + + return (res); +} + +/*ARGSUSED*/ +static int +cnex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int rv, instance, reglen; + cnex_regspec_t *reg_p; + ldc_cnex_t cinfo; + cnex_soft_state_t *cnex_ssp; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + /* + * Get the instance specific soft state structure. + * Save the devi for this instance in the soft_state data. 
+ */ + instance = ddi_get_instance(devi); + if (ddi_soft_state_zalloc(cnex_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + cnex_ssp->devi = devi; + cnex_ssp->clist = NULL; + + if (ddi_getlongprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, + "reg", (caddr_t)®_p, ®len) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + /* get the sun4v config handle for this device */ + cnex_ssp->cfghdl = SUN4V_REG_SPEC2CFG_HDL(reg_p->physaddr); + kmem_free(reg_p, reglen); + + D1("cnex_attach: cfghdl=0x%llx\n", cnex_ssp->cfghdl); + + /* init channel list mutex */ + mutex_init(&cnex_ssp->clist_lock, NULL, MUTEX_DRIVER, NULL); + + /* Register with LDC module */ + cinfo.dip = devi; + cinfo.reg_chan = cnex_reg_chan; + cinfo.unreg_chan = cnex_unreg_chan; + cinfo.add_intr = cnex_add_intr; + cinfo.rem_intr = cnex_rem_intr; + cinfo.clr_intr = cnex_clr_intr; + + /* + * LDC register will fail if an nexus instance had already + * registered with the LDC framework + */ + rv = ldc_register(&cinfo); + if (rv) { + DWARN("cnex_attach: unable to register with LDC\n"); + ddi_soft_state_free(cnex_state, instance); + mutex_destroy(&cnex_ssp->clist_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, + DDI_NT_NEXUS, 0) != DDI_SUCCESS) { + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_free(cnex_state, instance); + mutex_destroy(&cnex_ssp->clist_lock); + return (DDI_FAILURE); + } + + /* Add interrupt redistribution callback. 
*/ + intr_dist_add(cnex_intr_redist, cnex_ssp); + + ddi_report_dev(devi); + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +cnex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + int instance; + ldc_cnex_t cinfo; + cnex_soft_state_t *cnex_ssp; + + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + instance = ddi_get_instance(devi); + cnex_ssp = ddi_get_soft_state(cnex_state, instance); + + /* check if there are any channels still registered */ + if (cnex_ssp->clist) { + cmn_err(CE_WARN, "?cnex_dettach: channels registered %d\n", + ddi_get_instance(devi)); + return (DDI_FAILURE); + } + + /* Unregister with LDC module */ + cinfo.dip = devi; + (void) ldc_unregister(&cinfo); + + /* Remove interrupt redistribution callback. */ + intr_dist_rem(cnex_intr_redist, cnex_ssp); + + /* destroy mutex */ + mutex_destroy(&cnex_ssp->clist_lock); + + /* free soft state structure */ + ddi_soft_state_free(cnex_state, instance); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +cnex_open(dev_t *devp, int flags, int otyp, cred_t *credp) +{ + int instance; + + if (otyp != OTYP_CHR) + return (EINVAL); + + instance = getminor(*devp); + if (ddi_get_soft_state(cnex_state, instance) == NULL) + return (ENXIO); + + return (0); +} + +/*ARGSUSED*/ +static int +cnex_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + int instance; + + if (otyp != OTYP_CHR) + return (EINVAL); + + instance = getminor(dev); + if (ddi_get_soft_state(cnex_state, instance) == NULL) + return (ENXIO); + + return (0); +} + +/*ARGSUSED*/ +static int +cnex_ioctl(dev_t dev, + int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_p) +{ + int instance; + cnex_soft_state_t *cnex_ssp; + + instance = getminor(dev); + if ((cnex_ssp = ddi_get_soft_state(cnex_state, instance)) == NULL) + return (ENXIO); + ASSERT(cnex_ssp->devi); + return (ndi_devctl_ioctl(cnex_ssp->devi, cmd, arg, mode, 0)); +} + +static int +cnex_ctl(dev_info_t 
*dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, + void *arg, void *result) +{ + char name[MAXNAMELEN]; + uint32_t reglen; + int *cnex_regspec; + + switch (ctlop) { + case DDI_CTLOPS_REPORTDEV: + if (rdip == NULL) + return (DDI_FAILURE); + cmn_err(CE_CONT, "?channel-device: %s%d\n", + ddi_driver_name(rdip), ddi_get_instance(rdip)); + return (DDI_SUCCESS); + + case DDI_CTLOPS_INITCHILD: + { + dev_info_t *child = (dev_info_t *)arg; + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, child, + DDI_PROP_DONTPASS, "reg", + &cnex_regspec, ®len) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + (void) snprintf(name, sizeof (name), "%x", *cnex_regspec); + ddi_set_name_addr(child, name); + ddi_set_parent_data(child, NULL); + ddi_prop_free(cnex_regspec); + return (DDI_SUCCESS); + } + + case DDI_CTLOPS_UNINITCHILD: + { + dev_info_t *child = (dev_info_t *)arg; + + NDI_CONFIG_DEBUG((CE_NOTE, + "DDI_CTLOPS_UNINITCHILD(%s, instance=%d)", + ddi_driver_name(child), DEVI(child)->devi_instance)); + + ddi_set_name_addr(child, NULL); + + return (DDI_SUCCESS); + } + + case DDI_CTLOPS_DMAPMAPC: + case DDI_CTLOPS_REPORTINT: + case DDI_CTLOPS_REGSIZE: + case DDI_CTLOPS_NREGS: + case DDI_CTLOPS_SIDDEV: + case DDI_CTLOPS_SLAVEONLY: + case DDI_CTLOPS_AFFINITY: + case DDI_CTLOPS_POKE: + case DDI_CTLOPS_PEEK: + /* + * These ops correspond to functions that "shouldn't" be called + * by a channel-device driver. So we whine when we're called. + */ + cmn_err(CE_WARN, "%s%d: invalid op (%d) from %s%d\n", + ddi_driver_name(dip), ddi_get_instance(dip), ctlop, + ddi_driver_name(rdip), ddi_get_instance(rdip)); + return (DDI_FAILURE); + + case DDI_CTLOPS_ATTACH: + case DDI_CTLOPS_BTOP: + case DDI_CTLOPS_BTOPR: + case DDI_CTLOPS_DETACH: + case DDI_CTLOPS_DVMAPAGESIZE: + case DDI_CTLOPS_IOMIN: + case DDI_CTLOPS_POWER: + case DDI_CTLOPS_PTOB: + default: + /* + * Everything else (e.g. 
PTOB/BTOP/BTOPR requests) we pass up + */ + return (ddi_ctlops(dip, rdip, ctlop, arg, result)); + } +} + +/* -------------------------------------------------------------------------- */ diff --git a/usr/src/uts/sun4v/io/dr_cpu.c b/usr/src/uts/sun4v/io/dr_cpu.c new file mode 100644 index 0000000000..a66f6610bd --- /dev/null +++ b/usr/src/uts/sun4v/io/dr_cpu.c @@ -0,0 +1,1151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v CPU DR Module + */ + +#include <sys/modctl.h> +#include <sys/processor.h> +#include <sys/cpuvar.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/note.h> +#include <sys/sysevent/dr.h> +#include <sys/hypervisor_api.h> +#include <sys/mach_descrip.h> +#include <sys/mdesc.h> +#include <sys/ds.h> +#include <sys/dr_util.h> +#include <sys/dr_cpu.h> +#include <sys/promif.h> +#include <sys/machsystm.h> + + +static struct modlmisc modlmisc = { + &mod_miscops, + "sun4v CPU DR %I%" +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlmisc, + NULL +}; + +/* + * Global DS Handle + */ +static ds_svc_hdl_t ds_handle; + +/* + * Supported DS Capability Versions + */ +static ds_ver_t dr_cpu_vers[] = { { 1, 0 } }; +#define DR_CPU_NVERS (sizeof (dr_cpu_vers) / sizeof (dr_cpu_vers[0])) + +/* + * DS Capability Description + */ +static ds_capability_t dr_cpu_cap = { + DR_CPU_DS_ID, /* svc_id */ + dr_cpu_vers, /* vers */ + DR_CPU_NVERS /* nvers */ +}; + +/* + * DS Callbacks + */ +static void dr_cpu_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t); +static void dr_cpu_unreg_handler(ds_cb_arg_t arg); +static void dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen); + +/* + * DS Client Ops Vector + */ +static ds_clnt_ops_t dr_cpu_ops = { + dr_cpu_reg_handler, /* ds_reg_cb */ + dr_cpu_unreg_handler, /* ds_unreg_cb */ + dr_cpu_data_handler, /* ds_data_cb */ + NULL /* cb_arg */ +}; + +/* + * Internal Functions + */ +static int dr_cpu_init(void); +static int dr_cpu_fini(void); + +static int dr_cpu_list_configure(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *); +static int dr_cpu_list_unconfigure(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *); +static int dr_cpu_list_status(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *); + +static int dr_cpu_unconfigure(processorid_t, int *status, boolean_t force); +static int dr_cpu_configure(processorid_t, int *status); +static int dr_cpu_status(processorid_t, int *status); 
+ +static int dr_cpu_probe(processorid_t newcpuid); +static int dr_cpu_deprobe(processorid_t cpuid); + +static dev_info_t *dr_cpu_find_node(processorid_t cpuid); +static mde_cookie_t dr_cpu_find_node_md(processorid_t, md_t *, mde_cookie_t *); + + +int +_init(void) +{ + int status; + + /* check that CPU DR is enabled */ + if (dr_is_disabled(DR_TYPE_CPU)) { + cmn_err(CE_CONT, "!CPU DR is disabled\n"); + return (-1); + } + + if ((status = dr_cpu_init()) != 0) { + cmn_err(CE_NOTE, "CPU DR initialization failed"); + return (status); + } + + if ((status = mod_install(&modlinkage)) != 0) { + (void) dr_cpu_fini(); + } + + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int dr_cpu_allow_unload; + +int +_fini(void) +{ + int status; + + if (dr_cpu_allow_unload == 0) + return (EBUSY); + + if ((status = mod_remove(&modlinkage)) == 0) { + (void) dr_cpu_fini(); + } + + return (status); +} + +static int +dr_cpu_init(void) +{ + int rv; + + if ((rv = ds_cap_init(&dr_cpu_cap, &dr_cpu_ops)) != 0) { + cmn_err(CE_NOTE, "ds_cap_init failed: %d", rv); + return (-1); + } + + return (0); +} + +static int +dr_cpu_fini(void) +{ + int rv; + + if ((rv = ds_cap_fini(&dr_cpu_cap)) != 0) { + cmn_err(CE_NOTE, "ds_cap_fini failed: %d", rv); + return (-1); + } + + return (0); +} + +static void +dr_cpu_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl) +{ + DR_DBG_CPU("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg, + ver->major, ver->minor, hdl); + + ds_handle = hdl; +} + +static void +dr_cpu_unreg_handler(ds_cb_arg_t arg) +{ + DR_DBG_CPU("unreg_handler: arg=0x%p\n", arg); + + ds_handle = DS_INVALID_HDL; +} + +static void +dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + _NOTE(ARGUNUSED(arg)) + + dr_cpu_hdr_t *req = buf; + dr_cpu_hdr_t err_resp; + dr_cpu_hdr_t *resp = &err_resp; + int resp_len = 0; + int rv; + + /* + * Sanity check the message + */ + if (buflen < sizeof (dr_cpu_hdr_t)) { + 
DR_DBG_CPU("incoming message short: expected at least %ld " + "bytes, received %ld\n", sizeof (dr_cpu_hdr_t), buflen); + goto done; + } + + if (req == NULL) { + DR_DBG_CPU("empty message: expected at least %ld bytes\n", + sizeof (dr_cpu_hdr_t)); + goto done; + } + + DR_DBG_CPU("incoming request:\n"); + DR_DBG_DUMP_MSG(buf, buflen); + + if (req->num_records > NCPU) { + DR_DBG_CPU("CPU list too long: %d when %d is the maximum\n", + req->num_records, NCPU); + goto done; + } + + if (req->num_records == 0) { + DR_DBG_CPU("No CPU specified for operation\n"); + goto done; + } + + /* + * Process the command + */ + switch (req->msg_type) { + case DR_CPU_CONFIGURE: + if ((rv = dr_cpu_list_configure(req, &resp, &resp_len)) != 0) + DR_DBG_CPU("dr_cpu_list_configure failed (%d)\n", rv); + break; + + case DR_CPU_UNCONFIGURE: + case DR_CPU_FORCE_UNCONFIG: + if ((rv = dr_cpu_list_unconfigure(req, &resp, &resp_len)) != 0) + DR_DBG_CPU("dr_cpu_list_unconfigure failed (%d)\n", rv); + break; + + case DR_CPU_STATUS: + if ((rv = dr_cpu_list_status(req, &resp, &resp_len)) != 0) + DR_DBG_CPU("dr_cpu_list_status failed (%d)\n", rv); + break; + + default: + cmn_err(CE_NOTE, "unsupported DR operation (%d)", + req->msg_type); + break; + } + +done: + /* check if an error occurred */ + if (resp == &err_resp) { + resp->req_num = (req) ? req->req_num : 0; + resp->msg_type = DR_CPU_ERROR; + resp->num_records = 0; + resp_len = sizeof (dr_cpu_hdr_t); + } + + /* send back the response */ + if (ds_cap_send(ds_handle, resp, resp_len) != 0) { + DR_DBG_CPU("ds_send failed\n"); + } + + /* free any allocated memory */ + if (resp != &err_resp) { + kmem_free(resp, resp_len); + } +} + +/* + * Do not modify result buffer or length on error. 
+ */ +static int +dr_cpu_list_configure(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len) +{ + int idx; + int result; + int status; + int rlen; + uint32_t *cpuids; + dr_cpu_hdr_t *rp; + dr_cpu_stat_t *stat; + + /* the incoming array of cpuids to configure */ + cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t)); + + /* allocate a response message */ + rlen = sizeof (dr_cpu_hdr_t); + rlen += req->num_records * sizeof (dr_cpu_stat_t); + rp = kmem_zalloc(rlen, KM_SLEEP); + + /* fill in the known data */ + rp->req_num = req->req_num; + rp->msg_type = DR_CPU_OK; + rp->num_records = req->num_records; + + /* stat array for the response */ + stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t)); + + /* configure each of the CPUs */ + for (idx = 0; idx < req->num_records; idx++) { + + result = dr_cpu_configure(cpuids[idx], &status); + + /* save off results of the configure */ + stat[idx].cpuid = cpuids[idx]; + stat[idx].result = result; + stat[idx].status = status; + } + + *resp = rp; + *resp_len = rlen; + + dr_generate_event(DR_TYPE_CPU, SE_HINT_INSERT); + + return (0); +} + +static void +dr_cpu_check_cpus(uint32_t *cpuids, int ncpus, dr_cpu_stat_t *stat) +{ + int idx; + kthread_t *tp; + proc_t *pp; + + DR_DBG_CPU("dr_cpu_check_cpus...\n"); + + mutex_enter(&cpu_lock); + + /* process each cpu that is part of the request */ + for (idx = 0; idx < ncpus; idx++) { + + if (cpu_get(cpuids[idx]) == NULL) + continue; + + mutex_enter(&pidlock); + + /* + * Walk the active processes, checking if each + * thread belonging to the process is bound. 
+ */ + for (pp = practive; pp != NULL; pp = pp->p_next) { + mutex_enter(&pp->p_lock); + tp = pp->p_tlist; + + if (tp == NULL || (pp->p_flag & SSYS)) { + mutex_exit(&pp->p_lock); + continue; + } + + do { + if (tp->t_bind_cpu != cpuids[idx]) + continue; + + DR_DBG_CPU("thread(s) bound to cpu %d\n", + cpuids[idx]); + + stat[idx].cpuid = cpuids[idx]; + stat[idx].result = DR_CPU_RES_BLOCKED; + stat[idx].status = DR_CPU_STAT_CONFIGURED; + break; + + } while ((tp = tp->t_forw) != pp->p_tlist); + mutex_exit(&pp->p_lock); + } + + mutex_exit(&pidlock); + } + + mutex_exit(&cpu_lock); +} + +/* + * Do not modify result buffer or length on error. + */ +static int +dr_cpu_list_unconfigure(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len) +{ + int idx; + int result; + int status; + int rlen; + uint32_t *cpuids; + dr_cpu_hdr_t *rp; + dr_cpu_stat_t *stat; + boolean_t force; + + /* the incoming array of cpuids to configure */ + cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t)); + + /* check if this is a forced unconfigured */ + force = (req->msg_type == DR_CPU_FORCE_UNCONFIG) ? B_TRUE : B_FALSE; + + /* allocate a response message */ + rlen = sizeof (dr_cpu_hdr_t); + rlen += req->num_records * sizeof (dr_cpu_stat_t); + rp = kmem_zalloc(rlen, KM_SLEEP); + + /* fill in the known data */ + rp->req_num = req->req_num; + rp->msg_type = DR_CPU_OK; + rp->num_records = req->num_records; + + /* stat array for the response */ + stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t)); + + /* + * If the operation is not a forced unconfigure, + * perform secondary checks for things that would + * prevent an operation. 
+ */ + if (!force) + dr_cpu_check_cpus(cpuids, req->num_records, stat); + + /* unconfigure each of the CPUs */ + for (idx = 0; idx < req->num_records; idx++) { + + /* skip this cpu if it is already marked as blocked */ + if (stat[idx].result == DR_CPU_RES_BLOCKED) + continue; + + result = dr_cpu_unconfigure(cpuids[idx], &status, force); + + /* save off results of the unconfigure */ + stat[idx].cpuid = cpuids[idx]; + stat[idx].result = result; + stat[idx].status = status; + } + + *resp = rp; + *resp_len = rlen; + + dr_generate_event(DR_TYPE_CPU, SE_HINT_REMOVE); + + return (0); +} + +/* + * Do not modify result buffer or length on error. + */ +static int +dr_cpu_list_status(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len) +{ + int idx; + int result; + int status; + int rlen; + uint32_t *cpuids; + dr_cpu_hdr_t *rp; + dr_cpu_stat_t *stat; + md_t *mdp = NULL; + int num_nodes; + int listsz; + mde_cookie_t *listp = NULL; + mde_cookie_t cpunode; + boolean_t walk_md = B_FALSE; + + /* the incoming array of cpuids to configure */ + cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t)); + + /* allocate a response message */ + rlen = sizeof (dr_cpu_hdr_t); + rlen += req->num_records * sizeof (dr_cpu_stat_t); + rp = kmem_zalloc(rlen, KM_SLEEP); + + /* fill in the known data */ + rp->req_num = req->req_num; + rp->msg_type = DR_CPU_STATUS; + rp->num_records = req->num_records; + + /* stat array for the response */ + stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t)); + + /* get the status for each of the CPUs */ + for (idx = 0; idx < req->num_records; idx++) { + + result = dr_cpu_status(cpuids[idx], &status); + + if (result == DR_CPU_RES_FAILURE) + walk_md = B_TRUE; + + /* save off results of the status */ + stat[idx].cpuid = cpuids[idx]; + stat[idx].result = result; + stat[idx].status = status; + } + + if (walk_md == B_FALSE) + goto done; + + /* + * At least one of the cpus did not have a CPU + * structure. 
So, consult the MD to determine if + * they are present. + */ + + if ((mdp = md_get_handle()) == NULL) { + DR_DBG_CPU("unable to initialize MD\n"); + goto done; + } + + num_nodes = md_node_count(mdp); + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_SLEEP); + + for (idx = 0; idx < req->num_records; idx++) { + + if (stat[idx].result != DR_CPU_RES_FAILURE) + continue; + + /* check the MD for the current cpuid */ + cpunode = dr_cpu_find_node_md(stat[idx].cpuid, mdp, listp); + + stat[idx].result = DR_CPU_RES_OK; + + if (cpunode == MDE_INVAL_ELEM_COOKIE) { + stat[idx].status = DR_CPU_STAT_NOT_PRESENT; + } else { + stat[idx].status = DR_CPU_STAT_UNCONFIGURED; + } + } + + kmem_free(listp, listsz); + + (void) md_fini_handle(mdp); + +done: + *resp = rp; + *resp_len = rlen; + + return (0); +} + +static int +dr_cpu_configure(processorid_t cpuid, int *status) +{ + struct cpu *cp; + int rv = 0; + + DR_DBG_CPU("dr_cpu_configure...\n"); + + /* + * Build device tree node for the CPU + */ + if ((rv = dr_cpu_probe(cpuid)) != 0) { + DR_DBG_CPU("failed to probe CPU %d (%d)\n", cpuid, rv); + if (rv == EINVAL) { + *status = DR_CPU_STAT_NOT_PRESENT; + return (DR_CPU_RES_NOT_IN_MD); + } + *status = DR_CPU_STAT_UNCONFIGURED; + return (DR_CPU_RES_FAILURE); + } + + mutex_enter(&cpu_lock); + + /* + * Configure the CPU + */ + if ((cp = cpu_get(cpuid)) == NULL) { + + if ((rv = cpu_configure(cpuid)) != 0) { + DR_DBG_CPU("failed to configure CPU %d (%d)\n", + cpuid, rv); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_UNCONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d configured\n", cpuid); + + /* CPU struct should exist now */ + cp = cpu_get(cpuid); + } + + ASSERT(cp); + + /* + * Power on the CPU. In sun4v, this brings the stopped + * CPU into the guest from the Hypervisor. 
+ */ + if (cpu_is_poweredoff(cp)) { + + if ((rv = cpu_poweron(cp)) != 0) { + DR_DBG_CPU("failed to power on CPU %d (%d)\n", + cpuid, rv); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_UNCONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d powered on\n", cpuid); + } + + /* + * Online the CPU + */ + if (cpu_is_offline(cp)) { + + if ((rv = cpu_online(cp)) != 0) { + DR_DBG_CPU("failed to online CPU %d (%d)\n", + cpuid, rv); + rv = DR_CPU_RES_FAILURE; + /* offline is still configured */ + *status = DR_CPU_STAT_CONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d online\n", cpuid); + } + + rv = DR_CPU_RES_OK; + *status = DR_CPU_STAT_CONFIGURED; + +done: + mutex_exit(&cpu_lock); + + return (rv); +} + +static int +dr_cpu_unconfigure(processorid_t cpuid, int *status, boolean_t force) +{ + struct cpu *cp; + int rv = 0; + int cpu_flags; + + DR_DBG_CPU("dr_cpu_unconfigure%s...\n", (force) ? " (force)" : ""); + + mutex_enter(&cpu_lock); + + cp = cpu_get(cpuid); + + if (cp == NULL) { + + /* + * The OS CPU structures are already torn down, + * Attempt to deprobe the CPU to make sure the + * device tree is up to date. + */ + if (dr_cpu_deprobe(cpuid) != 0) { + DR_DBG_CPU("failed to deprobe CPU %d\n", cpuid); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_UNCONFIGURED; + goto done; + } + + goto done; + } + + ASSERT(cp->cpu_id == cpuid); + + /* + * Offline the CPU + */ + if (cpu_is_active(cp)) { + + /* set the force flag correctly */ + cpu_flags = (force) ? CPU_FORCED : 0; + + if ((rv = cpu_offline(cp, cpu_flags)) != 0) { + DR_DBG_CPU("failed to offline CPU %d (%d)\n", + cpuid, rv); + + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_CONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d offline\n", cpuid); + } + + /* + * Power off the CPU. In sun4v, this puts the running + * CPU into the stopped state in the Hypervisor. 
+ */ + if (!cpu_is_poweredoff(cp)) { + + if ((rv = cpu_poweroff(cp)) != 0) { + DR_DBG_CPU("failed to power off CPU %d (%d)\n", + cpuid, rv); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_CONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d powered off\n", cpuid); + } + + /* + * Unconfigure the CPU + */ + if ((rv = cpu_unconfigure(cpuid)) != 0) { + DR_DBG_CPU("failed to unconfigure CPU %d (%d)\n", cpuid, rv); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_UNCONFIGURED; + goto done; + } + + DR_DBG_CPU("CPU %d unconfigured\n", cpuid); + + /* + * Tear down device tree. + */ + if ((rv = dr_cpu_deprobe(cpuid)) != 0) { + DR_DBG_CPU("failed to deprobe CPU %d (%d)\n", cpuid, rv); + rv = DR_CPU_RES_FAILURE; + *status = DR_CPU_STAT_UNCONFIGURED; + goto done; + } + + rv = DR_CPU_RES_OK; + *status = DR_CPU_STAT_UNCONFIGURED; + +done: + mutex_exit(&cpu_lock); + + return (rv); +} + +/* + * Determine the state of a CPU. If the CPU structure is not present, + * it does not attempt to determine whether or not the CPU is in the + * MD. It is more efficient to do this at the higher level for all + * CPUs since it may not even be necessary to search the MD if all + * the CPUs are accounted for. Returns DR_CPU_RES_OK if the CPU + * structure is present, and DR_CPU_RES_FAILURE otherwise as a signal + * that an MD walk is necessary. + */ +static int +dr_cpu_status(processorid_t cpuid, int *status) +{ + int rv; + struct cpu *cp; + + DR_DBG_CPU("dr_cpu_status...\n"); + + mutex_enter(&cpu_lock); + + if ((cp = cpu_get(cpuid)) == NULL) { + /* need to check if cpu is in the MD */ + rv = DR_CPU_RES_FAILURE; + goto done; + } + + if (cpu_is_poweredoff(cp)) { + /* + * The CPU is powered off, so it is considered + * unconfigured from the service entity point of + * view. The CPU is not available to the system + * and intervention by the service entity would + * be required to change that. 
+ */ + *status = DR_CPU_STAT_UNCONFIGURED; + } else { + /* + * The CPU is powered on, so it is considered + * configured from the service entity point of + * view. It is available for use by the system + * and service entities are not concerned about + * the operational status (offline, online, etc.) + * of the CPU in terms of DR. + */ + *status = DR_CPU_STAT_CONFIGURED; + } + + rv = DR_CPU_RES_OK; + +done: + mutex_exit(&cpu_lock); + + return (rv); +} + +typedef struct { + md_t *mdp; + mde_cookie_t cpunode; + dev_info_t *dip; +} cb_arg_t; + +#define STR_ARR_LEN 5 + +static int +new_cpu_node(dev_info_t *new_node, void *arg, uint_t flags) +{ + _NOTE(ARGUNUSED(flags)) + + char *compat; + uint64_t freq; + uint64_t cpuid = 0; + int regbuf[4]; + int len = 0; + cb_arg_t *cba; + char *str_arr[STR_ARR_LEN]; + char *curr; + int idx = 0; + + DR_DBG_CPU("new_cpu_node...\n"); + + cba = (cb_arg_t *)arg; + + /* + * Add 'name' property + */ + if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node, + "name", "cpu") != DDI_SUCCESS) { + DR_DBG_CPU("new_cpu_node: failed to create 'name' property\n"); + return (DDI_WALK_ERROR); + } + + /* + * Add 'compatible' property + */ + if (md_get_prop_data(cba->mdp, cba->cpunode, "compatible", + (uint8_t **)(&compat), &len)) { + DR_DBG_CPU("new_cpu_node: failed to read 'compatible' property " + "from MD\n"); + return (DDI_WALK_ERROR); + } + + DR_DBG_CPU("'compatible' len is %d\n", len); + + /* parse the MD string array */ + curr = compat; + while (curr < (compat + len)) { + + DR_DBG_CPU("adding '%s' to 'compatible' property\n", curr); + + str_arr[idx++] = curr; + curr += strlen(curr) + 1; + + if (idx == STR_ARR_LEN) { + DR_DBG_CPU("exceeded str_arr len (%d)\n", STR_ARR_LEN); + break; + } + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, new_node, + "compatible", str_arr, idx) != DDI_SUCCESS) { + DR_DBG_CPU("new_cpu_node: failed to create 'compatible' " + "property\n"); + return (DDI_WALK_ERROR); + } + + /* + * Add 'device_type' property + */ 
+ if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node, + "device_type", "cpu") != DDI_SUCCESS) { + DR_DBG_CPU("new_cpu_node: failed to create 'device_type' " + "property\n"); + return (DDI_WALK_ERROR); + } + + /* + * Add 'clock-frequency' property + */ + if (md_get_prop_val(cba->mdp, cba->cpunode, "clock-frequency", &freq)) { + DR_DBG_CPU("new_cpu_node: failed to read 'clock-frequency' " + "property from MD\n"); + return (DDI_WALK_ERROR); + } + + if (ndi_prop_update_int(DDI_DEV_T_NONE, new_node, + "clock-frequency", freq) != DDI_SUCCESS) { + DR_DBG_CPU("new_cpu_node: failed to create 'clock-frequency' " + "property\n"); + return (DDI_WALK_ERROR); + } + + /* + * Add 'reg' (cpuid) property + */ + if (md_get_prop_val(cba->mdp, cba->cpunode, "id", &cpuid)) { + DR_DBG_CPU("new_cpu_node: failed to read 'id' property " + "from MD\n"); + return (DDI_WALK_ERROR); + } + + DR_DBG_CPU("new cpuid=0x%lx\n", cpuid); + + bzero(regbuf, 4 * sizeof (int)); + regbuf[0] = 0xc0000000 | cpuid; + + if (ndi_prop_update_int_array(DDI_DEV_T_NONE, new_node, + "reg", regbuf, 4) != DDI_SUCCESS) { + DR_DBG_CPU("new_cpu_node: failed to create 'reg' property\n"); + return (DDI_WALK_ERROR); + } + + cba->dip = new_node; + + return (DDI_WALK_TERMINATE); +} + +static int +dr_cpu_probe(processorid_t cpuid) +{ + dev_info_t *pdip; + dev_info_t *dip; + devi_branch_t br; + md_t *mdp = NULL; + int num_nodes; + int rv = 0; + int listsz; + mde_cookie_t *listp = NULL; + cb_arg_t cba; + mde_cookie_t cpunode; + + if ((dip = dr_cpu_find_node(cpuid)) != NULL) { + /* nothing to do */ + e_ddi_branch_rele(dip); + return (0); + } + + if ((mdp = md_get_handle()) == NULL) { + DR_DBG_CPU("unable to initialize machine description\n"); + return (-1); + } + + num_nodes = md_node_count(mdp); + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_SLEEP); + + cpunode = dr_cpu_find_node_md(cpuid, mdp, listp); + + if (cpunode == MDE_INVAL_ELEM_COOKIE) { + rv = EINVAL; + goto 
done; + } + + /* pass in MD cookie for CPU */ + cba.mdp = mdp; + cba.cpunode = cpunode; + + br.arg = (void *)&cba; + br.type = DEVI_BRANCH_SID; + br.create.sid_branch_create = new_cpu_node; + br.devi_branch_callback = NULL; + pdip = ddi_root_node(); + + if ((rv = e_ddi_branch_create(pdip, &br, NULL, 0))) { + DR_DBG_CPU("e_ddi_branch_create failed: %d\n", rv); + rv = -1; + goto done; + } + + DR_DBG_CPU("CPU %d probed\n", cpuid); + + rv = 0; + +done: + if (listp) + kmem_free(listp, listsz); + + if (mdp) + (void) md_fini_handle(mdp); + + return (rv); +} + +static int +dr_cpu_deprobe(processorid_t cpuid) +{ + dev_info_t *fdip = NULL; + dev_info_t *dip; + + if ((dip = dr_cpu_find_node(cpuid)) == NULL) { + DR_DBG_CPU("cpuid %d already deprobed\n", cpuid); + return (0); + } + + ASSERT(e_ddi_branch_held(dip)); + + if (e_ddi_branch_destroy(dip, &fdip, 0)) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * If non-NULL, fdip is held and must be released. + */ + if (fdip != NULL) { + (void) ddi_pathname(fdip, path); + ddi_release_devi(fdip); + } else { + (void) ddi_pathname(dip, path); + } + cmn_err(CE_NOTE, "node removal failed: %s (%p)", + path, (fdip) ? 
(void *)fdip : (void *)dip);
+
+ kmem_free(path, MAXPATHLEN);
+
+ return (-1);
+ }
+
+ DR_DBG_CPU("CPU %d deprobed\n", cpuid);
+
+ return (0);
+}
+
+typedef struct {
+ processorid_t cpuid;
+ dev_info_t *dip;
+} dr_search_arg_t;
+
+static int
+dr_cpu_check_node(dev_info_t *dip, void *arg)
+{
+ char *name;
+ processorid_t cpuid;
+ dr_search_arg_t *sarg = (dr_search_arg_t *)arg;
+
+ if (dip == ddi_root_node()) {
+ return (DDI_WALK_CONTINUE);
+ }
+
+ name = ddi_node_name(dip);
+
+ if (strcmp(name, "cpu") != 0) {
+ return (DDI_WALK_PRUNECHILD);
+ }
+
+ cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+ "reg", -1);
+
+ cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
+
+ DR_DBG_CPU("found cpuid=0x%x, looking for 0x%x\n", cpuid, sarg->cpuid);
+
+ if (cpuid == sarg->cpuid) {
+ DR_DBG_CPU("matching node\n");
+
+ /* matching node must be returned held */
+ if (!e_ddi_branch_held(dip))
+ e_ddi_branch_hold(dip);
+
+ sarg->dip = dip;
+ return (DDI_WALK_TERMINATE);
+ }
+
+ return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * Walk the device tree to find the dip corresponding to the cpuid
+ * passed in. If present, the dip is returned held. The caller must
+ * release the hold on the dip once it is no longer required. If no
+ * matching node is found, NULL is returned.
+ */
+static dev_info_t *
+dr_cpu_find_node(processorid_t cpuid)
+{
+ dr_search_arg_t arg;
+
+ DR_DBG_CPU("dr_cpu_find_node...\n");
+
+ arg.cpuid = cpuid;
+ arg.dip = NULL;
+
+ ddi_walk_devs(ddi_root_node(), dr_cpu_check_node, &arg);
+
+ ASSERT((arg.dip == NULL) || (e_ddi_branch_held(arg.dip)));
+
+ return ((arg.dip) ? arg.dip : NULL);
+}
+
+/*
+ * Look up a particular cpuid in the MD. Returns the mde_cookie_t
+ * representing that CPU if present, and MDE_INVAL_ELEM_COOKIE
+ * otherwise. It is assumed the scratch array has already been
+ * allocated so that it can accommodate the worst case scenario,
+ * every node in the MD. 
+ */ +static mde_cookie_t +dr_cpu_find_node_md(processorid_t cpuid, md_t *mdp, mde_cookie_t *listp) +{ + int idx; + int nnodes; + mde_cookie_t rootnode; + uint64_t cpuid_prop; + mde_cookie_t result = MDE_INVAL_ELEM_COOKIE; + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + /* + * Scan the DAG for all the CPU nodes + */ + nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"), + md_find_name(mdp, "fwd"), listp); + + if (nnodes < 0) { + DR_DBG_CPU("Scan for CPUs failed\n"); + return (result); + } + + DR_DBG_CPU("dr_cpu_find_node_md: found %d CPUs in the MD\n", nnodes); + + /* + * Find the CPU of interest + */ + for (idx = 0; idx < nnodes; idx++) { + + if (md_get_prop_val(mdp, listp[idx], "id", &cpuid_prop)) { + DR_DBG_CPU("Missing 'id' property for CPU node %d\n", + idx); + break; + } + + if (cpuid_prop == cpuid) { + /* found a match */ + DR_DBG_CPU("dr_cpu_find_node_md: found CPU %d " + "in MD\n", cpuid); + result = listp[idx]; + break; + } + } + + if (result == MDE_INVAL_ELEM_COOKIE) { + DR_DBG_CPU("CPU %d not in MD\n", cpuid); + } + + return (result); +} diff --git a/usr/src/uts/sun4v/io/dr_util.c b/usr/src/uts/sun4v/io/dr_util.c new file mode 100644 index 0000000000..58e7710a08 --- /dev/null +++ b/usr/src/uts/sun4v/io/dr_util.c @@ -0,0 +1,206 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
 + * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * sun4v DR Utility functions
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/sunddi.h>
+#include <sys/note.h>
+#include <sys/sysevent.h>
+#include <sys/sysevent/dr.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/ldoms.h>
+
+#include <sys/dr_util.h>
+
+boolean_t
+dr_is_disabled(dr_type_t type)
+{
+ /*
+ * The type argument is currently unused. However, it
+ * keeps the interface flexible enough to allow for
+ * only disabling certain types of DR.
+ */
+ _NOTE(ARGUNUSED(type))
+
+ /*
+ * DR requires that the kernel is using its own CIF
+ * handler. If that is not the case, either because
+ * domaining has been explicitly disabled, or because
+ * the firmware does not support it, the system must
+ * remain static and DR must be disabled.
+ */
+ if (!domaining_enabled) {
+ cmn_err(CE_NOTE, "!Kernel CIF handler is not enabled, DR "
+ "is not available\n");
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Generate a DR sysevent based on the type of resource and
+ * sysevent hint specified. The hint indicates whether the
+ * resource was added or removed. 
+ */ +void +dr_generate_event(dr_type_t type, int se_hint) +{ + int rv; + sysevent_id_t eid; + sysevent_t *ev = NULL; + sysevent_attr_list_t *evnt_attr_list = NULL; + sysevent_value_t evnt_val; + static char pubname[] = SUNW_KERN_PUB"dr"; + + DR_DBG_ALL("generate_event: type=%s, hint=%s\n", DR_TYPE2STR(type), + SE_HINT2STR(se_hint)); + + /* + * Add the attachment point attribute + */ + ev = sysevent_alloc(EC_DR, ESC_DR_AP_STATE_CHANGE, pubname, KM_SLEEP); + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = DR_TYPE2STR(type); + + rv = sysevent_add_attr(&evnt_attr_list, DR_AP_ID, &evnt_val, KM_SLEEP); + if (rv != 0) { + DR_DBG_ALL("generate_event: failed to add attr '%s' for " + "'%s' event\n", DR_AP_ID, EC_DR); + goto done; + } + + /* + * Add the DR hint attribute + */ + evnt_val.value_type = SE_DATA_TYPE_STRING; + evnt_val.value.sv_string = SE_HINT2STR(se_hint); + + rv = sysevent_add_attr(&evnt_attr_list, DR_HINT, &evnt_val, KM_SLEEP); + if (rv != 0) { + DR_DBG_ALL("generate_event: failed to add attr '%s' for " + "'%s' event\n", DR_HINT, EC_DR); + sysevent_free_attr(evnt_attr_list); + goto done; + } + + /* + * Attach the attribute list to the event + */ + rv = sysevent_attach_attributes(ev, evnt_attr_list); + if (rv != 0) { + DR_DBG_ALL("generate_event: failed to add attr list for " + "'%s' event\n", EC_DR); + sysevent_free_attr(evnt_attr_list); + goto done; + } + + /* + * Log the event + */ + rv = log_sysevent(ev, KM_NOSLEEP, &eid); + if (rv != 0) { + DR_DBG_ALL("generate_event: failed to log event (%d)\n", rv); + } + +done: + if (ev != NULL) + sysevent_free(ev); +} + +/* + * Debugging Features + */ +#ifdef DEBUG + +uint_t dr_debug = 0x0; + +#define BYTESPERLINE 8 +#define LINEWIDTH ((BYTESPERLINE * 3) + (BYTESPERLINE + 2) + 1) +#define ASCIIOFFSET ((BYTESPERLINE * 3) + 2) +#define ISPRINT(c) ((c >= ' ') && (c <= '~')) + +/* + * Output a buffer formatted with a set number of bytes on + * each line. 
Append each line with the ASCII equivalent of + * each byte if it falls within the printable ASCII range, + * and '.' otherwise. + */ +void +dr_dbg_dump_msg(void *buf, size_t len) +{ + int i, j; + char *msg = buf; + char *curr; + char *aoff; + char line[LINEWIDTH]; + + /* abort if not debugging transport */ + if (!(dr_debug & DR_DBG_FLAG_TRANS)) { + return; + } + + /* walk the buffer one line at a time */ + for (i = 0; i < len; i += BYTESPERLINE) { + + bzero(line, LINEWIDTH); + + curr = line; + aoff = line + ASCIIOFFSET; + + /* + * Walk the bytes in the current line, storing + * the hex value for the byte as well as the + * ASCII representation in a temporary buffer. + * All ASCII values are placed at the end of + * the line. + */ + for (j = 0; (j < BYTESPERLINE) && ((i + j) < len); j++) { + (void) sprintf(curr, " %02x", msg[i + j]); + *aoff = (ISPRINT(msg[i + j])) ? msg[i + j] : '.'; + curr += 3; + aoff++; + } + + /* + * Fill in to the start of the ASCII translation + * with spaces. This will only be necessary if + * this is the last line and there are not enough + * bytes to fill the whole line. + */ + while (curr != (line + ASCIIOFFSET)) + *curr++ = ' '; + + DR_DBG_TRANS("%s\n", line); + } +} +#endif /* DEBUG */ diff --git a/usr/src/uts/sun4v/io/ds.c b/usr/src/uts/sun4v/io/ds.c new file mode 100644 index 0000000000..06961cef91 --- /dev/null +++ b/usr/src/uts/sun4v/io/ds.c @@ -0,0 +1,2728 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
 + *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Domain Services Module
+ *
+ * The Domain Services (DS) module is responsible for communication
+ * with external service entities. It provides an API for clients to
+ * publish capabilities and handles the low level communication and
+ * version negotiation required to export those capabilities to any
+ * interested service entity. Once a capability has been successfully
+ * registered with a service entity, the DS module facilitates all
+ * data transfers between the service entity and the client providing
+ * that particular capability.
+ */
+
+#include <sys/modctl.h>
+#include <sys/ksynch.h>
+#include <sys/taskq.h>
+#include <sys/disp.h>
+#include <sys/cmn_err.h>
+#include <sys/note.h>
+
+#include <sys/mach_descrip.h>
+#include <sys/mdesc.h>
+#include <sys/ldc.h>
+
+#include <sys/ds.h>
+#include <sys/ds_impl.h>
+
+/*
+ * All DS ports in the system
+ *
+ * The list of DS ports is read in from the MD when the DS module is
+ * initialized and is never modified. This eliminates the need for
+ * locking to access the port array itself. Access to the individual
+ * ports is synchronized at the port level.
+ */
+static ds_port_t ds_ports[DS_MAX_PORTS];
+static ds_portset_t ds_allports; /* all DS ports in the system */
+
+/*
+ * Table of registered services
+ *
+ * Locking: Accesses to the table of services are synchronized using
+ * a RW lock. The reader lock must be held when looking up service
+ * information in the table. 
The writer lock must be held when any + * service information is being modified. + */ +static struct ds_svcs { + ds_svc_t **tbl; /* the table itself */ + krwlock_t rwlock; /* table lock */ + uint_t maxsvcs; /* size of the table */ + uint_t nsvcs; /* current number of items */ +} ds_svcs; + +/* initial size of the table */ +#define DS_MAXSVCS_INIT 32 + +/* + * Taskq for internal task processing + */ +static taskq_t *ds_taskq; +static boolean_t ds_enabled; /* enable/disable taskq processing */ + +/* + * The actual required number of parallel threads is not expected + * to be very large. Use the maximum number of CPUs in the system + * as a rough upper bound. + */ +#define DS_MAX_TASKQ_THR NCPU +#define DS_DISPATCH(fn, arg) taskq_dispatch(ds_taskq, fn, arg, TQ_SLEEP) + +/* + * Supported versions of the DS message protocol + * + * The version array must be sorted in order from the highest + * supported version to the lowest. Support for a particular + * <major>.<minor> version implies all lower minor versions of + * that same major version are supported as well. 
 + */
+static ds_ver_t ds_vers[] = { { 1, 0 } };
+
+#define DS_NUM_VER (sizeof (ds_vers) / sizeof (ds_vers[0]))
+
+/*
+ * Results of checking version array with ds_vers_isvalid()
+ */
+typedef enum {
+ DS_VERS_OK,
+ DS_VERS_INCREASING_MAJOR_ERR,
+ DS_VERS_INCREASING_MINOR_ERR
+} ds_vers_check_t;
+
+/* incoming message handling functions */
+typedef void (*ds_msg_handler_t)(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_data(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_nack(ds_port_t *port, caddr_t buf, size_t len);
+
+/*
+ * DS Message Handler Dispatch Table
+ *
+ * A table used to dispatch all incoming messages. This table
+ * contains handlers for all the fixed message types, as well as
+ * the messages defined in the 1.0 version of the DS protocol. 
+ */ +static const ds_msg_handler_t ds_msg_handlers[] = { + ds_handle_init_req, /* DS_INIT_REQ */ + ds_handle_init_ack, /* DS_INIT_ACK */ + ds_handle_init_nack, /* DS_INIT_NACK */ + ds_handle_reg_req, /* DS_REG_REQ */ + ds_handle_reg_ack, /* DS_REG_ACK */ + ds_handle_reg_nack, /* DS_REG_NACK */ + ds_handle_unreg_req, /* DS_UNREG */ + ds_handle_unreg_ack, /* DS_UNREG_ACK */ + ds_handle_unreg_nack, /* DS_UNREG_NACK */ + ds_handle_data, /* DS_DATA */ + ds_handle_nack /* DS_NACK */ +}; + +/* + * DS message log + * + * Locking: The message log is protected by a single mutex. This + * protects all fields in the log structure itself as well as + * everything in the entry structures on both the log and the + * free list. + */ +static struct log { + ds_log_entry_t *head; /* head of the log */ + ds_log_entry_t *freelist; /* head of the free list */ + size_t size; /* size of the log in bytes */ + uint32_t nentry; /* number of entries */ + kmutex_t lock; /* log lock */ +} ds_log; + +/* log soft limit */ +uint_t ds_log_sz = DS_LOG_DEFAULT_SZ; + +/* initial pool of log entry structures */ +static ds_log_entry_t ds_log_entry_pool[DS_LOG_NPOOL]; + +/* + * Debugging Features + */ +#ifdef DEBUG + +#define DS_DBG_FLAG_LDC 0x1 +#define DS_DBG_FLAG_LOG 0x2 +#define DS_DBG_FLAG_ALL 0xf + +#define DS_DBG if (ds_debug) printf +#define DS_DBG_LDC if (ds_debug & DS_DBG_FLAG_LDC) printf +#define DS_DBG_LOG if (ds_debug & DS_DBG_FLAG_LOG) printf +#define DS_DUMP_LDC_MSG(buf, len) ds_dump_ldc_msg(buf, len) + +uint_t ds_debug = 0; +static void ds_dump_ldc_msg(void *buf, size_t len); + +#else /* DEBUG */ + +#define DS_DBG _NOTE(CONSTCOND) if (0) printf +#define DS_DBG_LDC DS_DBG +#define DS_DBG_LOG DS_DBG +#define DS_DUMP_LDC_MSG(buf, len) + +#endif /* DEBUG */ + + +/* initialization functions */ +static void ds_init(void); +static void ds_fini(void); +static int ds_ports_init(void); +static int ds_ports_fini(void); +static int ds_ldc_init(ds_port_t *port); +static int ds_ldc_fini(ds_port_t 
*port); + +/* event processing functions */ +static uint_t ds_ldc_cb(uint64_t event, caddr_t arg); +static void ds_dispatch_event(void *arg); +static void ds_handle_ldc_event(ds_port_t *port, int newstate); +static int ds_recv_msg(ldc_handle_t ldc_hdl, caddr_t msgp, size_t msglen); +static void ds_handle_recv(void *arg); + +/* message sending functions */ +static int ds_send_msg(ds_port_t *port, caddr_t msg, size_t msglen); +static void ds_send_init_req(ds_port_t *port); +static int ds_send_reg_req(ds_svc_t *svc); +static int ds_send_unreg_req(ds_svc_t *svc); +static void ds_send_unreg_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl); +static void ds_send_data_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl); + +/* walker functions */ +typedef int (*svc_cb_t)(ds_svc_t *svc, void *arg); +static int ds_walk_svcs(svc_cb_t svc_cb, void *arg); +static int ds_svc_isfree(ds_svc_t *svc, void *arg); +static int ds_svc_ismatch(ds_svc_t *svc, void *arg); +static int ds_svc_free(ds_svc_t *svc, void *arg); +static int ds_svc_register(ds_svc_t *svc, void *arg); +static int ds_svc_unregister(ds_svc_t *svc, void *arg); +static int ds_svc_port_up(ds_svc_t *svc, void *arg); + +/* service utilities */ +static ds_svc_t *ds_alloc_svc(void); +static void ds_reset_svc(ds_svc_t *svc, ds_port_t *port); +static ds_svc_t *ds_get_svc(ds_svc_hdl_t hdl); + +/* port utilities */ +static int ds_port_add(md_t *mdp, mde_cookie_t port, mde_cookie_t chan); +static void ds_port_reset(ds_port_t *port); + +/* misc utilities */ +static ds_vers_check_t ds_vers_isvalid(ds_ver_t *vers, int nvers); + +/* log functions */ +static void ds_log_init(void); +static void ds_log_fini(void); +static int ds_log_add_msg(int32_t dest, uint8_t *msg, size_t sz); +static int ds_log_remove(void); +static void ds_log_purge(void *arg); + + +static struct modlmisc modlmisc = { + &mod_miscops, + "Domain Services %I%" +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlmisc, + NULL +}; + +int +_init(void) +{ + int 
rv;
+
+ /*
+ * Perform all internal setup before initializing
+ * the DS ports. This ensures that events can be
+ * processed as soon as the port comes up.
+ */
+ ds_init();
+
+ if ((rv = ds_ports_init()) != 0) {
+ cmn_err(CE_WARN, "Domain Services initialization failed");
+ ds_fini();
+ return (rv);
+ }
+
+ if ((rv = mod_install(&modlinkage)) != 0) {
+ (void) ds_ports_fini();
+ ds_fini();
+ }
+
+ return (rv);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int rv;
+
+ if ((rv = mod_remove(&modlinkage)) == 0) {
+ (void) ds_ports_fini();
+ ds_fini();
+ }
+
+ return (rv);
+}
+
+static void
+ds_init(void)
+{
+ int tblsz;
+
+ /*
+ * Initialize table of registered service classes
+ */
+ ds_svcs.maxsvcs = DS_MAXSVCS_INIT;
+
+ tblsz = ds_svcs.maxsvcs * sizeof (ds_svc_t *);
+ ds_svcs.tbl = kmem_zalloc(tblsz, KM_SLEEP);
+
+ rw_init(&ds_svcs.rwlock, NULL, RW_DRIVER, NULL);
+
+ ds_svcs.nsvcs = 0;
+
+ /*
+ * Initialize the message log.
+ */
+ ds_log_init();
+
+ /*
+ * Create taskq for internal processing threads. This
+ * includes processing incoming request messages and
+ * sending out of band registration messages.
+ */
+ ds_taskq = taskq_create("ds_taskq", 1, minclsyspri, 1,
+ DS_MAX_TASKQ_THR, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+ ds_enabled = B_TRUE;
+
+ /* catch problems with the version array */
+ ASSERT(ds_vers_isvalid(ds_vers, DS_NUM_VER) == DS_VERS_OK);
+}
+
+static void
+ds_fini(void)
+{
+ int idx;
+
+ /*
+ * Flip the enabled switch to make sure that no
+ * incoming events get dispatched while things
+ * are being torn down.
+ */
+ ds_enabled = B_FALSE;
+
+ /*
+ * Destroy the taskq.
+ */
+ taskq_destroy(ds_taskq);
+
+ /*
+ * Destroy the message log.
+ */ + ds_log_fini(); + + /* + * Deallocate the table of registered services + */ + + /* clear out all entries */ + rw_enter(&ds_svcs.rwlock, RW_WRITER); + idx = ds_walk_svcs(ds_svc_free, NULL); + rw_exit(&ds_svcs.rwlock); + + /* should have gone through the whole table */ + ASSERT(idx == ds_svcs.maxsvcs); + + /* destroy the table itself */ + kmem_free(ds_svcs.tbl, ds_svcs.maxsvcs * sizeof (ds_svc_t *)); + rw_destroy(&ds_svcs.rwlock); + bzero(&ds_svcs, sizeof (ds_svcs)); +} + +/* + * Initialize the list of ports based on the MD. + */ +static int +ds_ports_init(void) +{ + int idx; + int rv; + md_t *mdp; + int num_nodes; + int listsz; + mde_cookie_t rootnode; + mde_cookie_t *portp = NULL; + mde_cookie_t *chanp = NULL; + int nport; + int nchan; + ds_port_t *port; + + if ((mdp = md_get_handle()) == NULL) { + cmn_err(CE_WARN, "unable to initialize machine description"); + return (-1); + } + + num_nodes = md_node_count(mdp); + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + + /* allocate temporary storage for MD scans */ + portp = kmem_zalloc(listsz, KM_SLEEP); + chanp = kmem_zalloc(listsz, KM_SLEEP); + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + /* find all the DS ports in the MD */ + nport = md_scan_dag(mdp, rootnode, md_find_name(mdp, DS_MD_PORT_NAME), + md_find_name(mdp, "fwd"), portp); + + if (nport <= 0) { + cmn_err(CE_NOTE, "No '%s' nodes in MD", DS_MD_PORT_NAME); + rv = -1; + goto done; + } + + /* + * Initialize all the ports found in the MD. 
+ */ + for (idx = 0; idx < nport; idx++) { + + /* get the channels for this port */ + nchan = md_scan_dag(mdp, portp[idx], + md_find_name(mdp, DS_MD_CHAN_NAME), + md_find_name(mdp, "fwd"), chanp); + + if (nchan <= 0) { + cmn_err(CE_NOTE, "No '%s' node for DS port", + DS_MD_CHAN_NAME); + rv = -1; + goto done; + } + + /* expecting only one channel */ + if (nchan != 1) { + DS_DBG("expected 1 '%s' node for DS port, found %d\n", + DS_MD_CHAN_NAME, nchan); + } + + if (ds_port_add(mdp, portp[idx], chanp[0]) != 0) { + rv = -1; + goto done; + } + } + + /* + * Initialize the LDC channel for each port. + */ + for (idx = 0; idx < DS_MAX_PORTS; idx++) { + + if (!DS_PORT_IN_SET(ds_allports, idx)) + continue; + + port = &ds_ports[idx]; + + mutex_enter(&port->lock); + + if (ds_ldc_init(port)) { + cmn_err(CE_WARN, "ds@%lx: ports_init: failed to " + "initialize LDC %ld", port->id, port->ldc.id); + } else { + DS_DBG("ds@%lx: ports_init: initialization complete\n", + port->id); + } + + mutex_exit(&port->lock); + } + + rv = 0; + +done: + if (rv != 0) + (void) ds_ports_fini(); + + kmem_free(portp, listsz); + kmem_free(chanp, listsz); + + (void) md_fini_handle(mdp); + + return (rv); +} + +static int +ds_ports_fini(void) +{ + int idx; + ds_port_t *port; + + /* + * Tear down each initialized port. 
+ */
+ for (idx = 0; idx < DS_MAX_PORTS; idx++) {
+
+ if (!DS_PORT_IN_SET(ds_allports, idx))
+ continue;
+
+ port = &ds_ports[idx];
+
+ mutex_enter(&port->lock);
+
+ if (port->state >= DS_PORT_LDC_INIT) {
+ /* shut down the LDC for this port */
+ (void) ds_ldc_fini(port);
+ }
+
+ port->state = DS_PORT_FREE;
+
+ mutex_exit(&port->lock);
+
+ /* clean up the port structure */
+ mutex_destroy(&port->lock);
+ DS_PORTSET_DEL(ds_allports, idx);
+ }
+
+ return (0);
+}
+
+static int
+ds_ldc_init(ds_port_t *port)
+{
+ int rv;
+ ldc_attr_t ldc_attr;
+ caddr_t cb_arg = (caddr_t)port;
+
+ ASSERT(MUTEX_HELD(&port->lock));
+
+ DS_DBG("ds@%lx: ldc_init: ldc_id=%ld\n", port->id, port->ldc.id);
+
+ ldc_attr.devclass = LDC_DEV_GENERIC;
+ ldc_attr.instance = 0;
+ ldc_attr.mode = LDC_MODE_STREAM;
+ ldc_attr.qlen = DS_QUEUE_LEN;
+
+ if ((rv = ldc_init(port->ldc.id, &ldc_attr, &port->ldc.hdl)) != 0) {
+ cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_init error (%d)",
+ port->id, rv);
+ goto done;
+ }
+
+ /* register the LDC callback */
+ if ((rv = ldc_reg_callback(port->ldc.hdl, ds_ldc_cb, cb_arg)) != 0) {
+ cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_reg_callback error "
+ "(%d)", port->id, rv);
+ goto done;
+ }
+
+ if ((rv = ldc_open(port->ldc.hdl)) != 0) {
+ cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_open error (%d)",
+ port->id, rv);
+ goto done;
+ }
+
+ (void) ldc_up(port->ldc.hdl);
+
+ (void) ldc_status(port->ldc.hdl, &port->ldc.state);
+
+ DS_DBG_LDC("ds@%lx: ldc_init: initial LDC state 0x%x\n",
+ port->id, port->ldc.state);
+
+ port->state = DS_PORT_LDC_INIT;
+
+ /* if port is up, send init message */
+ if (port->ldc.state == LDC_UP) {
+ ds_send_init_req(port);
+ }
+
+done:
+ return (rv);
+}
+
+static int
+ds_ldc_fini(ds_port_t *port)
+{
+ int rv;
+
+ ASSERT(port->state >= DS_PORT_LDC_INIT);
+
+ DS_DBG("ds@%lx: ldc_fini: ldc_id=%ld\n", port->id, port->ldc.id);
+
+ if ((rv = ldc_close(port->ldc.hdl)) != 0) {
+ cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_close error (%d)",
+ port->id, rv);
+ return
(rv); + } + + if ((rv = ldc_unreg_callback(port->ldc.hdl)) != 0) { + cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_unreg_callback error " + "(%d)", port->id, rv); + return (rv); + } + + if ((rv = ldc_fini(port->ldc.hdl)) != 0) { + cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_fini error (%d)", + port->id, rv); + return (rv); + } + + return (rv); +} + +/* + * A DS event consists of a buffer on a port. + */ +typedef struct ds_event { + ds_port_t *port; + char *buf; + size_t buflen; +} ds_event_t; + +static uint_t +ds_ldc_cb(uint64_t event, caddr_t arg) +{ + ldc_status_t ldc_state; + int rv; + ds_port_t *port = (ds_port_t *)arg; + ldc_handle_t ldc_hdl; + + DS_DBG("ds@%lx: ds_ldc_cb...\n", port->id); + + if (!ds_enabled) { + DS_DBG("ds@%lx: callback handling is disabled\n", port->id); + return (LDC_SUCCESS); + } + + ldc_hdl = port->ldc.hdl; + + /* + * Check the LDC event. + */ + + if (event & LDC_EVT_READ) { + /* dispatch a thread to handle the read event */ + if (DS_DISPATCH(ds_handle_recv, port) == NULL) { + cmn_err(CE_WARN, "error initiating event handler"); + } + return (LDC_SUCCESS); + } + + /* only check status if not a read event */ + if ((rv = ldc_status(ldc_hdl, &ldc_state)) != 0) { + DS_DBG("ds@%lx: ldc_status error (%d)", port->id, rv); + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_DOWN || event & LDC_EVT_UP) { + mutex_enter(&port->lock); + ds_handle_ldc_event(port, ldc_state); + mutex_exit(&port->lock); + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_RESET || event & LDC_EVT_WRITE) { + DS_DBG("ds@%lx: LDC event (%lx) received", port->id, event); + return (LDC_SUCCESS); + } + + cmn_err(CE_NOTE, "ds@%lx: Unexpected LDC event (%lx) received", + port->id, event); + + return (LDC_SUCCESS); +} + +static int +ds_recv_msg(ldc_handle_t ldc_hdl, caddr_t msgp, size_t msglen) +{ + int rv = 0; + size_t amt_left = msglen; + int loopcnt = 0; + + while (msglen > 0) { + if ((rv = ldc_read(ldc_hdl, msgp, &amt_left)) != 0) { + if ((rv == EAGAIN) && (loopcnt++ < 1000)) { + /* + 
* Try again, but don't try for more than + * one second. Something is wrong with + * the channel. + */ + delay(drv_usectohz(10000)); /* 1/1000 sec */ + } else { + /* fail */ + return (rv); + } + } else { + msgp += amt_left; + msglen -= amt_left; + amt_left = msglen; + } + } /* while (msglen > 0) */ + + return (rv); +} + +static void +ds_handle_recv(void *arg) +{ + ds_port_t *port = (ds_port_t *)arg; + char *hbuf; + size_t len; + size_t read_size; + boolean_t isempty; + ds_hdr_t hdr; + uint8_t *msg; + char *currp; + int rv; + ldc_handle_t ldc_hdl; + ds_event_t *devent; + + DS_DBG("ds@%lx: ds_ldc_cb...\n", port->id); + + ldc_hdl = port->ldc.hdl; + + mutex_enter(&port->lock); + while ((ldc_chkq(ldc_hdl, &isempty) == 0) && (!isempty)) { + + + DS_DBG("ds@%lx: reading next message\n", port->id); + + /* + * Read in the next message. + */ + hbuf = (char *)&hdr; + bzero(hbuf, DS_HDR_SZ); + read_size = DS_HDR_SZ; + currp = hbuf; + + /* read in the message header */ + + if ((rv = ds_recv_msg(ldc_hdl, currp, read_size)) != 0) { + /* + * failed to read message drop it and see if there + * are anymore messages + */ + cmn_err(CE_NOTE, "ldc_read returned %d", rv); + continue; + } + + len = read_size; + + /* get payload size and alloc a buffer */ + + read_size = ((ds_hdr_t *)hbuf)->payload_len; + msg = kmem_zalloc((DS_HDR_SZ + read_size), KM_SLEEP); + + /* move message header into buffer */ + + bcopy(hbuf, msg, DS_HDR_SZ); + currp = (char *)(msg) + DS_HDR_SZ; + + /* read in the message body */ + + if ((rv = ds_recv_msg(ldc_hdl, currp, read_size)) != 0) { + /* + * failed to read message drop it and see if there + * are anymore messages + */ + kmem_free(msg, (DS_HDR_SZ + read_size)); + cmn_err(CE_NOTE, "ldc_read returned %d", rv); + continue; + } + + len += read_size; + DS_DUMP_LDC_MSG(msg, len); + + /* + * Send the message for processing, and store it + * in the log. The memory is deallocated only when + * the message is removed from the log. 
+ */ + + devent = kmem_zalloc(sizeof (ds_event_t), KM_SLEEP); + devent->port = port; + devent->buf = (char *)msg; + devent->buflen = len; + + /* log the message */ + (void) ds_log_add_msg(DS_LOG_IN(port->id), msg, len); + + /* send the message off to get processed in a new thread */ + if (DS_DISPATCH(ds_dispatch_event, devent) == NULL) { + cmn_err(CE_WARN, "error initiating event handler"); + continue; + } + + } + mutex_exit(&port->lock); +} + +static void +ds_dispatch_event(void *arg) +{ + ds_event_t *event = (ds_event_t *)arg; + ds_hdr_t *hdr; + ds_port_t *port; + + port = event->port; + + hdr = (ds_hdr_t *)event->buf; + + if (!DS_MSG_TYPE_VALID(hdr->msg_type)) { + cmn_err(CE_NOTE, "ds@%lx: dispatch_event: invalid msg " + "type (%d)", port->id, hdr->msg_type); + return; + } + + DS_DBG("ds@%lx: dispatch_event: msg_type=%d\n", port->id, + hdr->msg_type); + + (*ds_msg_handlers[hdr->msg_type])(port, event->buf, event->buflen); + + kmem_free(event, sizeof (ds_event_t)); +} + +static void +ds_handle_ldc_event(ds_port_t *port, int newstate) +{ + ldc_status_t oldstate = port->ldc.state; + + ASSERT(MUTEX_HELD(&port->lock)); + + DS_DBG_LDC("ds@%lx: LDC state change: 0x%x -> 0x%x\n", + port->id, oldstate, newstate); + + switch (newstate) { + case LDC_UP: + if ((oldstate == LDC_OPEN) || (oldstate == LDC_READY)) { + /* start the version negotiation */ + ds_send_init_req(port); + } else { + DS_DBG_LDC("unsupported LDC state change\n"); + } + break; + + case LDC_READY: + case LDC_OPEN: + if (oldstate != LDC_UP) { + /* not worried about this state change */ + break; + } + + _NOTE(FALLTHROUGH) + + default: + if (oldstate == LDC_UP) { + ds_port_reset(port); + } else { + DS_DBG_LDC("unsupported LDC state change\n"); + } + break; + } + + port->ldc.state = newstate; +} + +/* + * Version negotiation is always initiated by the guest. Any + * attempt by a remote party to initiate the handshake gets + * nack'd with a major number equal to zero. 
This indicates + * that no version is supported since an init request is not + * expected. + */ +static void +ds_handle_init_req(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_hdr_t *hdr; + ds_init_nack_t *nack; + char *msg; + size_t msglen; + ds_init_req_t *req; + size_t explen = DS_MSG_LEN(ds_init_req_t); + + req = (ds_init_req_t *)(buf + DS_HDR_SZ); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <init_req: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + } else { + DS_DBG("ds@%lx: <init_req: ver=%d.%d\n", port->id, + req->major_vers, req->minor_vers); + } + + DS_DBG("ds@%lx: init_nack>: major=0\n", port->id); + + msglen = DS_MSG_LEN(ds_init_nack_t); + msg = kmem_zalloc(msglen, KM_SLEEP); + + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_INIT_NACK; + hdr->payload_len = sizeof (ds_init_nack_t); + + nack = (ds_init_nack_t *)(msg + DS_HDR_SZ); + nack->major_vers = 0; + + /* send message */ + mutex_enter(&port->lock); + (void) ds_send_msg(port, msg, msglen); + mutex_exit(&port->lock); +} + +static void +ds_handle_init_ack(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_init_ack_t *ack; + ds_ver_t *ver; + size_t explen = DS_MSG_LEN(ds_init_ack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <init_ack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + ack = (ds_init_ack_t *)(buf + DS_HDR_SZ); + + mutex_enter(&port->lock); + + if (port->state != DS_PORT_INIT_REQ) { + cmn_err(CE_NOTE, "ds@%lx: <init_ack: invalid state for msg " + "(%d)", port->id, port->state); + mutex_exit(&port->lock); + return; + } + + ver = &(ds_vers[port->ver_idx]); + + DS_DBG("ds@%lx: <init_ack: req=v%d.%d, ack=v%d.%d\n", port->id, + ver->major, ver->minor, ver->major, ack->minor_vers); + + /* agreed upon a major version */ + port->ver.major = ver->major; + + /* + * If the returned minor version is larger than + * the requested 
minor version, use the lower of + * the two, i.e. the requested version. + */ + if (ack->minor_vers >= ver->minor) { + /* + * Use the minor version specified in the + * original request. + */ + port->ver.minor = ver->minor; + } else { + /* + * Use the lower minor version returned in + * the ack. By defninition, all lower minor + * versions must be supported. + */ + port->ver.minor = ack->minor_vers; + } + + port->state = DS_PORT_READY; + + DS_DBG("ds@%lx: <init_ack: port ready v%d.%d\n", port->id, + port->ver.major, port->ver.minor); + + mutex_exit(&port->lock); + + /* + * The port came up, so update all the services + * with this information. Follow that up with an + * attempt to register any service that is not + * already registered. + */ + rw_enter(&ds_svcs.rwlock, RW_WRITER); + + (void) ds_walk_svcs(ds_svc_port_up, port); + (void) ds_walk_svcs(ds_svc_register, NULL); + + rw_exit(&ds_svcs.rwlock); +} + +static void +ds_handle_init_nack(ds_port_t *port, caddr_t buf, size_t len) +{ + int idx; + ds_init_nack_t *nack; + ds_ver_t *ver; + size_t explen = DS_MSG_LEN(ds_init_nack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <init_nack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + nack = (ds_init_nack_t *)(buf + DS_HDR_SZ); + + mutex_enter(&port->lock); + + if (port->state != DS_PORT_INIT_REQ) { + cmn_err(CE_NOTE, "ds@%lx: <init_nack: invalid state for msg " + "(%d)", port->id, port->state); + mutex_exit(&port->lock); + return; + } + + ver = &(ds_vers[port->ver_idx]); + + DS_DBG("ds@%lx: <init_nack: req=v%d.%d, nack=v%d.x\n", port->id, + ver->major, ver->minor, nack->major_vers); + + if (nack->major_vers == 0) { + /* no supported protocol version */ + cmn_err(CE_NOTE, "ds@%lx: <init_nack: DS not supported", + port->id); + mutex_exit(&port->lock); + return; + } + + /* + * Walk the version list, looking for a major version + * that is as close to the requested major 
version as + * possible. + */ + for (idx = port->ver_idx; idx < DS_NUM_VER; idx++) { + if (ds_vers[idx].major <= nack->major_vers) { + /* found a version to try */ + goto done; + } + } + + if (idx == DS_NUM_VER) { + /* no supported version */ + cmn_err(CE_NOTE, "ds@%lx: <init_nack: DS v%d.x not supported", + port->id, nack->major_vers); + + mutex_exit(&port->lock); + return; + } + +done: + /* start the handshake again */ + port->ver_idx = idx; + port->state = DS_PORT_LDC_INIT; + + ds_send_init_req(port); + + mutex_exit(&port->lock); +} + +static void +ds_handle_reg_req(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_hdr_t *hdr; + ds_reg_req_t *req; + ds_reg_nack_t *nack; + char *msg; + size_t msglen; + size_t explen = DS_MSG_LEN(ds_reg_req_t); + + /* the request information */ + req = (ds_reg_req_t *)(buf + DS_HDR_SZ); + + /* sanity check the incoming message */ + if (len < explen) { + cmn_err(CE_NOTE, "ds@%lx: <reg_req: invalid message length " + "(%ld), expected at least %ld", port->id, len, explen); + } else { + DS_DBG("ds@%lx: <reg_req: id='%s', ver=%d.%d, hdl=0x%lx\n", + port->id, req->svc_id, req->major_vers, req->minor_vers, + req->svc_handle); + } + + DS_DBG("ds@%lx: reg_nack>: major=0\n", port->id); + + msglen = DS_MSG_LEN(ds_reg_nack_t); + msg = kmem_zalloc(msglen, KM_SLEEP); + + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_REG_NACK; + hdr->payload_len = sizeof (ds_reg_nack_t); + + nack = (ds_reg_nack_t *)(msg + DS_HDR_SZ); + nack->svc_handle = req->svc_handle; + nack->result = DS_REG_VER_NACK; + nack->major_vers = 0; + + /* send message */ + mutex_enter(&port->lock); + (void) ds_send_msg(port, msg, msglen); + mutex_exit(&port->lock); +} + +static void +ds_handle_reg_ack(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_reg_ack_t *ack; + ds_ver_t *ver; + ds_ver_t tmpver; + ds_svc_t *svc; + size_t explen = DS_MSG_LEN(ds_reg_ack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid message 
length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + ack = (ds_reg_ack_t *)(buf + DS_HDR_SZ); + + rw_enter(&ds_svcs.rwlock, RW_READER); + + /* lookup appropriate client */ + if ((svc = ds_get_svc(ack->svc_handle)) == NULL) { + cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid handle 0x%lx", + port->id, ack->svc_handle); + goto done; + } + + /* make sure the message makes sense */ + if (svc->state != DS_SVC_REG_PENDING) { + cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid state for message " + "(%d)", port->id, svc->state); + goto done; + } + + ver = &(svc->cap.vers[svc->ver_idx]); + + DS_DBG("ds@%lx: <reg_ack: hdl=0x%lx, ack=v%d.%d\n", port->id, + ack->svc_handle, ver->major, ack->minor_vers); + + /* major version has been agreed upon */ + svc->ver.major = ver->major; + + if (ack->minor_vers >= ver->minor) { + /* + * Use the minor version specified in the + * original request. + */ + svc->ver.minor = ver->minor; + } else { + /* + * Use the lower minor version returned in + * the ack. By defninition, all lower minor + * versions must be supported. + */ + svc->ver.minor = ack->minor_vers; + } + + svc->state = DS_SVC_ACTIVE; + + DS_DBG("ds@%lx: <reg_ack: %s v%d.%d ready, hdl=0x%lx\n", port->id, + svc->cap.svc_id, svc->ver.major, svc->ver.minor, svc->hdl); + + /* notify the client that registration is complete */ + if (svc->ops.ds_reg_cb) { + /* + * Use a temporary version structure so that + * the copy in the svc structure cannot be + * modified by the client. 
+ */ + tmpver.major = svc->ver.major; + tmpver.minor = svc->ver.minor; + + (*svc->ops.ds_reg_cb)(svc->ops.cb_arg, &tmpver, svc->hdl); + } + +done: + rw_exit(&ds_svcs.rwlock); +} + +static void +ds_handle_reg_nack(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_reg_nack_t *nack; + ds_svc_t *svc; + int idx; + size_t explen = DS_MSG_LEN(ds_reg_nack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + nack = (ds_reg_nack_t *)(buf + DS_HDR_SZ); + + rw_enter(&ds_svcs.rwlock, RW_READER); + + /* lookup appropriate client */ + if ((svc = ds_get_svc(nack->svc_handle)) == NULL) { + cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid handle 0x%lx", + port->id, nack->svc_handle); + goto done; + } + + /* make sure the message makes sense */ + if (svc->state != DS_SVC_REG_PENDING) { + cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid state for message " + "(%d)", port->id, svc->state); + goto done; + } + + if (nack->result == DS_REG_DUP) { + cmn_err(CE_NOTE, "ds@%lx: <reg_nack: duplicate registration " + "for %s", port->id, svc->cap.svc_id); + goto done; + } + + /* + * A major version of zero indicates that the + * service is not supported at all. + */ + if (nack->major_vers == 0) { + DS_DBG("ds@%lx: <reg_nack: %s not supported\n", port->id, + svc->cap.svc_id); + ds_reset_svc(svc, port); + goto done; + } + + DS_DBG("ds@%lx: <reg_nack: hdl=0x%lx, nack=%d.x\n", port->id, + nack->svc_handle, nack->major_vers); + + /* + * Walk the version list for the service, looking for + * a major version that is as close to the requested + * major version as possible. 
+ */
+ for (idx = svc->ver_idx; idx < svc->cap.nvers; idx++) {
+ if (svc->cap.vers[idx].major <= nack->major_vers) {
+ /* found a version to try */
+ break;
+ }
+ }
+
+ if (idx == svc->cap.nvers) {
+ /* no supported version */
+ DS_DBG("ds@%lx: <reg_nack: %s v%d.x not supported\n",
+ port->id, svc->cap.svc_id, nack->major_vers);
+ svc->state = DS_SVC_INACTIVE;
+ goto done;
+ }
+
+ /* start the handshake again */
+ svc->state = DS_SVC_INACTIVE;
+ svc->ver_idx = idx;
+
+ (void) ds_svc_register(svc, NULL);
+
+done:
+ rw_exit(&ds_svcs.rwlock);
+}
+
+static void
+ds_handle_unreg_req(ds_port_t *port, caddr_t buf, size_t len)
+{
+ ds_hdr_t *hdr;
+ ds_unreg_req_t *req;
+ ds_unreg_ack_t *ack;
+ ds_svc_t *svc;
+ char *msg;
+ size_t msglen;
+ size_t explen = DS_MSG_LEN(ds_unreg_req_t);
+
+ /* sanity check the incoming message */
+ if (len != explen) {
+ cmn_err(CE_NOTE, "ds@%lx: <unreg_req: invalid message length "
+ "(%ld), expected %ld", port->id, len, explen);
+ return;
+ }
+
+ /* the request information */
+ req = (ds_unreg_req_t *)(buf + DS_HDR_SZ);
+
+ rw_enter(&ds_svcs.rwlock, RW_READER);
+
+ /* lookup appropriate client */
+ if ((svc = ds_get_svc(req->svc_handle)) == NULL) {
+ cmn_err(CE_NOTE, "ds@%lx: <unreg_req: invalid handle "
+ "0x%lx", port->id, req->svc_handle);
+ ds_send_unreg_nack(port, req->svc_handle);
+ goto done;
+ }
+
+ /* unregister the service */
+ (void) ds_svc_unregister(svc, svc->port);
+
+ DS_DBG("ds@%lx: unreg_ack>: hdl=0x%lx\n", port->id, req->svc_handle);
+
+ msglen = DS_HDR_SZ + sizeof (ds_unreg_ack_t);
+ msg = kmem_zalloc(msglen, KM_SLEEP);
+
+ hdr = (ds_hdr_t *)msg;
+ hdr->msg_type = DS_UNREG_ACK;
+ hdr->payload_len = sizeof (ds_unreg_ack_t);
+
+ ack = (ds_unreg_ack_t *)(msg + DS_HDR_SZ);
+ ack->svc_handle = req->svc_handle;
+
+ /* send message */
+ mutex_enter(&port->lock);
+ (void) ds_send_msg(port, msg, msglen);
+ mutex_exit(&port->lock);
+
+done:
+ rw_exit(&ds_svcs.rwlock);
+}
+
+static void
+ds_handle_unreg_ack(ds_port_t *port, caddr_t
buf, size_t len) +{ + ds_unreg_ack_t *ack; + size_t explen = DS_MSG_LEN(ds_unreg_ack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <unreg_ack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + ack = (ds_unreg_ack_t *)(buf + DS_HDR_SZ); + + DS_DBG("ds@%lx: <unreg_ack: hdl=0x%lx\n", port->id, ack->svc_handle); + + rw_enter(&ds_svcs.rwlock, RW_READER); + + /* + * Since the unregister request was initiated locally, + * the service structure has already been torn down. + * Just perform a sanity check to make sure the message + * is appropriate. + */ + if (ds_get_svc(ack->svc_handle) != NULL) { + cmn_err(CE_NOTE, "ds@%lx: <unreg_ack: handle 0x%lx still " + "in use", port->id, ack->svc_handle); + } + + rw_exit(&ds_svcs.rwlock); +} + +static void +ds_handle_unreg_nack(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_unreg_nack_t *nack; + size_t explen = DS_MSG_LEN(ds_unreg_nack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <unreg_nack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + nack = (ds_unreg_nack_t *)(buf + DS_HDR_SZ); + + DS_DBG("ds@%lx: <unreg_nack: hdl=0x%lx\n", port->id, + nack->svc_handle); + + rw_enter(&ds_svcs.rwlock, RW_READER); + + /* + * Since the unregister request was initiated locally, + * the service structure has already been torn down. + * Just perform a sanity check to make sure the message + * is appropriate. 
+ */
+ if (ds_get_svc(nack->svc_handle) != NULL) {
+ cmn_err(CE_NOTE, "ds@%lx: <unreg_nack: handle 0x%lx still "
+ "in use", port->id, nack->svc_handle);
+ }
+
+ rw_exit(&ds_svcs.rwlock);
+}
+
+static void
+ds_handle_data(ds_port_t *port, caddr_t buf, size_t len)
+{
+ ds_data_handle_t *data;
+ ds_svc_t *svc;
+ char *msg;
+ int msgsz;
+ int hdrsz;
+ size_t explen = DS_MSG_LEN(ds_data_handle_t);
+
+ /* sanity check the incoming message */
+ if (len < explen) {
+ cmn_err(CE_NOTE, "ds@%lx: <data: invalid message length "
+ "(%ld), expected at least %ld", port->id, len, explen);
+ return;
+ }
+
+ data = (ds_data_handle_t *)(buf + DS_HDR_SZ);
+
+ hdrsz = DS_HDR_SZ + sizeof (ds_data_handle_t);
+ msgsz = len - hdrsz;
+
+ /* strip off the header for the client */
+ msg = (msgsz) ? (buf + hdrsz) : NULL;
+
+ rw_enter(&ds_svcs.rwlock, RW_READER);
+
+ /* lookup appropriate client */
+ if ((svc = ds_get_svc(data->svc_handle)) == NULL) {
+ cmn_err(CE_NOTE, "ds@%lx: <data: invalid handle 0x%lx",
+ port->id, data->svc_handle);
+ rw_exit(&ds_svcs.rwlock);
+ ds_send_data_nack(port, data->svc_handle);
+ return;
+ }
+
+ rw_exit(&ds_svcs.rwlock);
+
+ DS_DBG("ds@%lx: <data: client=%s hdl=0x%lx\n", port->id,
+ (svc->cap.svc_id) ?
svc->cap.svc_id : "NULL", svc->hdl); + + /* dispatch this message to the client */ + (*svc->ops.ds_data_cb)(svc->ops.cb_arg, msg, msgsz); +} + +static void +ds_handle_nack(ds_port_t *port, caddr_t buf, size_t len) +{ + ds_svc_t *svc; + ds_data_nack_t *nack; + size_t explen = DS_MSG_LEN(ds_data_nack_t); + + /* sanity check the incoming message */ + if (len != explen) { + cmn_err(CE_NOTE, "ds@%lx: <data_nack: invalid message length " + "(%ld), expected %ld", port->id, len, explen); + return; + } + + nack = (ds_data_nack_t *)(buf + DS_HDR_SZ); + + DS_DBG("ds@%lx: data_nack: hdl=0x%lx, result=0x%lx\n", port->id, + nack->svc_handle, nack->result); + + if (nack->result == DS_INV_HDL) { + + rw_enter(&ds_svcs.rwlock, RW_READER); + + if ((svc = ds_get_svc(nack->svc_handle)) == NULL) { + rw_exit(&ds_svcs.rwlock); + return; + } + + cmn_err(CE_NOTE, "ds@%lx: <data_nack: handle 0x%lx reported " + "as invalid", port->id, nack->svc_handle); + + (void) ds_svc_unregister(svc, svc->port); + + rw_exit(&ds_svcs.rwlock); + } +} + +static int +ds_send_msg(ds_port_t *port, caddr_t msg, size_t msglen) +{ + int rv; + caddr_t currp = msg; + size_t amt_left = msglen; + int loopcnt = 0; + + DS_DUMP_LDC_MSG(msg, msglen); + (void) ds_log_add_msg(DS_LOG_OUT(port->id), (uint8_t *)msg, msglen); + + /* + * ensure that no other messages can be sent on this port in case + * the write doesn't get sent with one write to guarantee that the + * message doesn't become fragmented. + */ + ASSERT(MUTEX_HELD(&port->lock)); + + /* send the message */ + do { + if ((rv = ldc_write(port->ldc.hdl, currp, &msglen)) != 0) { + if ((rv == EWOULDBLOCK) && (loopcnt++ < 1000)) { + /* + * don't try for more than a sec. Something + * is wrong with the channel. 
+ */ + delay(drv_usectohz(10000)); /* 1/1000 sec */ + } else { + cmn_err(CE_WARN, + "ds@%lx: send_msg: ldc_write failed (%d)", + port->id, rv); + return (rv); + } + } else { + amt_left -= msglen; + currp += msglen; + msglen = amt_left; + loopcnt = 0; + } + } while (amt_left > 0); + + return (rv); +} + +static void +ds_send_init_req(ds_port_t *port) +{ + ds_hdr_t *hdr; + ds_init_req_t *init_req; + size_t nbytes; + ds_ver_t *vers = &ds_vers[port->ver_idx]; + + ASSERT(MUTEX_HELD(&port->lock)); + + if (port->state != DS_PORT_LDC_INIT) { + cmn_err(CE_NOTE, "ds@%lx: init_req>: invalid port state (%d)", + port->id, port->state); + return; + } + + DS_DBG("ds@%lx: init_req>: req=v%d.%d\n", port->id, vers->major, + vers->minor); + + nbytes = DS_HDR_SZ + sizeof (ds_init_req_t); + hdr = kmem_zalloc(nbytes, KM_SLEEP); + + hdr->msg_type = DS_INIT_REQ; + hdr->payload_len = sizeof (ds_init_req_t); + + init_req = (ds_init_req_t *)((caddr_t)hdr + DS_HDR_SZ); + init_req->major_vers = vers->major; + init_req->minor_vers = vers->minor; + + /* send the message */ + if (ds_send_msg(port, (caddr_t)hdr, nbytes) == 0) { + port->state = DS_PORT_INIT_REQ; + } +} + +static int +ds_send_reg_req(ds_svc_t *svc) +{ + ds_port_t *port = svc->port; + ds_ver_t *ver; + ds_hdr_t *hdr; + caddr_t msg; + size_t msglen; + size_t nbytes; + ds_reg_req_t *req; + size_t idlen; + + /* assumes some checking has already occurred */ + ASSERT(svc->state == DS_SVC_INACTIVE); + + mutex_enter(&port->lock); + + /* check on the LDC to Zeus */ + if (port->ldc.state != LDC_UP) { + /* can not send message */ + DS_DBG("ds@%lx: reg_req>: channel %ld is not up\n", port->id, + port->ldc.id); + mutex_exit(&port->lock); + return (-1); + } + + /* make sure port is ready */ + if (port->state != DS_PORT_READY) { + /* can not send message */ + DS_DBG("ds@%lx: reg_req>: port is not ready\n", port->id); + mutex_exit(&port->lock); + return (-1); + } + + mutex_exit(&port->lock); + + /* allocate the message buffer */ + idlen = 
strlen(svc->cap.svc_id); + msglen = DS_HDR_SZ + sizeof (ds_reg_req_t) + idlen; + msg = kmem_zalloc(msglen, KM_SLEEP); + + /* copy in the header data */ + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_REG_REQ; + hdr->payload_len = sizeof (ds_reg_req_t) + idlen; + + req = (ds_reg_req_t *)(msg + DS_HDR_SZ); + req->svc_handle = svc->hdl; + ver = &(svc->cap.vers[svc->ver_idx]); + req->major_vers = ver->major; + req->minor_vers = ver->minor; + + /* copy in the service id */ + bcopy(svc->cap.svc_id, req->svc_id, idlen + 1); + + /* send the message */ + DS_DBG("ds@%lx: reg_req>: id='%s', ver=%d.%d, hdl=0x%lx\n", port->id, + svc->cap.svc_id, ver->major, ver->minor, svc->hdl); + + nbytes = msglen; + mutex_enter(&port->lock); + if (ds_send_msg(port, msg, nbytes) != 0) { + mutex_exit(&port->lock); + return (-1); + } else { + svc->state = DS_SVC_REG_PENDING; + } + mutex_exit(&port->lock); + + return (0); +} + +static int +ds_send_unreg_req(ds_svc_t *svc) +{ + caddr_t msg; + size_t msglen; + size_t nbytes; + ds_hdr_t *hdr; + ds_unreg_req_t *req; + ds_port_t *port = svc->port; + + if (port == NULL) { + DS_DBG("send_unreg_req: service '%s' not associated with " + "a port\n", svc->cap.svc_id); + return (-1); + } + + mutex_enter(&port->lock); + + /* check on the LDC to Zeus */ + if (port->ldc.state != LDC_UP) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: unreg_req>: channel %ld is not up", + port->id, port->ldc.id); + mutex_exit(&port->lock); + return (-1); + } + + /* make sure port is ready */ + if (port->state != DS_PORT_READY) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: unreg_req>: port is not ready", + port->id); + mutex_exit(&port->lock); + return (-1); + } + + mutex_exit(&port->lock); + + msglen = DS_HDR_SZ + sizeof (ds_unreg_req_t); + msg = kmem_zalloc(msglen, KM_SLEEP); + + /* copy in the header data */ + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_UNREG; + hdr->payload_len = sizeof (ds_unreg_req_t); + + req = (ds_unreg_req_t *)(msg + DS_HDR_SZ); + 
req->svc_handle = svc->hdl; + + /* send the message */ + DS_DBG("ds@%lx: unreg_req>: id='%s', hdl=0x%lx\n", port->id, + (svc->cap.svc_id) ? svc->cap.svc_id : "NULL", svc->hdl); + + nbytes = msglen; + mutex_enter(&port->lock); + if (ds_send_msg(port, msg, nbytes) != 0) { + mutex_exit(&port->lock); + return (-1); + } + mutex_exit(&port->lock); + + return (0); +} + +static void +ds_send_unreg_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl) +{ + caddr_t msg; + size_t msglen; + size_t nbytes; + ds_hdr_t *hdr; + ds_unreg_nack_t *nack; + + mutex_enter(&port->lock); + + /* check on the LDC to Zeus */ + if (port->ldc.state != LDC_UP) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: unreg_nack>: channel %ld is not up", + port->id, port->ldc.id); + mutex_exit(&port->lock); + return; + } + + /* make sure port is ready */ + if (port->state != DS_PORT_READY) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: unreg_nack>: port is not ready", + port->id); + mutex_exit(&port->lock); + return; + } + + mutex_exit(&port->lock); + + msglen = DS_HDR_SZ + sizeof (ds_unreg_nack_t); + msg = kmem_zalloc(msglen, KM_SLEEP); + + /* copy in the header data */ + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_UNREG_NACK; + hdr->payload_len = sizeof (ds_unreg_nack_t); + + nack = (ds_unreg_nack_t *)(msg + DS_HDR_SZ); + nack->svc_handle = bad_hdl; + + /* send the message */ + DS_DBG("ds@%lx: unreg_nack>: hdl=0x%lx\n", port->id, bad_hdl); + + nbytes = msglen; + mutex_enter(&port->lock); + (void) ds_send_msg(port, msg, nbytes); + mutex_exit(&port->lock); +} + +static void +ds_send_data_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl) +{ + caddr_t msg; + size_t msglen; + size_t nbytes; + ds_hdr_t *hdr; + ds_data_nack_t *nack; + + mutex_enter(&port->lock); + + /* check on the LDC to Zeus */ + if (port->ldc.state != LDC_UP) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: data_nack>: channel %ld is not up", + port->id, port->ldc.id); + mutex_exit(&port->lock); + return; + } + + /* make 
sure port is ready */ + if (port->state != DS_PORT_READY) { + /* can not send message */ + cmn_err(CE_NOTE, "ds@%lx: data_nack>: port is not ready", + port->id); + mutex_exit(&port->lock); + return; + } + + mutex_exit(&port->lock); + + msglen = DS_HDR_SZ + sizeof (ds_data_nack_t); + msg = kmem_zalloc(msglen, KM_SLEEP); + + /* copy in the header data */ + hdr = (ds_hdr_t *)msg; + hdr->msg_type = DS_NACK; + hdr->payload_len = sizeof (ds_data_nack_t); + + nack = (ds_data_nack_t *)(msg + DS_HDR_SZ); + nack->svc_handle = bad_hdl; + nack->result = DS_INV_HDL; + + /* send the message */ + DS_DBG("ds@%lx: data_nack>: hdl=0x%lx\n", port->id, bad_hdl); + + nbytes = msglen; + mutex_enter(&port->lock); + (void) ds_send_msg(port, msg, nbytes); + mutex_exit(&port->lock); +} + +#ifdef DEBUG + +#define BYTESPERLINE 8 +#define LINEWIDTH ((BYTESPERLINE * 3) + (BYTESPERLINE + 2) + 1) +#define ASCIIOFFSET ((BYTESPERLINE * 3) + 2) +#define ISPRINT(c) ((c >= ' ') && (c <= '~')) + +/* + * Output a buffer formatted with a set number of bytes on + * each line. Append each line with the ASCII equivalent of + * each byte if it falls within the printable ASCII range, + * and '.' otherwise. + */ +static void +ds_dump_ldc_msg(void *vbuf, size_t len) +{ + int i, j; + char *curr; + char *aoff; + char line[LINEWIDTH]; + uint8_t *buf = vbuf; + + /* abort if not debugging ldc */ + if (!(ds_debug & DS_DBG_FLAG_LDC)) { + return; + } + + /* walk the buffer one line at a time */ + for (i = 0; i < len; i += BYTESPERLINE) { + + bzero(line, LINEWIDTH); + + curr = line; + aoff = line + ASCIIOFFSET; + + /* + * Walk the bytes in the current line, storing + * the hex value for the byte as well as the + * ASCII representation in a temporary buffer. + * All ASCII values are placed at the end of + * the line. + */ + for (j = 0; (j < BYTESPERLINE) && ((i + j) < len); j++) { + (void) sprintf(curr, " %02x", buf[i + j]); + *aoff = (ISPRINT(buf[i + j])) ? 
buf[i + j] : '.'; + curr += 3; + aoff++; + } + + /* + * Fill in to the start of the ASCII translation + * with spaces. This will only be necessary if + * this is the last line and there are not enough + * bytes to fill the whole line. + */ + while (curr != (line + ASCIIOFFSET)) + *curr++ = ' '; + + DS_DBG_LDC("%s\n", line); + } +} +#endif /* DEBUG */ + + +/* + * Walk the table of registered services, executing the specified + * callback function for each service. A non-zero return value from + * the callback is used to terminate the walk, not to indicate an + * error. Returns the index of the last service visited. + */ +static int +ds_walk_svcs(svc_cb_t svc_cb, void *arg) +{ + int idx; + ds_svc_t *svc; + + ASSERT(RW_LOCK_HELD(&ds_svcs.rwlock)); + + /* walk every table entry */ + for (idx = 0; idx < ds_svcs.maxsvcs; idx++) { + + svc = ds_svcs.tbl[idx]; + + /* execute the callback */ + if ((*svc_cb)(svc, arg) != 0) + break; + } + + return (idx); +} + +static int +ds_svc_isfree(ds_svc_t *svc, void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + /* + * Looking for a free service. This may be a NULL entry + * in the table, or an unused structure that could be + * reused. 
+ */ + + if (DS_SVC_ISFREE(svc)) { + /* yes, it is free */ + return (1); + } + + /* not a candidate */ + return (0); +} + +static int +ds_svc_ismatch(ds_svc_t *svc, void *arg) +{ + if (DS_SVC_ISFREE(svc)) { + return (0); + } + + if (strcmp(svc->cap.svc_id, arg) == 0) { + /* found a match */ + return (1); + } + + return (0); +} + +static int +ds_svc_free(ds_svc_t *svc, void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + if (svc == NULL) { + return (0); + } + + if (svc->cap.svc_id) { + kmem_free(svc->cap.svc_id, strlen(svc->cap.svc_id) + 1); + svc->cap.svc_id = NULL; + } + + if (svc->cap.vers) { + kmem_free(svc->cap.vers, svc->cap.nvers * sizeof (ds_ver_t)); + svc->cap.vers = NULL; + } + + kmem_free(svc, sizeof (ds_svc_t)); + + return (0); +} + +static int +ds_svc_register(ds_svc_t *svc, void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + int idx; + + /* check the state of the service */ + if (DS_SVC_ISFREE(svc) || (svc->state != DS_SVC_INACTIVE)) + return (0); + + /* check if there are any ports to try */ + if (DS_PORTSET_ISNULL(svc->avail)) + return (0); + + /* + * Attempt to register the service. Start with the lowest + * numbered port and continue until a registration message + * is sent successfully, or there are no ports left to try. + */ + for (idx = 0; idx < DS_MAX_PORTS; idx++) { + + /* + * If the port is not in the available list, + * it is not a candidate for registration. 
+ */ + if (!DS_PORT_IN_SET(svc->avail, idx)) { + continue; + } + + svc->port = &ds_ports[idx]; + if (ds_send_reg_req(svc) == 0) { + /* register sent successfully */ + break; + } + + /* reset the service to try the next port */ + ds_reset_svc(svc, svc->port); + } + + return (0); +} + +static int +ds_svc_unregister(ds_svc_t *svc, void *arg) +{ + ds_port_t *port = (ds_port_t *)arg; + + if (DS_SVC_ISFREE(svc)) { + return (0); + } + + /* make sure the service is using this port */ + if (svc->port != port) { + return (0); + } + + /* reset the service structure */ + ds_reset_svc(svc, port); + + /* increment the count in the handle to prevent reuse */ + svc->hdl = DS_ALLOC_HDL(DS_HDL2IDX(svc->hdl), DS_HDL2COUNT(svc->hdl)); + + /* call the client unregister callback */ + if (svc->ops.ds_unreg_cb) + (*svc->ops.ds_unreg_cb)(svc->ops.cb_arg); + + /* try to initiate a new registration */ + (void) ds_svc_register(svc, NULL); + + return (0); +} + +static int +ds_svc_port_up(ds_svc_t *svc, void *arg) +{ + ds_port_t *port = (ds_port_t *)arg; + + if (DS_SVC_ISFREE(svc)) { + /* nothing to do */ + return (0); + } + + DS_PORTSET_ADD(svc->avail, port->id); + + return (0); +} + +static ds_svc_t * +ds_alloc_svc(void) +{ + int idx; + uint_t newmaxsvcs; + ds_svc_t **newtbl; + ds_svc_t *newsvc; + + ASSERT(RW_WRITE_HELD(&ds_svcs.rwlock)); + + idx = ds_walk_svcs(ds_svc_isfree, NULL); + + if (idx != ds_svcs.maxsvcs) { + goto found; + } + + /* + * There was no free space in the table. Grow + * the table to double its current size. 
+ */ + newmaxsvcs = ds_svcs.maxsvcs * 2; + newtbl = kmem_zalloc(newmaxsvcs * sizeof (ds_svc_t *), KM_SLEEP); + + /* copy old table data to the new table */ + for (idx = 0; idx < ds_svcs.maxsvcs; idx++) { + newtbl[idx] = ds_svcs.tbl[idx]; + } + + /* clean up the old table */ + kmem_free(ds_svcs.tbl, ds_svcs.maxsvcs * sizeof (ds_svc_t *)); + ds_svcs.tbl = newtbl; + ds_svcs.maxsvcs = newmaxsvcs; + + /* search for a free space again */ + idx = ds_walk_svcs(ds_svc_isfree, NULL); + + /* the table is locked so should find a free slot */ + ASSERT(idx != ds_svcs.maxsvcs); + +found: + /* allocate a new svc structure if necessary */ + if ((newsvc = ds_svcs.tbl[idx]) == NULL) { + /* allocate a new service */ + newsvc = kmem_zalloc(sizeof (ds_svc_t), KM_SLEEP); + ds_svcs.tbl[idx] = newsvc; + } + + /* fill in the handle */ + newsvc->hdl = DS_ALLOC_HDL(idx, DS_HDL2COUNT(newsvc->hdl)); + + return (newsvc); +} + +static void +ds_reset_svc(ds_svc_t *svc, ds_port_t *port) +{ + svc->state = DS_SVC_INACTIVE; + svc->ver_idx = 0; + svc->ver.major = 0; + svc->ver.minor = 0; + svc->port = NULL; + DS_PORTSET_DEL(svc->avail, port->id); +} + +static ds_svc_t * +ds_get_svc(ds_svc_hdl_t hdl) +{ + int idx; + ds_svc_t *svc; + + ASSERT(RW_LOCK_HELD(&ds_svcs.rwlock)); + + if (hdl == DS_INVALID_HDL) + return (NULL); + + idx = DS_HDL2IDX(hdl); + + /* check if index is out of bounds */ + if ((idx < 0) || (idx >= ds_svcs.maxsvcs)) + return (NULL); + + svc = ds_svcs.tbl[idx]; + + /* check for a valid service */ + if (DS_SVC_ISFREE(svc)) + return (NULL); + + /* make sure the handle is an exact match */ + if (svc->hdl != hdl) + return (NULL); + + return (svc); +} + +static int +ds_port_add(md_t *mdp, mde_cookie_t port, mde_cookie_t chan) +{ + ds_port_t *newport; + uint64_t port_id; + uint64_t ldc_id; + + /* get the ID for this port */ + if (md_get_prop_val(mdp, port, "id", &port_id) != 0) { + cmn_err(CE_NOTE, "ds_port_add: port 'id' property not found"); + return (-1); + } + + /* sanity check the port id 
*/ + if (port_id > DS_MAX_PORT_ID) { + cmn_err(CE_WARN, "ds_port_add: port ID %ld out of range", + port_id); + return (-1); + } + + DS_DBG("ds_port_add: adding port ds@%ld\n", port_id); + + /* get the channel ID for this port */ + if (md_get_prop_val(mdp, chan, "id", &ldc_id) != 0) { + cmn_err(CE_NOTE, "ds@%lx: add_port: no channel 'id' property", + port_id); + return (-1); + } + + /* get the port structure from the array of ports */ + newport = &ds_ports[port_id]; + + /* check for a duplicate port in the MD */ + if (newport->state != DS_PORT_FREE) { + cmn_err(CE_NOTE, "ds@%lx: add_port: port already exists", + port_id); + return (-1); + } + + /* initialize the port lock */ + mutex_init(&newport->lock, NULL, MUTEX_DRIVER, NULL); + + /* initialize the port */ + newport->id = port_id; + newport->state = DS_PORT_INIT; + newport->ldc.id = ldc_id; + + /* add the port to the set of all ports */ + DS_PORTSET_ADD(ds_allports, port_id); + + return (0); +} + +static void +ds_port_reset(ds_port_t *port) +{ + ASSERT(MUTEX_HELD(&port->lock)); + + /* connection went down, mark everything inactive */ + rw_enter(&ds_svcs.rwlock, RW_WRITER); + + (void) ds_walk_svcs(ds_svc_unregister, port); + + rw_exit(&ds_svcs.rwlock); + + port->ver_idx = 0; + port->ver.major = 0; + port->ver.minor = 0; + port->state = DS_PORT_LDC_INIT; +} + +/* + * Verify that a version array is sorted as expected for the + * version negotiation to work correctly. + */ +static ds_vers_check_t +ds_vers_isvalid(ds_ver_t *vers, int nvers) +{ + uint16_t curr_major; + uint16_t curr_minor; + int idx; + + curr_major = vers[0].major; + curr_minor = vers[0].minor; + + /* + * Walk the version array, verifying correct ordering. + * The array must be sorted from highest supported + * version to lowest supported version. 
+ */ + for (idx = 0; idx < nvers; idx++) { + if (vers[idx].major > curr_major) { + DS_DBG("vers_isvalid: version array has increasing " + "major versions\n"); + return (DS_VERS_INCREASING_MAJOR_ERR); + } + + if (vers[idx].major < curr_major) { + curr_major = vers[idx].major; + curr_minor = vers[idx].minor; + continue; + } + + if (vers[idx].minor > curr_minor) { + DS_DBG("vers_isvalid: version array has increasing " + "minor versions\n"); + return (DS_VERS_INCREASING_MINOR_ERR); + } + + curr_minor = vers[idx].minor; + } + + return (DS_VERS_OK); +} + +/* + * Logging Support + */ +static void +ds_log_init(void) +{ + ds_log_entry_t *new; + + /* initialize global lock */ + mutex_init(&ds_log.lock, NULL, MUTEX_DRIVER, NULL); + + mutex_enter(&ds_log.lock); + + /* initialize the log */ + ds_log.head = NULL; + ds_log.size = 0; + ds_log.nentry = 0; + + /* initialize the free list */ + for (new = ds_log_entry_pool; new < DS_LOG_POOL_END; new++) { + new->next = ds_log.freelist; + ds_log.freelist = new; + } + + mutex_exit(&ds_log.lock); + + DS_DBG_LOG("ds_log initialized: size=%d bytes, limit=%d bytes, " + "ninit=%ld\n", ds_log_sz, DS_LOG_LIMIT, DS_LOG_NPOOL); +} + +static void +ds_log_fini(void) +{ + ds_log_entry_t *next; + + mutex_enter(&ds_log.lock); + + /* clear out the log */ + while (ds_log.nentry > 0) + (void) ds_log_remove(); + + /* + * Now all the entries are on the free list. + * Clear out the free list, deallocating any + * entry that was dynamically allocated. 
+ */ + while (ds_log.freelist != NULL) { + next = ds_log.freelist->next; + + if (!DS_IS_POOL_ENTRY(ds_log.freelist)) { + kmem_free(ds_log.freelist, sizeof (ds_log_entry_t)); + } + + ds_log.freelist = next; + } + + mutex_exit(&ds_log.lock); + + mutex_destroy(&ds_log.lock); +} + +static ds_log_entry_t * +ds_log_entry_alloc(void) +{ + ds_log_entry_t *new = NULL; + + ASSERT(MUTEX_HELD(&ds_log.lock)); + + if (ds_log.freelist != NULL) { + new = ds_log.freelist; + ds_log.freelist = ds_log.freelist->next; + } + + if (new == NULL) { + /* free list was empty */ + new = kmem_zalloc(sizeof (ds_log_entry_t), KM_SLEEP); + } + + ASSERT(new); + + return (new); +} + +static void +ds_log_entry_free(ds_log_entry_t *entry) +{ + ASSERT(MUTEX_HELD(&ds_log.lock)); + + if (entry == NULL) + return; + + if (entry->data != NULL) { + kmem_free(entry->data, entry->datasz); + entry->data = NULL; + } + + /* place entry on the free list */ + entry->next = ds_log.freelist; + ds_log.freelist = entry; +} + +/* + * Add a message to the end of the log + */ +static int +ds_log_add(ds_log_entry_t *new) +{ + ASSERT(MUTEX_HELD(&ds_log.lock)); + + if (ds_log.head == NULL) { + + new->prev = new; + new->next = new; + + ds_log.head = new; + } else { + ds_log_entry_t *head = ds_log.head; + ds_log_entry_t *tail = ds_log.head->prev; + + new->next = head; + new->prev = tail; + tail->next = new; + head->prev = new; + } + + /* increase the log size, including the metadata size */ + ds_log.size += DS_LOG_ENTRY_SZ(new); + ds_log.nentry++; + + DS_DBG_LOG("ds_log: added %ld data bytes, %ld total bytes\n", + new->datasz, DS_LOG_ENTRY_SZ(new)); + + return (0); +} + +/* + * Remove an entry from the head of the log + */ +static int +ds_log_remove(void) +{ + ds_log_entry_t *head; + + ASSERT(MUTEX_HELD(&ds_log.lock)); + + head = ds_log.head; + + /* empty list */ + if (head == NULL) + return (0); + + if (head->next == ds_log.head) { + /* one element list */ + ds_log.head = NULL; + } else { + head->next->prev = head->prev; + 
head->prev->next = head->next; + ds_log.head = head->next; + } + + DS_DBG_LOG("ds_log: removed %ld data bytes, %ld total bytes\n", + head->datasz, DS_LOG_ENTRY_SZ(head)); + + ds_log.size -= DS_LOG_ENTRY_SZ(head); + ds_log.nentry--; + + ASSERT((ds_log.size >= 0) && (ds_log.nentry >= 0)); + + ds_log_entry_free(head); + + return (0); +} + +/* + * Replace the data in the entry at the front of the list with then + * new data. This has the effect of removing the oldest entry and + * adding the new entry. + */ +static int +ds_log_replace(uint8_t *msg, size_t sz) +{ + ds_log_entry_t *head; + + ASSERT(MUTEX_HELD(&ds_log.lock)); + + head = ds_log.head; + + DS_DBG_LOG("ds_log: replaced %ld data bytes (%ld total) with %ld data " + "bytes (%ld total)\n", head->datasz, DS_LOG_ENTRY_SZ(head), + sz, sz + sizeof (ds_log_entry_t)); + + ds_log.size -= DS_LOG_ENTRY_SZ(head); + + ASSERT((ds_log.size >= 0) && (ds_log.nentry >= 0)); + + kmem_free(head->data, head->datasz); + head->data = msg; + head->datasz = sz; + + ds_log.size += DS_LOG_ENTRY_SZ(head); + + ds_log.head = head->next; + + return (0); +} + +static void +ds_log_purge(void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + mutex_enter(&ds_log.lock); + + DS_DBG_LOG("ds_log: purging oldest log entries\n"); + + while ((ds_log.nentry) && (ds_log.size >= ds_log_sz)) { + (void) ds_log_remove(); + } + + mutex_exit(&ds_log.lock); +} + +static int +ds_log_add_msg(int32_t dest, uint8_t *msg, size_t sz) +{ + int rv = 0; + + mutex_enter(&ds_log.lock); + + /* check if the log is larger than the soft limit */ + if ((ds_log.nentry) && ((ds_log.size + sz) >= ds_log_sz)) { + /* + * The log is larger than the soft limit. + * Swap the oldest entry for the newest. + */ + DS_DBG_LOG("ds_log: replacing oldest entry with new entry\n"); + (void) ds_log_replace(msg, sz); + } else { + /* + * Still have headroom under the soft limit. + * Add the new entry to the log. 
+ */ + ds_log_entry_t *new; + + new = ds_log_entry_alloc(); + + /* fill in message data */ + new->data = msg; + new->datasz = sz; + new->timestamp = ddi_get_time(); + new->dest = dest; + + rv = ds_log_add(new); + } + + /* check if the log is larger than the hard limit */ + if ((ds_log.nentry > 1) && (ds_log.size >= DS_LOG_LIMIT)) { + /* + * Wakeup the thread to remove entries + * from the log until it is smaller than + * the soft limit. + */ + DS_DBG_LOG("ds_log: log exceeded %d bytes, scheduling a " + "purge...\n", DS_LOG_LIMIT); + + if (DS_DISPATCH(ds_log_purge, (void *)msg) == NULL) { + cmn_err(CE_NOTE, "ds_log: purge thread failed to " + "start"); + } + } + + mutex_exit(&ds_log.lock); + + return (rv); +} + +/* + * Client Interface + */ + +int +ds_cap_init(ds_capability_t *cap, ds_clnt_ops_t *ops) +{ + int idx; + ds_vers_check_t status; + ds_svc_t *svc; + + /* sanity check the args */ + if ((cap == NULL) || (ops == NULL)) { + cmn_err(CE_NOTE, "ds_cap_init: invalid arguments"); + return (EINVAL); + } + + /* sanity check the capability specifier */ + if ((cap->svc_id == NULL) || (cap->vers == NULL) || (cap->nvers == 0)) { + cmn_err(CE_NOTE, "ds_cap_init: invalid capability specifier"); + return (EINVAL); + } + + /* sanity check the version array */ + if ((status = ds_vers_isvalid(cap->vers, cap->nvers)) != DS_VERS_OK) { + cmn_err(CE_NOTE, "ds_cap_init: invalid capability " + "version array for %s service: %s", cap->svc_id, + (status == DS_VERS_INCREASING_MAJOR_ERR) ? 
+ "increasing major versions" : + "increasing minor versions"); + return (EINVAL); + } + + /* data and register callbacks are required */ + if ((ops->ds_data_cb == NULL) || (ops->ds_reg_cb == NULL)) { + cmn_err(CE_NOTE, "ds_cap_init: invalid ops specifier for " + "%s service", cap->svc_id); + return (EINVAL); + } + + DS_DBG("ds_cap_init: svc_id='%s', data_cb=0x%lx, cb_arg=0x%lx\n", + cap->svc_id, (uint64_t)ops->ds_data_cb, (uint64_t)ops->cb_arg); + + rw_enter(&ds_svcs.rwlock, RW_WRITER); + + /* check if the service is already registered */ + idx = ds_walk_svcs(ds_svc_ismatch, cap->svc_id); + if (idx != ds_svcs.maxsvcs) { + /* already registered */ + cmn_err(CE_NOTE, "service '%s' already registered", + cap->svc_id); + rw_exit(&ds_svcs.rwlock); + return (EALREADY); + } + + svc = ds_alloc_svc(); + + /* copy over all the client information */ + bcopy(cap, &svc->cap, sizeof (ds_capability_t)); + + /* make a copy of the service name */ + svc->cap.svc_id = kmem_zalloc(strlen(cap->svc_id) + 1, KM_SLEEP); + (void) strncpy(svc->cap.svc_id, cap->svc_id, strlen(cap->svc_id)); + + /* make a copy of the version array */ + svc->cap.vers = kmem_zalloc(cap->nvers * sizeof (ds_ver_t), KM_SLEEP); + bcopy(cap->vers, svc->cap.vers, cap->nvers * sizeof (ds_ver_t)); + + /* copy the client ops vector */ + bcopy(ops, &svc->ops, sizeof (ds_clnt_ops_t)); + + svc->state = DS_SVC_INACTIVE; + svc->ver_idx = 0; + DS_PORTSET_DUP(svc->avail, ds_allports); + + ds_svcs.nsvcs++; + + rw_exit(&ds_svcs.rwlock); + + /* attempt to register the service */ + (void) ds_svc_register(svc, NULL); + + DS_DBG("ds_cap_init: service '%s' assigned handle 0x%lx\n", + svc->cap.svc_id, svc->hdl); + + return (0); +} + +int +ds_cap_fini(ds_capability_t *cap) +{ + int idx; + ds_svc_t *svc; + ds_svc_hdl_t tmp_hdl; + + rw_enter(&ds_svcs.rwlock, RW_WRITER); + + /* make sure the service is registered */ + idx = ds_walk_svcs(ds_svc_ismatch, cap->svc_id); + if (idx == ds_svcs.maxsvcs) { + /* service is not registered */ + 
cmn_err(CE_NOTE, "ds_cap_fini: unknown service '%s'", + cap->svc_id); + rw_exit(&ds_svcs.rwlock); + return (EINVAL); + } + + svc = ds_svcs.tbl[idx]; + + DS_DBG("ds_cap_fini: svcid='%s', hdl=0x%lx\n", svc->cap.svc_id, + svc->hdl); + + /* + * Attempt to send an unregister notification. Even + * if sending the message fails, the local unregister + * request must be honored, since this indicates that + * the client will no longer handle incoming requests. + */ + (void) ds_send_unreg_req(svc); + + /* + * Clear out the structure, but do not deallocate the + * memory. It can be reused for the next registration. + */ + kmem_free(svc->cap.svc_id, strlen(svc->cap.svc_id) + 1); + kmem_free(svc->cap.vers, svc->cap.nvers * sizeof (ds_ver_t)); + + /* save the handle to prevent reuse */ + tmp_hdl = svc->hdl; + bzero(svc, sizeof (ds_svc_t)); + + /* initialize for next use */ + svc->hdl = tmp_hdl; + svc->state = DS_SVC_FREE; + + ds_svcs.nsvcs--; + + rw_exit(&ds_svcs.rwlock); + + return (0); +} + +int +ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t len) +{ + int rv; + ds_hdr_t *hdr; + caddr_t msg; + size_t msglen; + size_t hdrlen; + caddr_t payload; + ds_svc_t *svc; + ds_port_t *port; + ds_data_handle_t *data; + + rw_enter(&ds_svcs.rwlock, RW_READER); + + if ((hdl == DS_INVALID_HDL) || (svc = ds_get_svc(hdl)) == NULL) { + cmn_err(CE_NOTE, "ds_cap_send: invalid handle 0x%lx", hdl); + rw_exit(&ds_svcs.rwlock); + return (EINVAL); + } + + if ((port = svc->port) == NULL) { + cmn_err(CE_NOTE, "ds_cap_send: service '%s' not associated " + "with a port", svc->cap.svc_id); + rw_exit(&ds_svcs.rwlock); + return (ECONNRESET); + } + + mutex_enter(&port->lock); + + /* check that the LDC channel is ready */ + if (port->ldc.state != LDC_UP) { + cmn_err(CE_NOTE, "ds_cap_send: LDC channel is not up"); + mutex_exit(&port->lock); + rw_exit(&ds_svcs.rwlock); + return (ECONNRESET); + } + + + if (svc->state != DS_SVC_ACTIVE) { + /* channel is up, but svc is not registered */ + cmn_err(CE_NOTE, 
"ds_cap_send: invalid service state 0x%x", + svc->state); + mutex_exit(&port->lock); + rw_exit(&ds_svcs.rwlock); + return (EINVAL); + } + + hdrlen = DS_HDR_SZ + sizeof (ds_data_handle_t); + + msg = kmem_zalloc(len + hdrlen, KM_SLEEP); + hdr = (ds_hdr_t *)msg; + payload = msg + hdrlen; + msglen = len + hdrlen; + + hdr->payload_len = len + sizeof (ds_data_handle_t); + hdr->msg_type = DS_DATA; + + data = (ds_data_handle_t *)(msg + DS_HDR_SZ); + data->svc_handle = hdl; + + if ((buf != NULL) && (len != 0)) { + bcopy(buf, payload, len); + } + + DS_DBG("ds@%lx: data>: hdl=0x%lx, len=%ld, payload_len=%d\n", + port->id, svc->hdl, msglen, hdr->payload_len); + + if ((rv = ds_send_msg(port, msg, msglen)) != 0) { + rv = (rv == EIO) ? ECONNRESET : rv; + } + + mutex_exit(&port->lock); + rw_exit(&ds_svcs.rwlock); + + return (rv); +} diff --git a/usr/src/uts/sun4v/io/fault_iso.c b/usr/src/uts/sun4v/io/fault_iso.c new file mode 100644 index 0000000000..0123c19291 --- /dev/null +++ b/usr/src/uts/sun4v/io/fault_iso.c @@ -0,0 +1,453 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v Fault Isolation Services Module + */ + +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/machsystm.h> +#include <sys/processor.h> +#include <sys/mem.h> +#include <vm/page.h> +#include <sys/note.h> +#include <sys/ds.h> +#include <sys/fault_iso.h> + +/* + * Debugging routines + */ +#ifdef DEBUG +uint_t fi_debug = 0x0; +#define FI_DBG if (fi_debug) cmn_err +#else /* DEBUG */ +#define FI_DBG _NOTE(CONSTCOND) if (0) cmn_err +#endif /* DEBUG */ + +/* + * Domains Services interaction + */ +static ds_svc_hdl_t cpu_handle; +static ds_svc_hdl_t mem_handle; + +static ds_ver_t fi_vers[] = { { 1, 0 } }; +#define FI_NVERS (sizeof (fi_vers) / sizeof (fi_vers[0])) + +static ds_capability_t cpu_cap = { + "fma-cpu-service", /* svc_id */ + fi_vers, /* vers */ + FI_NVERS /* nvers */ +}; + +static ds_capability_t mem_cap = { + "fma-mem-service", /* svc_id */ + fi_vers, /* vers */ + FI_NVERS /* nvers */ +}; + +static void fi_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl); +static void fi_unreg_handler(ds_cb_arg_t arg); + +static void cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen); +static void mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen); + +static ds_clnt_ops_t cpu_ops = { + fi_reg_handler, /* ds_reg_cb */ + fi_unreg_handler, /* ds_unreg_cb */ + cpu_data_handler, /* ds_data_cb */ + &cpu_handle /* cb_arg */ +}; + +static ds_clnt_ops_t mem_ops = { + fi_reg_handler, /* ds_reg_cb */ + fi_unreg_handler, /* ds_unreg_cb */ + mem_data_handler, /* ds_data_cb */ + &mem_handle /* cb_arg */ +}; + +static int fi_init(void); +static void fi_fini(void); + +static struct modlmisc modlmisc = { + &mod_miscops, + "sun4v Fault Isolation Services %I%" +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlmisc, + NULL +}; + +int +_init(void) +{ + int rv; + + if ((rv = fi_init()) != 0) + return (rv); + + if ((rv = mod_install(&modlinkage)) != 0) + fi_fini(); + + return 
(rv); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int fi_allow_unload; + +int +_fini(void) +{ + int status; + + if (fi_allow_unload == 0) + return (EBUSY); + + if ((status = mod_remove(&modlinkage)) == 0) + fi_fini(); + + return (status); +} + +static int +fi_init(void) +{ + int rv; + + /* register CPU service with domain services framework */ + rv = ds_cap_init(&cpu_cap, &cpu_ops); + if (rv != 0) { + FI_DBG(CE_CONT, "ds_cap_init failed: %d", rv); + return (rv); + } + + /* register MEM servicewith domain services framework */ + rv = ds_cap_init(&mem_cap, &mem_ops); + if (rv != 0) { + FI_DBG(CE_CONT, "ds_cap_init failed: %d", rv); + (void) ds_cap_fini(&cpu_cap); + return (rv); + } + + return (rv); +} + +static void +fi_fini(void) +{ + /* + * Stop incoming requests from Zeus + */ + (void) ds_cap_fini(&cpu_cap); + (void) ds_cap_fini(&mem_cap); +} + +static void +cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + _NOTE(ARGUNUSED(arg)) + + fma_cpu_service_req_t *msg = buf; + fma_cpu_resp_t resp_msg; + int rv = 0; + int cpu_status; + int resp_back = 0; + + /* + * If the buffer is the wrong size for CPU calls or is NULL then + * do not return any message. The call from the ldom mgr. will time out + * and the response will be NULL. + */ + if (msg == NULL || buflen != sizeof (fma_cpu_service_req_t)) { + return; + } + + FI_DBG(CE_CONT, "req_num = %ld, msg_type = %d, cpu_id = %d\n", + msg->req_num, msg->msg_type, msg->cpu_id); + + resp_msg.req_num = msg->req_num; + + switch (msg->msg_type) { + case FMA_CPU_REQ_STATUS: + rv = p_online_internal(msg->cpu_id, P_STATUS, + &cpu_status); + if (rv == EINVAL) { + FI_DBG(CE_CONT, "Failed p_online call failed." 
+ "Invalid CPU\n"); + resp_msg.result = FMA_CPU_RESP_FAILURE; + resp_msg.status = FMA_CPU_STAT_ILLEGAL; + resp_back = 1; + } + break; + case FMA_CPU_REQ_OFFLINE: + rv = p_online_internal(msg->cpu_id, P_OFFLINE, + &cpu_status); + if (rv == EINVAL) { + FI_DBG(CE_CONT, "Failed p_online call failed." + "Invalid CPU\n"); + resp_msg.result = FMA_CPU_RESP_FAILURE; + resp_msg.status = FMA_CPU_STAT_ILLEGAL; + resp_back = 1; + } else if (rv == EBUSY) { + FI_DBG(CE_CONT, "Failed p_online call failed." + "Tried to offline while busy\n"); + resp_msg.result = FMA_CPU_RESP_FAILURE; + resp_msg.status = FMA_CPU_STAT_ONLINE; + resp_back = 1; + } + break; + case FMA_CPU_REQ_ONLINE: + rv = p_online_internal(msg->cpu_id, P_ONLINE, + &cpu_status); + if (rv == EINVAL) { + FI_DBG(CE_CONT, "Failed p_online call failed." + "Invalid CPU\n"); + resp_msg.result = FMA_CPU_RESP_FAILURE; + resp_msg.status = FMA_CPU_STAT_ILLEGAL; + resp_back = 1; + } else if (rv == ENOTSUP) { + FI_DBG(CE_CONT, "Failed p_online call failed." + "Online not supported for single CPU\n"); + resp_msg.result = FMA_CPU_RESP_FAILURE; + resp_msg.status = FMA_CPU_STAT_OFFLINE; + resp_back = 1; + } + break; + default: + /* + * If the msg_type was of unknown type simply return and + * have the ldom mgr. time out with a NULL response. 
+ */ + return; + } + + if (rv != 0) { + if (resp_back) { + if ((rv = ds_cap_send(cpu_handle, &resp_msg, + sizeof (resp_msg))) != 0) { + FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n", + rv); + } + return; + } + ASSERT((rv == EINVAL) || ((rv == EBUSY) && + (msg->msg_type == FMA_CPU_REQ_OFFLINE)) || + ((rv == ENOTSUP) && + (msg->msg_type == FMA_CPU_REQ_ONLINE))); + + cmn_err(CE_WARN, "p_online_internal error not handled " + "rv = %d\n", rv); + } + + resp_msg.req_num = msg->req_num; + resp_msg.result = FMA_CPU_RESP_OK; + + switch (cpu_status) { + case P_OFFLINE: + case P_FAULTED: + case P_POWEROFF: + case P_SPARE: + resp_msg.status = FMA_CPU_STAT_OFFLINE; + break; + case P_ONLINE: + case P_NOINTR: + resp_msg.status = FMA_CPU_STAT_ONLINE; + break; + default: + resp_msg.status = FMA_CPU_STAT_ILLEGAL; + } + + if ((rv = ds_cap_send(cpu_handle, &resp_msg, + sizeof (resp_msg))) != 0) { + FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n", rv); + } +} + +static void +mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + _NOTE(ARGUNUSED(arg)) + + fma_mem_service_req_t *msg = buf; + fma_mem_resp_t resp_msg; + int rv = 0; + + /* + * If the buffer is the wrong size for Mem calls or is NULL then + * do not return any message. The call from the ldom mgr. will time out + * and the response will be NULL. 
+ */ + if (msg == NULL || buflen != sizeof (fma_mem_service_req_t)) { + return; + } + + FI_DBG(CE_CONT, "req_num = %ld, msg_type = %d, memory addr = 0x%lx" + "memory length = 0x%lx\n", msg->req_num, msg->msg_type, + msg->real_addr, msg->length); + + resp_msg.req_num = msg->req_num; + resp_msg.res_addr = msg->real_addr; + resp_msg.res_length = msg->length; + + /* + * Information about return values for page calls can be referenced + * in usr/src/uts/common/vm/page_retire.c + */ + switch (msg->msg_type) { + case FMA_MEM_REQ_STATUS: + rv = page_retire_check(msg->real_addr, NULL); + switch (rv) { + /* Page is retired */ + case 0: + resp_msg.result = FMA_MEM_RESP_OK; + resp_msg.status = FMA_MEM_STAT_RETIRED; + break; + /* Page is pending. Send back failure and not retired */ + case EAGAIN: + resp_msg.result = FMA_MEM_RESP_FAILURE; + resp_msg.status = FMA_MEM_STAT_NOTRETIRED; + break; + /* Page is not retired. */ + case EIO: + resp_msg.result = FMA_MEM_RESP_FAILURE; + resp_msg.status = FMA_MEM_STAT_NOTRETIRED; + break; + /* PA is not valid */ + case EINVAL: + resp_msg.result = FMA_MEM_RESP_FAILURE; + resp_msg.status = FMA_MEM_STAT_ILLEGAL; + break; + default: + ASSERT((rv == 0) || (rv == EAGAIN) || (rv == EIO) || + (rv == EINVAL)); + cmn_err(CE_WARN, "fault_iso: return value from " + "page_retire_check invalid: %d\n", rv); + } + break; + case FMA_MEM_REQ_RETIRE: + rv = page_retire(msg->real_addr, PR_FMA); + switch (rv) { + /* Page retired successfully */ + case 0: + resp_msg.result = FMA_MEM_RESP_OK; + resp_msg.status = FMA_MEM_STAT_RETIRED; + break; + /* Tried to retire and now Pending retirement */ + case EAGAIN: + resp_msg.result = FMA_MEM_RESP_FAILURE; + resp_msg.status = FMA_MEM_STAT_NOTRETIRED; + break; + /* Did not try to retire. 
/* Page successfully unretired */
+ */ + return; + } + + if ((rv = ds_cap_send(mem_handle, &resp_msg, sizeof (resp_msg))) != 0) { + FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n", rv); + } +} + +static void +fi_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl) +{ + FI_DBG(CE_CONT, "fi_reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", + arg, ver->major, ver->minor, hdl); + + if ((ds_svc_hdl_t *)arg == &cpu_handle) + cpu_handle = hdl; + if ((ds_svc_hdl_t *)arg == &mem_handle) + mem_handle = hdl; +} + +static void +fi_unreg_handler(ds_cb_arg_t arg) +{ + FI_DBG(CE_CONT, "fi_unreg_handler: arg=0x%p\n", arg); + + if ((ds_svc_hdl_t *)arg == &cpu_handle) + cpu_handle = DS_INVALID_HDL; + if ((ds_svc_hdl_t *)arg == &mem_handle) + mem_handle = DS_INVALID_HDL; +} diff --git a/usr/src/uts/sun4v/io/ldc.c b/usr/src/uts/sun4v/io/ldc.c new file mode 100644 index 0000000000..87b02588ca --- /dev/null +++ b/usr/src/uts/sun4v/io/ldc.c @@ -0,0 +1,5609 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v LDC Transport Layer + */ +#include <sys/types.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */ +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/cred.h> +#include <sys/promif.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cyclic.h> +#include <sys/machsystm.h> +#include <sys/vm.h> +#include <sys/cpu.h> +#include <sys/intreg.h> +#include <sys/machcpuvar.h> +#include <sys/note.h> +#include <sys/ivintr.h> +#include <sys/hypervisor_api.h> +#include <sys/ldc.h> +#include <sys/ldc_impl.h> +#include <sys/cnex.h> +#include <sys/hsvc.h> + +/* Core internal functions */ +static int i_ldc_h2v_error(int h_error); +static int i_ldc_txq_reconf(ldc_chan_t *ldcp); +static int i_ldc_rxq_reconf(ldc_chan_t *ldcp); +static void i_ldc_reset_state(ldc_chan_t *ldcp); +static void i_ldc_reset(ldc_chan_t *ldcp); + +static int i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail); +static int i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail); +static int i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head); +static int i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype, + uint8_t ctrlmsg); + +/* Interrupt handling functions */ +static uint_t i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2); +static uint_t i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2); +static void i_ldc_clear_intr(ldc_chan_t *ldcp, cnex_intrtype_t itype); + +/* Read method functions */ +static int i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep); +static int i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, + size_t *sizep); +static int i_ldc_read_stream(ldc_chan_t *ldcp, caddr_t target_bufp, + size_t *sizep); + +/* Write method functions */ +static int i_ldc_write_raw(ldc_chan_t 
*ldcp, caddr_t target_bufp, + size_t *sizep); +static int i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t target_bufp, + size_t *sizep); +static int i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t target_bufp, + size_t *sizep); + +/* Pkt processing internal functions */ +static int i_ldc_check_seqid(ldc_chan_t *ldcp, ldc_msg_t *ldcmsg); +static int i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *ldcmsg); +static int i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg); +static int i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg); +static int i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg); +static int i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg); +static int i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg); + +/* Memory synchronization internal functions */ +static int i_ldc_mem_acquire_release(ldc_mem_handle_t mhandle, + uint8_t direction, uint64_t offset, size_t size); +static int i_ldc_dring_acquire_release(ldc_dring_handle_t dhandle, + uint8_t direction, uint64_t start, uint64_t end); + +/* LDC Version */ +static ldc_ver_t ldc_versions[] = { {1, 0} }; + +/* number of supported versions */ +#define LDC_NUM_VERS (sizeof (ldc_versions) / sizeof (ldc_versions[0])) + +/* Module State Pointer */ +static ldc_soft_state_t *ldcssp; + +static struct modldrv md = { + &mod_miscops, /* This is a misc module */ + "sun4v LDC module v%I%", /* Name of the module */ +}; + +static struct modlinkage ml = { + MODREV_1, + &md, + NULL +}; + +static uint64_t ldc_sup_minor; /* Supported minor number */ +static hsvc_info_t ldc_hsvc = { + HSVC_REV_1, NULL, HSVC_GROUP_LDC, 1, 0, "ldc" +}; + +static uint64_t intr_sup_minor; /* Supported minor number */ +static hsvc_info_t intr_hsvc = { + HSVC_REV_1, NULL, HSVC_GROUP_INTR, 1, 0, "ldc" +}; + +#ifdef DEBUG + +/* + * Print debug messages + * + * set ldcdbg to 0x7 for enabling all msgs + * 0x4 - Warnings + * 0x2 - All debug messages + * 0x1 - Minimal debug messages + * + * set ldcdbgchan to the channel number you want to debug + * 
setting it to -1 prints debug messages for all channels + * NOTE: ldcdbgchan has no effect on error messages + */ + +#define DBG_ALL_LDCS -1 + +int ldcdbg = 0x0; +int64_t ldcdbgchan = DBG_ALL_LDCS; + +static void +ldcdebug(int64_t id, const char *fmt, ...) +{ + char buf[512]; + va_list ap; + + /* + * Do not return if, + * caller wants to print it anyway - (id == DBG_ALL_LDCS) + * debug channel is set to all LDCs - (ldcdbgchan == DBG_ALL_LDCS) + * debug channel = caller specified channel + */ + if ((id != DBG_ALL_LDCS) && + (ldcdbgchan != DBG_ALL_LDCS) && + (ldcdbgchan != id)) { + return; + } + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + cmn_err(CE_CONT, "?%s\n", buf); +} + +#define D1 \ +if (ldcdbg & 0x01) \ + ldcdebug + +#define D2 \ +if (ldcdbg & 0x02) \ + ldcdebug + +#define DWARN \ +if (ldcdbg & 0x04) \ + ldcdebug + +#define DUMP_PAYLOAD(id, addr) \ +{ \ + char buf[65*3]; \ + int i; \ + uint8_t *src = (uint8_t *)addr; \ + for (i = 0; i < 64; i++, src++) \ + (void) sprintf(&buf[i * 3], "|%02x", *src); \ + (void) sprintf(&buf[i * 3], "|\n"); \ + D2((id), "payload: %s", buf); \ +} + +#define DUMP_LDC_PKT(c, s, addr) \ +{ \ + ldc_msg_t *msg = (ldc_msg_t *)(addr); \ + uint32_t mid = ((c)->mode != LDC_MODE_RAW) ? msg->seqid : 0; \ + if (msg->type == LDC_DATA) { \ + D2((c)->id, "%s: msg%d (/%x/%x/%x/,env[%c%c,sz=%d])", \ + (s), mid, msg->type, msg->stype, msg->ctrl, \ + (msg->env & LDC_FRAG_START) ? 'B' : ' ', \ + (msg->env & LDC_FRAG_STOP) ? 
'E' : ' ', \ + (msg->env & LDC_LEN_MASK)); \ + } else { \ + D2((c)->id, "%s: msg%d (/%x/%x/%x/,env=%x)", (s), \ + mid, msg->type, msg->stype, msg->ctrl, msg->env); \ + } \ +} + +#else + +#define DBG_ALL_LDCS -1 + +#define D1 +#define D2 +#define DWARN + +#define DUMP_PAYLOAD(id, addr) +#define DUMP_LDC_PKT(c, s, addr) + +#endif + +#define ZERO_PKT(p) \ + bzero((p), sizeof (ldc_msg_t)); + +#define IDX2COOKIE(idx, pg_szc, pg_shift) \ + (((pg_szc) << LDC_COOKIE_PGSZC_SHIFT) | ((idx) << (pg_shift))) + + +int +_init(void) +{ + int status; + + status = hsvc_register(&ldc_hsvc, &ldc_sup_minor); + if (status != 0) { + cmn_err(CE_WARN, "%s: cannot negotiate hypervisor LDC services" + " group: 0x%lx major: %ld minor: %ld errno: %d", + ldc_hsvc.hsvc_modname, ldc_hsvc.hsvc_group, + ldc_hsvc.hsvc_major, ldc_hsvc.hsvc_minor, status); + return (-1); + } + + status = hsvc_register(&intr_hsvc, &intr_sup_minor); + if (status != 0) { + cmn_err(CE_WARN, "%s: cannot negotiate hypervisor interrupt " + "services group: 0x%lx major: %ld minor: %ld errno: %d", + intr_hsvc.hsvc_modname, intr_hsvc.hsvc_group, + intr_hsvc.hsvc_major, intr_hsvc.hsvc_minor, status); + (void) hsvc_unregister(&ldc_hsvc); + return (-1); + } + + /* allocate soft state structure */ + ldcssp = kmem_zalloc(sizeof (ldc_soft_state_t), KM_SLEEP); + + /* Link the module into the system */ + status = mod_install(&ml); + if (status != 0) { + kmem_free(ldcssp, sizeof (ldc_soft_state_t)); + return (status); + } + + /* Initialize the LDC state structure */ + mutex_init(&ldcssp->lock, NULL, MUTEX_DRIVER, NULL); + + mutex_enter(&ldcssp->lock); + + ldcssp->channel_count = 0; + ldcssp->channels_open = 0; + ldcssp->chan_list = NULL; + ldcssp->dring_list = NULL; + + mutex_exit(&ldcssp->lock); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + /* Report status of the dynamically loadable driver module */ + return (mod_info(&ml, modinfop)); +} + +int +_fini(void) +{ + int rv, status; + ldc_chan_t *ldcp; + ldc_dring_t 
*dringp; + ldc_mem_info_t minfo; + + /* Unlink the driver module from the system */ + status = mod_remove(&ml); + if (status) { + DWARN(DBG_ALL_LDCS, "_fini: mod_remove failed\n"); + return (EIO); + } + + /* close and finalize channels */ + ldcp = ldcssp->chan_list; + while (ldcp != NULL) { + (void) ldc_close((ldc_handle_t)ldcp); + (void) ldc_fini((ldc_handle_t)ldcp); + + ldcp = ldcp->next; + } + + /* Free descriptor rings */ + dringp = ldcssp->dring_list; + while (dringp != NULL) { + dringp = dringp->next; + + rv = ldc_mem_dring_info((ldc_dring_handle_t)dringp, &minfo); + if (rv == 0 && minfo.status != LDC_UNBOUND) { + if (minfo.status == LDC_BOUND) { + (void) ldc_mem_dring_unbind( + (ldc_dring_handle_t)dringp); + } + if (minfo.status == LDC_MAPPED) { + (void) ldc_mem_dring_unmap( + (ldc_dring_handle_t)dringp); + } + } + + (void) ldc_mem_dring_destroy((ldc_dring_handle_t)dringp); + } + ldcssp->dring_list = NULL; + + /* + * We have successfully "removed" the driver. + * Destroying soft states + */ + mutex_destroy(&ldcssp->lock); + kmem_free(ldcssp, sizeof (ldc_soft_state_t)); + + (void) hsvc_unregister(&ldc_hsvc); + (void) hsvc_unregister(&intr_hsvc); + + return (status); +} + +/* -------------------------------------------------------------------------- */ + +/* + * LDC Transport Internal Functions + */ + +/* + * Translate HV Errors to sun4v error codes + */ +static int +i_ldc_h2v_error(int h_error) +{ + switch (h_error) { + + case H_EOK: + return (0); + + case H_ENORADDR: + return (EFAULT); + + case H_EBADPGSZ: + case H_EINVAL: + return (EINVAL); + + case H_EWOULDBLOCK: + return (EWOULDBLOCK); + + case H_ENOACCESS: + case H_ENOMAP: + return (EACCES); + + case H_EIO: + case H_ECPUERROR: + return (EIO); + + case H_ENOTSUPPORTED: + return (ENOTSUP); + + case H_ETOOMANY: + return (ENOSPC); + + case H_ECHANNEL: + return (ECHRNG); + default: + break; + } + + return (EIO); +} + +/* + * Reconfigure the transmit queue + */ +static int +i_ldc_txq_reconf(ldc_chan_t *ldcp) 
+{ + int rv; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries); + if (rv) { + cmn_err(CE_WARN, + "ldc_tx_qconf: (0x%lx) cannot set qconf", ldcp->id); + return (EIO); + } + rv = hv_ldc_tx_get_state(ldcp->id, &(ldcp->tx_head), + &(ldcp->tx_tail), &(ldcp->link_state)); + if (rv) { + cmn_err(CE_WARN, + "ldc_tx_get_state: (0x%lx) cannot get qptrs", ldcp->id); + return (EIO); + } + D1(ldcp->id, "ldc_tx_get_state: (0x%llx) h=0x%llx,t=0x%llx," + "s=0x%llx\n", ldcp->id, ldcp->tx_head, ldcp->tx_tail, + ldcp->link_state); + + return (0); +} + +/* + * Reconfigure the receive queue + */ +static int +i_ldc_rxq_reconf(ldc_chan_t *ldcp) +{ + int rv; + uint64_t rx_head, rx_tail; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail, + &(ldcp->link_state)); + if (rv) { + cmn_err(CE_WARN, + "ldc_rx_getstate: (0x%lx) cannot get state", + ldcp->id); + return (EIO); + } + + if (rx_head != rx_tail || ldcp->tstate > TS_READY) { + rv = hv_ldc_rx_qconf(ldcp->id, ldcp->rx_q_ra, + ldcp->rx_q_entries); + if (rv) { + cmn_err(CE_WARN, + "ldc_rx_qconf: (0x%lx) cannot set qconf", + ldcp->id); + return (EIO); + } + D1(ldcp->id, "ldc_rx_qconf: (0x%llx) completed qconf", + ldcp->id); + } + + return (0); +} + +/* + * Reset LDC state structure and its contents + */ +static void +i_ldc_reset_state(ldc_chan_t *ldcp) +{ + ASSERT(MUTEX_HELD(&ldcp->lock)); + ldcp->last_msg_snt = LDC_INIT_SEQID; + ldcp->last_ack_rcd = 0; + ldcp->last_msg_rcd = 0; + ldcp->tx_ackd_head = ldcp->tx_head; + ldcp->next_vidx = 0; + ldcp->hstate = 0; + ldcp->tstate = TS_OPEN; + ldcp->status = LDC_OPEN; + + if (ldcp->link_state == LDC_CHANNEL_UP || + ldcp->link_state == LDC_CHANNEL_RESET) { + + if (ldcp->mode == LDC_MODE_RAW) { + ldcp->status = LDC_UP; + ldcp->tstate = TS_UP; + } else { + ldcp->status = LDC_READY; + ldcp->tstate |= TS_LINK_READY; + } + } +} + +/* + * Reset a LDC channel + */ +static void +i_ldc_reset(ldc_chan_t *ldcp) +{ + 
D2(ldcp->id, "i_ldc_reset: (0x%llx) channel reset\n", ldcp->id); + + (void) i_ldc_txq_reconf(ldcp); + (void) i_ldc_rxq_reconf(ldcp); + i_ldc_reset_state(ldcp); +} + +/* + * Clear pending interrupts + */ +static void +i_ldc_clear_intr(ldc_chan_t *ldcp, cnex_intrtype_t itype) +{ + ldc_cnex_t *cinfo = &ldcssp->cinfo; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + if (cinfo->dip && ldcp->intr_pending) { + ldcp->intr_pending = B_FALSE; + (void) cinfo->clr_intr(cinfo->dip, ldcp->id, itype); + } +} + +/* + * Set the receive queue head + * Returns an error if it fails + */ +static int +i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head) +{ + int rv; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + rv = hv_ldc_rx_set_qhead(ldcp->id, head); + if (rv && rv != H_EWOULDBLOCK) { + cmn_err(CE_WARN, + "ldc_rx_set_qhead: (0x%lx) cannot set qhead", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + return (0); +} + + +/* + * Returns the tx_tail to be used for transfer + * Re-reads the TX queue ptrs if and only if the + * the cached head and tail are equal (queue is full) + */ +static int +i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail) +{ + int rv; + uint64_t current_head, new_tail; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + /* Read the head and tail ptrs from HV */ + rv = hv_ldc_tx_get_state(ldcp->id, + &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); + if (rv) { + cmn_err(CE_WARN, + "i_ldc_get_tx_tail: (0x%lx) cannot read qptrs\n", + ldcp->id); + return (EIO); + } + if (ldcp->link_state == LDC_CHANNEL_DOWN) { + DWARN(DBG_ALL_LDCS, + "i_ldc_get_tx_tail: (0x%llx) channel not ready\n", + ldcp->id); + return (ECONNRESET); + } + + /* In reliable mode, check against last ACKd msg */ + current_head = (ldcp->mode == LDC_MODE_RELIABLE || + ldcp->mode == LDC_MODE_STREAM) + ? 
ldcp->tx_ackd_head : ldcp->tx_head; + + /* increment the tail */ + new_tail = (ldcp->tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + if (new_tail == current_head) { + DWARN(ldcp->id, + "i_ldc_get_tx_tail: (0x%llx) TX queue is full\n", + ldcp->id); + return (EWOULDBLOCK); + } + + D2(ldcp->id, "i_ldc_get_tx_tail: (0x%llx) head=0x%llx, tail=0x%llx\n", + ldcp->id, ldcp->tx_head, ldcp->tx_tail); + + *tail = ldcp->tx_tail; + return (0); +} + +/* + * Set the tail pointer. If HV returns EWOULDBLOCK, it will back off + * and retry LDC_CHK_CNT times before returning an error. + * Returns 0, EWOULDBLOCK or EIO + */ +static int +i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail) +{ + int rv, retval = EWOULDBLOCK; + int loop_cnt, chk_cnt; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + for (chk_cnt = 0; chk_cnt < LDC_CHK_CNT; chk_cnt++) { + + if ((rv = hv_ldc_tx_set_qtail(ldcp->id, tail)) == 0) { + retval = 0; + break; + } + if (rv != H_EWOULDBLOCK) { + DWARN(ldcp->id, "i_ldc_set_tx_tail: (0x%llx) set " + "qtail=0x%llx failed, rv=%d\n", ldcp->id, tail, rv); + retval = EIO; + break; + } + + /* spin LDC_LOOP_CNT and then try again */ + for (loop_cnt = 0; loop_cnt < LDC_LOOP_CNT; loop_cnt++); + } + return (retval); +} + +/* + * Send a LDC message + */ +static int +i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype, + uint8_t ctrlmsg) +{ + int rv; + ldc_msg_t *pkt; + uint64_t tx_tail; + uint32_t curr_seqid = ldcp->last_msg_snt; + + ASSERT(MUTEX_HELD(&ldcp->lock)); + /* get the current tail for the message */ + rv = i_ldc_get_tx_tail(ldcp, &tx_tail); + if (rv) { + DWARN(ldcp->id, + "i_ldc_send_pkt: (0x%llx) error sending pkt, " + "type=0x%x,subtype=0x%x,ctrl=0x%x\n", + ldcp->id, pkttype, subtype, ctrlmsg); + return (rv); + } + + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(pkt); + + /* Initialize the packet */ + pkt->type = pkttype; + pkt->stype = subtype; + pkt->ctrl = ctrlmsg; + + /* Store ackid/seqid iff it is RELIABLE mode & not a 
RTS/RTR message */ + if (((ctrlmsg & LDC_CTRL_MASK) != LDC_RTS) && + ((ctrlmsg & LDC_CTRL_MASK) != LDC_RTR)) { + curr_seqid++; + if (ldcp->mode != LDC_MODE_RAW) { + pkt->seqid = curr_seqid; + pkt->ackid = ldcp->last_msg_rcd; + } + } + DUMP_LDC_PKT(ldcp, "i_ldc_send_pkt", (uint64_t)pkt); + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv) { + DWARN(ldcp->id, + "i_ldc_send_pkt:(0x%llx) error sending pkt, " + "type=0x%x,stype=0x%x,ctrl=0x%x\n", + ldcp->id, pkttype, subtype, ctrlmsg); + return (EIO); + } + + ldcp->last_msg_snt = curr_seqid; + ldcp->tx_tail = tx_tail; + + return (0); +} + +/* + * Checks if packet was received in right order + * in the case of a reliable transport. + * Returns 0 if in order, else EIO + */ +static int +i_ldc_check_seqid(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + /* No seqid checking for RAW mode */ + if (ldcp->mode == LDC_MODE_RAW) + return (0); + + /* No seqid checking for version, RTS, RTR message */ + if (msg->ctrl == LDC_VER || + msg->ctrl == LDC_RTS || + msg->ctrl == LDC_RTR) + return (0); + + /* Initial seqid to use is sent in RTS/RTR and saved in last_msg_rcd */ + if (msg->seqid != (ldcp->last_msg_rcd + 1)) { + DWARN(ldcp->id, + "i_ldc_check_seqid: (0x%llx) out-of-order pkt, got 0x%x, " + "expecting 0x%x\n", ldcp->id, msg->seqid, + (ldcp->last_msg_rcd + 1)); + return (EIO); + } + + return (0); +} + + +/* + * Process an incoming version ctrl message + */ +static int +i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv = 0, idx = ldcp->next_vidx; + ldc_msg_t *pkt; + uint64_t tx_tail; + ldc_ver_t *rcvd_ver; + + /* get the received version */ + rcvd_ver = (ldc_ver_t *)((uint64_t)msg + LDC_PAYLOAD_VER_OFF); + + D2(ldcp->id, "i_ldc_process_VER: (0x%llx) received VER v%u.%u\n", + ldcp->id, rcvd_ver->major, rcvd_ver->minor); + + switch (msg->stype) { + case LDC_INFO: + + /* get 
the current tail and pkt for the response */ + rv = i_ldc_get_tx_tail(ldcp, &tx_tail); + if (rv != 0) { + DWARN(ldcp->id, + "i_ldc_process_VER: (0x%llx) err sending " + "version ACK/NACK\n", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(pkt); + + /* initialize the packet */ + pkt->type = LDC_CTRL; + pkt->ctrl = LDC_VER; + + for (;;) { + + D1(ldcp->id, "i_ldc_process_VER: got %u.%u chk %u.%u\n", + rcvd_ver->major, rcvd_ver->minor, + ldc_versions[idx].major, ldc_versions[idx].minor); + + if (rcvd_ver->major == ldc_versions[idx].major) { + /* major version match - ACK version */ + pkt->stype = LDC_ACK; + + /* + * lower minor version to the one this endpt + * supports, if necessary + */ + if (rcvd_ver->minor > ldc_versions[idx].minor) + rcvd_ver->minor = + ldc_versions[idx].minor; + bcopy(rcvd_ver, pkt->udata, sizeof (*rcvd_ver)); + + break; + } + + if (rcvd_ver->major > ldc_versions[idx].major) { + + D1(ldcp->id, "i_ldc_process_VER: using next" + " lower idx=%d, v%u.%u\n", idx, + ldc_versions[idx].major, + ldc_versions[idx].minor); + + /* nack with next lower version */ + pkt->stype = LDC_NACK; + bcopy(&ldc_versions[idx], pkt->udata, + sizeof (ldc_versions[idx])); + ldcp->next_vidx = idx; + break; + } + + /* next major version */ + idx++; + + D1(ldcp->id, "i_ldc_process_VER: inc idx %x\n", idx); + + if (idx == LDC_NUM_VERS) { + /* no version match - send NACK */ + pkt->stype = LDC_NACK; + bzero(pkt->udata, sizeof (ldc_ver_t)); + ldcp->next_vidx = 0; + break; + } + } + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv == 0) { + ldcp->tx_tail = tx_tail; + if (pkt->stype == LDC_ACK) { + D2(ldcp->id, "i_ldc_process_VER: (0x%llx) sent" + " version ACK\n", ldcp->id); + /* Save the ACK'd version */ + ldcp->version.major = rcvd_ver->major; + 
ldcp->version.minor = rcvd_ver->minor; + ldcp->tstate |= TS_VER_DONE; + DWARN(DBG_ALL_LDCS, + "(0x%llx) Agreed on version v%u.%u\n", + ldcp->id, rcvd_ver->major, rcvd_ver->minor); + } + } else { + DWARN(ldcp->id, + "i_ldc_process_VER: (0x%llx) error sending " + "ACK/NACK\n", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + break; + + case LDC_ACK: + /* SUCCESS - we have agreed on a version */ + ldcp->version.major = rcvd_ver->major; + ldcp->version.minor = rcvd_ver->minor; + ldcp->tstate |= TS_VER_DONE; + + D1(DBG_ALL_LDCS, "(0x%llx) Agreed on version v%u.%u\n", + ldcp->id, rcvd_ver->major, rcvd_ver->minor); + + /* initiate RTS-RTR-RDX handshake */ + rv = i_ldc_get_tx_tail(ldcp, &tx_tail); + if (rv) { + DWARN(ldcp->id, + "i_ldc_process_VER: (0x%llx) cannot send RTS\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(pkt); + + pkt->type = LDC_CTRL; + pkt->stype = LDC_INFO; + pkt->ctrl = LDC_RTS; + pkt->env = ldcp->mode; + if (ldcp->mode != LDC_MODE_RAW) + pkt->seqid = LDC_INIT_SEQID; + + ldcp->last_msg_rcd = LDC_INIT_SEQID; + + DUMP_LDC_PKT(ldcp, "i_ldc_process_VER snd rts", (uint64_t)pkt); + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv) { + D2(ldcp->id, + "i_ldc_process_VER: (0x%llx) no listener\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + ldcp->last_msg_snt++; + ldcp->tx_tail = tx_tail; + ldcp->hstate |= TS_SENT_RTS; + + break; + + case LDC_NACK: + /* check if version in NACK is zero */ + if (rcvd_ver->major == 0 && rcvd_ver->minor == 0) { + /* version handshake failure */ + DWARN(DBG_ALL_LDCS, + "i_ldc_process_VER: (0x%llx) no version match\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + /* get the current tail and pkt for the response */ + rv = i_ldc_get_tx_tail(ldcp, 
&tx_tail); + if (rv != 0) { + cmn_err(CE_NOTE, + "i_ldc_process_VER: (0x%lx) err sending " + "version ACK/NACK\n", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(pkt); + + /* initialize the packet */ + pkt->type = LDC_CTRL; + pkt->ctrl = LDC_VER; + pkt->stype = LDC_INFO; + + /* check ver in NACK msg has a match */ + for (;;) { + if (rcvd_ver->major == ldc_versions[idx].major) { + /* + * major version match - resubmit request + * if lower minor version to the one this endpt + * supports, if necessary + */ + if (rcvd_ver->minor > ldc_versions[idx].minor) + rcvd_ver->minor = + ldc_versions[idx].minor; + bcopy(rcvd_ver, pkt->udata, sizeof (*rcvd_ver)); + break; + + } + + if (rcvd_ver->major > ldc_versions[idx].major) { + + D1(ldcp->id, "i_ldc_process_VER: using next" + " lower idx=%d, v%u.%u\n", idx, + ldc_versions[idx].major, + ldc_versions[idx].minor); + + /* send next lower version */ + bcopy(&ldc_versions[idx], pkt->udata, + sizeof (ldc_versions[idx])); + ldcp->next_vidx = idx; + break; + } + + /* next version */ + idx++; + + D1(ldcp->id, "i_ldc_process_VER: inc idx %x\n", idx); + + if (idx == LDC_NUM_VERS) { + /* no version match - terminate */ + ldcp->next_vidx = 0; + return (ECONNRESET); + } + } + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv == 0) { + D2(ldcp->id, "i_ldc_process_VER: (0x%llx) sent version" + "INFO v%u.%u\n", ldcp->id, ldc_versions[idx].major, + ldc_versions[idx].minor); + ldcp->tx_tail = tx_tail; + } else { + cmn_err(CE_NOTE, + "i_ldc_process_VER: (0x%lx) error sending version" + "INFO\n", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + break; + } + + return (rv); +} + + +/* + * Process an incoming RTS ctrl message + */ +static int +i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv = 0; + 
ldc_msg_t *pkt; + uint64_t tx_tail; + boolean_t sent_NACK = B_FALSE; + + D2(ldcp->id, "i_ldc_process_RTS: (0x%llx) received RTS\n", ldcp->id); + + switch (msg->stype) { + case LDC_NACK: + DWARN(ldcp->id, + "i_ldc_process_RTS: (0x%llx) RTS NACK received\n", + ldcp->id); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + + case LDC_INFO: + + /* check mode */ + if (ldcp->mode != (ldc_mode_t)msg->env) { + cmn_err(CE_NOTE, + "i_ldc_process_RTS: (0x%lx) mode mismatch\n", + ldcp->id); + /* + * send NACK in response to MODE message + * get the current tail for the response + */ + rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTS); + if (rv) { + /* if cannot send NACK - reset channel */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + } + sent_NACK = B_TRUE; + } + break; + default: + DWARN(ldcp->id, "i_ldc_process_RTS: (0x%llx) unexp ACK\n", + ldcp->id); + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + } + + /* + * If either the connection was reset (when rv != 0) or + * a NACK was sent, we return. 
In the case of a NACK + * we dont want to consume the packet that came in but + * not record that we received the RTS + */ + if (rv || sent_NACK) + return (rv); + + /* record RTS received */ + ldcp->hstate |= TS_RCVD_RTS; + + /* store initial SEQID info */ + ldcp->last_msg_snt = msg->seqid; + + /* get the current tail for the response */ + rv = i_ldc_get_tx_tail(ldcp, &tx_tail); + if (rv != 0) { + cmn_err(CE_NOTE, + "i_ldc_process_RTS: (0x%lx) err sending RTR\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(pkt); + + /* initialize the packet */ + pkt->type = LDC_CTRL; + pkt->stype = LDC_INFO; + pkt->ctrl = LDC_RTR; + pkt->env = ldcp->mode; + if (ldcp->mode != LDC_MODE_RAW) + pkt->seqid = LDC_INIT_SEQID; + + ldcp->last_msg_rcd = msg->seqid; + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv == 0) { + D2(ldcp->id, + "i_ldc_process_RTS: (0x%llx) sent RTR\n", ldcp->id); + DUMP_LDC_PKT(ldcp, "i_ldc_process_RTS sent rtr", (uint64_t)pkt); + + ldcp->tx_tail = tx_tail; + ldcp->hstate |= TS_SENT_RTR; + + } else { + cmn_err(CE_NOTE, + "i_ldc_process_RTS: (0x%lx) error sending RTR\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + return (0); +} + +/* + * Process an incoming RTR ctrl message + */ +static int +i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv = 0; + boolean_t sent_NACK = B_FALSE; + + D2(ldcp->id, "i_ldc_process_RTR: (0x%llx) received RTR\n", ldcp->id); + + switch (msg->stype) { + case LDC_NACK: + /* RTR NACK received */ + DWARN(ldcp->id, + "i_ldc_process_RTR: (0x%llx) RTR NACK received\n", + ldcp->id); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + + break; + + case LDC_INFO: + + /* check mode */ + if (ldcp->mode != (ldc_mode_t)msg->env) { + 
DWARN(ldcp->id, + "i_ldc_process_RTR: (0x%llx) mode mismatch\n", + ldcp->id); + /* + * send NACK in response to MODE message + * get the current tail for the response + */ + rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTR); + if (rv) { + /* if cannot send NACK - reset channel */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + } + sent_NACK = B_TRUE; + } + break; + + default: + DWARN(ldcp->id, "i_ldc_process_RTR: (0x%llx) unexp ACK\n", + ldcp->id); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + } + + /* + * If either the connection was reset (when rv != 0) or + * a NACK was sent, we return. In the case of a NACK + * we dont want to consume the packet that came in but + * not record that we received the RTR + */ + if (rv || sent_NACK) + return (rv); + + ldcp->last_msg_snt = msg->seqid; + ldcp->hstate |= TS_RCVD_RTR; + + rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_INFO, LDC_RDX); + if (rv) { + cmn_err(CE_NOTE, + "i_ldc_process_RTR: (0x%lx) cannot send RDX\n", + ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + D2(ldcp->id, + "i_ldc_process_RTR: (0x%llx) sent RDX\n", ldcp->id); + + ldcp->hstate |= TS_SENT_RDX; + ldcp->tstate |= TS_HSHAKE_DONE; + ldcp->status = LDC_UP; + + DWARN(DBG_ALL_LDCS, "(0x%llx) Handshake Complete\n", ldcp->id); + + return (0); +} + + +/* + * Process an incoming RDX ctrl message + */ +static int +i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv = 0; + + D2(ldcp->id, "i_ldc_process_RDX: (0x%llx) received RDX\n", ldcp->id); + + switch (msg->stype) { + case LDC_NACK: + /* RDX NACK received */ + DWARN(ldcp->id, + "i_ldc_process_RDX: (0x%llx) RDX NACK received\n", + ldcp->id); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + + break; + + case LDC_INFO: + + /* + * if channel is UP and a RDX received after data transmission + * has commenced it is an error + */ + if ((ldcp->tstate == TS_UP) && (ldcp->hstate & 
TS_RCVD_RDX)) { + DWARN(DBG_ALL_LDCS, + "i_ldc_process_RDX: (0x%llx) unexpected RDX" + " - LDC reset\n", ldcp->id); + i_ldc_reset(ldcp); + return (ECONNRESET); + } + + ldcp->hstate |= TS_RCVD_RDX; + ldcp->tstate |= TS_HSHAKE_DONE; + ldcp->status = LDC_UP; + + D1(DBG_ALL_LDCS, "(0x%llx) Handshake Complete\n", ldcp->id); + break; + + default: + DWARN(ldcp->id, "i_ldc_process_RDX: (0x%llx) unexp ACK\n", + ldcp->id); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + rv = ECONNRESET; + break; + } + + return (rv); +} + +/* + * Process an incoming ACK for a data packet + */ +static int +i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv; + uint64_t tx_head; + ldc_msg_t *pkt; + + /* + * Read the curret Tx head and tail + */ + rv = hv_ldc_tx_get_state(ldcp->id, + &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); + if (rv != 0) { + cmn_err(CE_WARN, + "i_ldc_process_data_ACK: (0x%lx) cannot read qptrs\n", + ldcp->id); + return (0); + } + + /* + * loop from where the previous ACK location was to the + * current head location. This is how far the HV has + * actually send pkts. Pkts between head and tail are + * yet to be sent by HV. 
+ */ + tx_head = ldcp->tx_ackd_head; + for (;;) { + pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_head); + tx_head = (tx_head + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + if (pkt->seqid == msg->ackid) { + D2(ldcp->id, + "i_ldc_process_data_ACK: (0x%llx) found packet\n", + ldcp->id); + ldcp->last_ack_rcd = msg->ackid; + ldcp->tx_ackd_head = tx_head; + break; + } + if (tx_head == ldcp->tx_head) { + /* could not find packet */ + DWARN(ldcp->id, + "i_ldc_process_data_ACK: (0x%llx) invalid ACKid\n", + ldcp->id); + break; + } + } + + return (0); +} + +/* + * Process incoming control message + * Return 0 - session can continue + * EAGAIN - reprocess packet - state was changed + * ECONNRESET - channel was reset + */ +static int +i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg) +{ + int rv = 0; + + switch (ldcp->tstate) { + + case TS_OPEN: + case TS_READY: + + switch (msg->ctrl & LDC_CTRL_MASK) { + case LDC_VER: + /* process version message */ + rv = i_ldc_process_VER(ldcp, msg); + break; + default: + DWARN(ldcp->id, + "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x " + "tstate=0x%x\n", ldcp->id, + (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate); + break; + } + + break; + + case TS_VREADY: + + switch (msg->ctrl & LDC_CTRL_MASK) { + case LDC_VER: + /* peer is redoing version negotiation */ + (void) i_ldc_txq_reconf(ldcp); + i_ldc_reset_state(ldcp); + rv = EAGAIN; + break; + case LDC_RTS: + /* process RTS message */ + rv = i_ldc_process_RTS(ldcp, msg); + break; + case LDC_RTR: + /* process RTR message */ + rv = i_ldc_process_RTR(ldcp, msg); + break; + case LDC_RDX: + /* process RDX message */ + rv = i_ldc_process_RDX(ldcp, msg); + break; + default: + DWARN(ldcp->id, + "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x " + "tstate=0x%x\n", ldcp->id, + (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate); + break; + } + + break; + + case TS_UP: + + switch (msg->ctrl & LDC_CTRL_MASK) { + case LDC_VER: + DWARN(ldcp->id, + "i_ldc_ctrlmsg: (0x%llx) unexpected VER " + "- LDC reset\n", ldcp->id); + 
/* peer is redoing version negotiation */ + (void) i_ldc_txq_reconf(ldcp); + i_ldc_reset_state(ldcp); + rv = EAGAIN; + break; + + case LDC_RDX: + /* process RDX message */ + rv = i_ldc_process_RDX(ldcp, msg); + break; + + default: + DWARN(ldcp->id, + "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x " + "tstate=0x%x\n", ldcp->id, + (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate); + break; + } + } + + return (rv); +} + +/* + * Register channel with the channel nexus + */ +static int +i_ldc_register_channel(ldc_chan_t *ldcp) +{ + int rv = 0; + ldc_cnex_t *cinfo = &ldcssp->cinfo; + + if (cinfo->dip == NULL) { + DWARN(ldcp->id, + "i_ldc_register_channel: cnex has not registered\n"); + return (EAGAIN); + } + + rv = cinfo->reg_chan(cinfo->dip, ldcp->id, ldcp->devclass); + if (rv) { + DWARN(ldcp->id, + "i_ldc_register_channel: cannot register channel\n"); + return (rv); + } + + rv = cinfo->add_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR, + i_ldc_tx_hdlr, ldcp, NULL); + if (rv) { + DWARN(ldcp->id, + "i_ldc_register_channel: cannot add Tx interrupt\n"); + (void) cinfo->unreg_chan(cinfo->dip, ldcp->id); + return (rv); + } + + rv = cinfo->add_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR, + i_ldc_rx_hdlr, ldcp, NULL); + if (rv) { + DWARN(ldcp->id, + "i_ldc_register_channel: cannot add Rx interrupt\n"); + (void) cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR); + (void) cinfo->unreg_chan(cinfo->dip, ldcp->id); + return (rv); + } + + ldcp->tstate |= TS_CNEX_RDY; + + return (0); +} + +/* + * Unregister a channel with the channel nexus + */ +static int +i_ldc_unregister_channel(ldc_chan_t *ldcp) +{ + int rv = 0; + ldc_cnex_t *cinfo = &ldcssp->cinfo; + + if (cinfo->dip == NULL) { + DWARN(ldcp->id, + "i_ldc_unregister_channel: cnex has not registered\n"); + return (EAGAIN); + } + + if (ldcp->tstate & TS_CNEX_RDY) { + + rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR); + if (rv) { + DWARN(ldcp->id, + "i_ldc_unregister_channel: err removing Rx intr\n"); + } + rv = cinfo->rem_intr(cinfo->dip, 
ldcp->id, CNEX_TX_INTR); + if (rv) { + DWARN(ldcp->id, + "i_ldc_unregister_channel: err removing Tx intr\n"); + } + rv = cinfo->unreg_chan(ldcssp->cinfo.dip, ldcp->id); + if (rv) { + DWARN(ldcp->id, + "i_ldc_unregister_channel: cannot unreg channel\n"); + } + + ldcp->tstate &= ~TS_CNEX_RDY; + } + + return (0); +} + + +/* + * LDC transmit interrupt handler + * triggered for chanel up/down/reset events + * and Tx queue content changes + */ +static uint_t +i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2) +{ + _NOTE(ARGUNUSED(arg2)) + + int rv; + ldc_chan_t *ldcp; + boolean_t notify_client = B_FALSE; + uint64_t notify_event = 0; + + /* Get the channel for which interrupt was received */ + ASSERT(arg1 != NULL); + ldcp = (ldc_chan_t *)arg1; + + D1(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) Received intr, ldcp=0x%p\n", + ldcp->id, ldcp); + + /* Lock channel */ + mutex_enter(&ldcp->lock); + + rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail, + &ldcp->link_state); + if (rv) { + cmn_err(CE_WARN, + "i_ldc_tx_hdlr: (0x%lx) cannot read queue ptrs rv=0x%d\n", + ldcp->id, rv); + mutex_exit(&ldcp->lock); + return (DDI_INTR_CLAIMED); + } + + /* + * reset the channel state if the channel went down + * (other side unconfigured queue) or channel was reset + * (other side reconfigured its queue) + */ + if (ldcp->link_state == LDC_CHANNEL_DOWN) { + D1(ldcp->id, "i_ldc_tx_hdlr: channel link down\n", ldcp->id); + i_ldc_reset(ldcp); + notify_client = B_TRUE; + notify_event = LDC_EVT_DOWN; + } + + if (ldcp->link_state == LDC_CHANNEL_RESET) { + D1(ldcp->id, "i_ldc_tx_hdlr: channel link reset\n", ldcp->id); + i_ldc_reset(ldcp); + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + } + + if (ldcp->tstate == TS_OPEN && ldcp->link_state == LDC_CHANNEL_UP) { + D1(ldcp->id, "i_ldc_tx_hdlr: channel link up\n", ldcp->id); + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + ldcp->tstate |= TS_LINK_READY; + ldcp->status = LDC_READY; + } + + /* if callbacks are disabled, do not notify 
*/ + if (!ldcp->cb_enabled) + notify_client = B_FALSE; + + if (notify_client) + ldcp->cb_inprogress = B_TRUE; + + /* Unlock channel */ + mutex_exit(&ldcp->lock); + + if (notify_client) { + rv = ldcp->cb(notify_event, ldcp->cb_arg); + if (rv) { + DWARN(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) callback " + "failure", ldcp->id); + } + mutex_enter(&ldcp->lock); + ldcp->cb_inprogress = B_FALSE; + mutex_exit(&ldcp->lock); + } + + mutex_enter(&ldcp->lock); + i_ldc_clear_intr(ldcp, CNEX_TX_INTR); + mutex_exit(&ldcp->lock); + + D1(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) exiting handler", ldcp->id); + + return (DDI_INTR_CLAIMED); +} + +/* + * LDC receive interrupt handler + * triggered for channel with data pending to read + * i.e. Rx queue content changes + */ +static uint_t +i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) +{ + _NOTE(ARGUNUSED(arg2)) + + int rv; + uint64_t rx_head, rx_tail; + ldc_msg_t *msg; + ldc_chan_t *ldcp; + boolean_t notify_client = B_FALSE; + uint64_t notify_event = 0; + + /* Get the channel for which interrupt was received */ + if (arg1 == NULL) { + cmn_err(CE_WARN, "i_ldc_rx_hdlr: invalid arg\n"); + return (DDI_INTR_UNCLAIMED); + } + + ldcp = (ldc_chan_t *)arg1; + + D1(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) Received intr, ldcp=0x%p\n", + ldcp->id, ldcp); + + /* Lock channel */ + mutex_enter(&ldcp->lock); + + /* mark interrupt as pending */ + ldcp->intr_pending = B_TRUE; + + /* + * Read packet(s) from the queue + */ + for (;;) { + + rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail, + &ldcp->link_state); + if (rv) { + cmn_err(CE_WARN, + "i_ldc_rx_hdlr: (0x%lx) cannot read " + "queue ptrs, rv=0x%d\n", ldcp->id, rv); + i_ldc_clear_intr(ldcp, CNEX_RX_INTR); + mutex_exit(&ldcp->lock); + return (DDI_INTR_CLAIMED); + } + + /* + * reset the channel state if the channel went down + * (other side unconfigured queue) or channel was reset + * (other side reconfigured its queue + */ + if (ldcp->link_state == LDC_CHANNEL_DOWN) { + D1(ldcp->id, "i_ldc_rx_hdlr: channel link 
down\n", + ldcp->id); + i_ldc_reset(ldcp); + notify_client = B_TRUE; + notify_event = LDC_EVT_DOWN; + break; + } + if (ldcp->link_state == LDC_CHANNEL_RESET) { + D1(ldcp->id, "i_ldc_rx_hdlr: channel link reset\n", + ldcp->id); + i_ldc_reset(ldcp); + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + } + + if (ldcp->tstate == TS_OPEN && + ldcp->link_state == LDC_CHANNEL_UP) { + D1(ldcp->id, "i_ldc_rx_hdlr: channel link up\n", + ldcp->id); + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + ldcp->tstate |= TS_LINK_READY; + ldcp->status = LDC_READY; + } + + if (rx_head == rx_tail) { + D2(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) No packets\n", + ldcp->id); + break; + } + D2(ldcp->id, "i_ldc_rx_hdlr: head=0x%llx, tail=0x%llx\n", + rx_head, rx_tail); + DUMP_LDC_PKT(ldcp, "i_ldc_rx_hdlr rcd", + ldcp->rx_q_va + rx_head); + + /* get the message */ + msg = (ldc_msg_t *)(ldcp->rx_q_va + rx_head); + + /* if channel is in RAW mode or data pkt, notify and return */ + if (ldcp->mode == LDC_MODE_RAW) { + notify_client = B_TRUE; + notify_event |= LDC_EVT_READ; + break; + } + + if ((msg->type & LDC_DATA) && (msg->stype & LDC_INFO)) { + + /* discard packet if channel is not up */ + if (ldcp->tstate != TS_UP) { + + /* move the head one position */ + rx_head = (rx_head + LDC_PACKET_SIZE) % + (ldcp->rx_q_entries << LDC_PACKET_SHIFT); + + if (rv = i_ldc_set_rx_head(ldcp, rx_head)) + break; + + continue; + } else { + notify_client = B_TRUE; + notify_event |= LDC_EVT_READ; + break; + } + } + + /* Check the sequence ID for the message received */ + if ((rv = i_ldc_check_seqid(ldcp, msg)) != 0) { + + DWARN(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) seqid error, " + "q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail); + + /* Reset last_msg_rcd to start of message */ + if (ldcp->first_fragment != 0) { + ldcp->last_msg_rcd = + ldcp->first_fragment - 1; + ldcp->first_fragment = 0; + } + /* + * Send a NACK due to seqid mismatch + */ + rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, + (msg->ctrl & 
LDC_CTRL_MASK)); + + if (rv) { + cmn_err(CE_NOTE, + "i_ldc_rx_hdlr: (0x%lx) err sending " + "CTRL/NACK msg\n", ldcp->id); + } + + /* purge receive queue */ + (void) i_ldc_set_rx_head(ldcp, rx_tail); + break; + } + + /* record the message ID */ + ldcp->last_msg_rcd = msg->seqid; + + /* process control messages */ + if (msg->type & LDC_CTRL) { + /* save current internal state */ + uint64_t tstate = ldcp->tstate; + + rv = i_ldc_ctrlmsg(ldcp, msg); + if (rv == EAGAIN) { + /* re-process pkt - state was adjusted */ + continue; + } + if (rv == ECONNRESET) { + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + break; + } + + /* + * control message processing was successful + * channel transitioned to ready for communication + */ + if (rv == 0 && ldcp->tstate == TS_UP && + tstate != ldcp->tstate) { + notify_client = B_TRUE; + notify_event = LDC_EVT_UP; + } + } + + /* process data ACKs */ + if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) { + (void) i_ldc_process_data_ACK(ldcp, msg); + } + + /* move the head one position */ + rx_head = (rx_head + LDC_PACKET_SIZE) % + (ldcp->rx_q_entries << LDC_PACKET_SHIFT); + if (rv = i_ldc_set_rx_head(ldcp, rx_head)) + break; + + } /* for */ + + /* if callbacks are disabled, do not notify */ + if (!ldcp->cb_enabled) + notify_client = B_FALSE; + + if (notify_client) + ldcp->cb_inprogress = B_TRUE; + + /* Unlock channel */ + mutex_exit(&ldcp->lock); + + if (notify_client) { + rv = ldcp->cb(notify_event, ldcp->cb_arg); + if (rv) { + DWARN(ldcp->id, + "i_ldc_rx_hdlr: (0x%llx) callback failure", + ldcp->id); + } + mutex_enter(&ldcp->lock); + ldcp->cb_inprogress = B_FALSE; + mutex_exit(&ldcp->lock); + } + + mutex_enter(&ldcp->lock); + + /* + * If there are data packets in the queue, the ldc_read will + * clear interrupts after draining the queue, else clear interrupts + */ + if ((notify_event & LDC_EVT_READ) == 0) { + i_ldc_clear_intr(ldcp, CNEX_RX_INTR); + } + + mutex_exit(&ldcp->lock); + + D1(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) 
exiting handler", ldcp->id); + return (DDI_INTR_CLAIMED); +} + + +/* -------------------------------------------------------------------------- */ + +/* + * LDC API functions + */ + +/* + * Initialize the channel. Allocate internal structure and memory for + * TX/RX queues, and initialize locks. + */ +int +ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle) +{ + ldc_chan_t *ldcp; + int rv, exit_val; + uint64_t ra_base, nentries; + + exit_val = EINVAL; /* guarantee an error if exit on failure */ + + if (attr == NULL) { + DWARN(id, "ldc_init: (0x%llx) invalid attr\n", id); + return (EINVAL); + } + if (handle == NULL) { + DWARN(id, "ldc_init: (0x%llx) invalid handle\n", id); + return (EINVAL); + } + + /* check if channel is valid */ + rv = hv_ldc_tx_qinfo(id, &ra_base, &nentries); + if (rv == H_ECHANNEL) { + DWARN(id, "ldc_init: (0x%llx) invalid channel id\n", id); + return (EINVAL); + } + + /* check if the channel has already been initialized */ + mutex_enter(&ldcssp->lock); + ldcp = ldcssp->chan_list; + while (ldcp != NULL) { + if (ldcp->id == id) { + DWARN(id, "ldc_init: (0x%llx) already initialized\n", + id); + mutex_exit(&ldcssp->lock); + return (EADDRINUSE); + } + ldcp = ldcp->next; + } + mutex_exit(&ldcssp->lock); + + ASSERT(ldcp == NULL); + + *handle = 0; + + /* Allocate an ldcp structure */ + ldcp = kmem_zalloc(sizeof (ldc_chan_t), KM_SLEEP); + + /* Initialize the channel lock */ + mutex_init(&ldcp->lock, NULL, MUTEX_DRIVER, NULL); + + /* Channel specific processing */ + mutex_enter(&ldcp->lock); + + /* Initialize the channel */ + ldcp->id = id; + ldcp->cb = NULL; + ldcp->cb_arg = NULL; + ldcp->cb_inprogress = B_FALSE; + ldcp->cb_enabled = B_FALSE; + ldcp->next = NULL; + + /* Read attributes */ + ldcp->mode = attr->mode; + ldcp->devclass = attr->devclass; + ldcp->devinst = attr->instance; + + ldcp->rx_q_entries = + (attr->qlen > 0) ? 
attr->qlen : LDC_QUEUE_ENTRIES; + ldcp->tx_q_entries = ldcp->rx_q_entries; + + D1(ldcp->id, + "ldc_init: (0x%llx) channel attributes, class=0x%x, " + "instance=0x%llx,mode=%d, qlen=%d\n", + ldcp->id, ldcp->devclass, ldcp->devinst, + ldcp->mode, ldcp->rx_q_entries); + + ldcp->next_vidx = 0; + ldcp->tstate = 0; + ldcp->hstate = 0; + ldcp->last_msg_snt = LDC_INIT_SEQID; + ldcp->last_ack_rcd = 0; + ldcp->last_msg_rcd = 0; + + ldcp->stream_bufferp = NULL; + ldcp->exp_dring_list = NULL; + ldcp->imp_dring_list = NULL; + ldcp->mhdl_list = NULL; + + /* Initialize payload size depending on whether channel is reliable */ + switch (ldcp->mode) { + case LDC_MODE_RAW: + ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RAW; + ldcp->read_p = i_ldc_read_raw; + ldcp->write_p = i_ldc_write_raw; + ldcp->mtu = 0; + break; + case LDC_MODE_UNRELIABLE: + ldcp->pkt_payload = LDC_PAYLOAD_SIZE_UNRELIABLE; + ldcp->read_p = i_ldc_read_packet; + ldcp->write_p = i_ldc_write_packet; + ldcp->mtu = 0; + break; + case LDC_MODE_RELIABLE: + ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RELIABLE; + ldcp->read_p = i_ldc_read_packet; + ldcp->write_p = i_ldc_write_packet; + ldcp->mtu = 0; + break; + case LDC_MODE_STREAM: + ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RELIABLE; + + ldcp->stream_remains = 0; + ldcp->stream_offset = 0; + ldcp->mtu = LDC_STREAM_MTU; + ldcp->stream_bufferp = kmem_alloc(ldcp->mtu, KM_SLEEP); + ldcp->read_p = i_ldc_read_stream; + ldcp->write_p = i_ldc_write_stream; + break; + default: + exit_val = EINVAL; + goto cleanup_on_exit; + } + + /* Create a transmit queue */ + ldcp->tx_q_va = (uint64_t) + contig_mem_alloc(ldcp->tx_q_entries << LDC_PACKET_SHIFT); + if (ldcp->tx_q_va == NULL) { + cmn_err(CE_WARN, + "ldc_init: (0x%lx) TX queue allocation failed\n", + ldcp->id); + exit_val = ENOMEM; + goto cleanup_on_exit; + } + ldcp->tx_q_ra = va_to_pa((caddr_t)ldcp->tx_q_va); + + D2(ldcp->id, "ldc_init: txq_va=0x%llx, txq_ra=0x%llx, entries=0x%llx\n", + ldcp->tx_q_va, ldcp->tx_q_ra, ldcp->tx_q_entries); + + 
ldcp->tstate |= TS_TXQ_RDY; + + /* Create a receive queue */ + ldcp->rx_q_va = (uint64_t) + contig_mem_alloc(ldcp->rx_q_entries << LDC_PACKET_SHIFT); + if (ldcp->rx_q_va == NULL) { + cmn_err(CE_WARN, + "ldc_init: (0x%lx) RX queue allocation failed\n", + ldcp->id); + exit_val = ENOMEM; + goto cleanup_on_exit; + } + ldcp->rx_q_ra = va_to_pa((caddr_t)ldcp->rx_q_va); + + D2(ldcp->id, "ldc_init: rxq_va=0x%llx, rxq_ra=0x%llx, entries=0x%llx\n", + ldcp->rx_q_va, ldcp->rx_q_ra, ldcp->rx_q_entries); + + ldcp->tstate |= TS_RXQ_RDY; + + /* Init descriptor ring and memory handle list lock */ + mutex_init(&ldcp->exp_dlist_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->imp_dlist_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->mlist_lock, NULL, MUTEX_DRIVER, NULL); + + /* mark status as INITialized */ + ldcp->status = LDC_INIT; + + mutex_exit(&ldcp->lock); + + /* Add to channel list */ + mutex_enter(&ldcssp->lock); + ldcp->next = ldcssp->chan_list; + ldcssp->chan_list = ldcp; + ldcssp->channel_count++; + mutex_exit(&ldcssp->lock); + + /* set the handle */ + *handle = (ldc_handle_t)ldcp; + + D1(ldcp->id, "ldc_init: (0x%llx) channel initialized\n", ldcp->id); + + return (0); + +cleanup_on_exit: + + if (ldcp->mode == LDC_MODE_STREAM && ldcp->stream_bufferp) + kmem_free(ldcp->stream_bufferp, ldcp->mtu); + + if (ldcp->tstate & TS_TXQ_RDY) + contig_mem_free((caddr_t)ldcp->tx_q_va, + (ldcp->tx_q_entries << LDC_PACKET_SHIFT)); + + if (ldcp->tstate & TS_RXQ_RDY) + contig_mem_free((caddr_t)ldcp->rx_q_va, + (ldcp->rx_q_entries << LDC_PACKET_SHIFT)); + + mutex_exit(&ldcp->lock); + mutex_destroy(&ldcp->lock); + + if (ldcp) + kmem_free(ldcp, sizeof (ldc_chan_t)); + + return (exit_val); +} + +/* + * Finalizes the LDC connection. It will return EBUSY if the + * channel is open. A ldc_close() has to be done prior to + * a ldc_fini operation. 
It frees TX/RX queues, associated + * with the channel + */ +int +ldc_fini(ldc_handle_t handle) +{ + ldc_chan_t *ldcp; + ldc_chan_t *tmp_ldcp; + uint64_t id; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_fini: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + id = ldcp->id; + + mutex_enter(&ldcp->lock); + + if (ldcp->tstate > TS_INIT) { + DWARN(ldcp->id, "ldc_fini: (0x%llx) channel is open\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EBUSY); + } + + /* Remove from the channel list */ + mutex_enter(&ldcssp->lock); + tmp_ldcp = ldcssp->chan_list; + if (tmp_ldcp == ldcp) { + ldcssp->chan_list = ldcp->next; + ldcp->next = NULL; + } else { + while (tmp_ldcp != NULL) { + if (tmp_ldcp->next == ldcp) { + tmp_ldcp->next = ldcp->next; + ldcp->next = NULL; + break; + } + tmp_ldcp = tmp_ldcp->next; + } + if (tmp_ldcp == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_fini: invalid channel hdl\n"); + mutex_exit(&ldcssp->lock); + mutex_exit(&ldcp->lock); + return (EINVAL); + } + } + + ldcssp->channel_count--; + + mutex_exit(&ldcssp->lock); + + /* Free the map table for this channel */ + if (ldcp->mtbl) { + (void) hv_ldc_set_map_table(ldcp->id, NULL, NULL); + contig_mem_free(ldcp->mtbl->table, ldcp->mtbl->size); + mutex_destroy(&ldcp->mtbl->lock); + kmem_free(ldcp->mtbl, sizeof (ldc_mtbl_t)); + } + + /* Destroy descriptor ring and memory handle list lock */ + mutex_destroy(&ldcp->exp_dlist_lock); + mutex_destroy(&ldcp->imp_dlist_lock); + mutex_destroy(&ldcp->mlist_lock); + + /* Free the stream buffer for STREAM_MODE */ + if (ldcp->mode == LDC_MODE_STREAM && ldcp->stream_bufferp) + kmem_free(ldcp->stream_bufferp, ldcp->mtu); + + /* Free the RX queue */ + contig_mem_free((caddr_t)ldcp->rx_q_va, + (ldcp->rx_q_entries << LDC_PACKET_SHIFT)); + ldcp->tstate &= ~TS_RXQ_RDY; + + /* Free the TX queue */ + contig_mem_free((caddr_t)ldcp->tx_q_va, + (ldcp->tx_q_entries << LDC_PACKET_SHIFT)); + ldcp->tstate &= ~TS_TXQ_RDY; + + + mutex_exit(&ldcp->lock); + 
+ /* Destroy mutex */ + mutex_destroy(&ldcp->lock); + + /* free channel structure */ + kmem_free(ldcp, sizeof (ldc_chan_t)); + + D1(id, "ldc_fini: (0x%llx) channel finalized\n", id); + + return (0); +} + +/* + * Open the LDC channel for use. It registers the TX/RX queues + * with the Hypervisor. It also specifies the interrupt number + * and target CPU for this channel + */ +int +ldc_open(ldc_handle_t handle) +{ + ldc_chan_t *ldcp; + int rv; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_open: invalid channel handle\n"); + return (EINVAL); + } + + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + if (ldcp->tstate < TS_INIT) { + DWARN(ldcp->id, + "ldc_open: (0x%llx) channel not initialized\n", ldcp->id); + mutex_exit(&ldcp->lock); + return (EFAULT); + } + if (ldcp->tstate >= TS_OPEN) { + DWARN(ldcp->id, + "ldc_open: (0x%llx) channel is already open\n", ldcp->id); + mutex_exit(&ldcp->lock); + return (EFAULT); + } + + /* + * Unregister/Register the tx queue with the hypervisor + */ + rv = hv_ldc_tx_qconf(ldcp->id, NULL, NULL); + if (rv) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) channel tx queue unconf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries); + if (rv) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) channel tx queue conf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + D2(ldcp->id, "ldc_open: (0x%llx) registered tx queue with LDC\n", + ldcp->id); + + /* + * Unregister/Register the rx queue with the hypervisor + */ + rv = hv_ldc_rx_qconf(ldcp->id, NULL, NULL); + if (rv) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) channel rx queue unconf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + rv = hv_ldc_rx_qconf(ldcp->id, ldcp->rx_q_ra, ldcp->rx_q_entries); + if (rv) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) channel rx queue conf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + D2(ldcp->id, 
"ldc_open: (0x%llx) registered rx queue with LDC\n", + ldcp->id); + + ldcp->tstate |= TS_QCONF_RDY; + + /* Register the channel with the channel nexus */ + rv = i_ldc_register_channel(ldcp); + if (rv && rv != EAGAIN) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) channel register failed\n", ldcp->id); + (void) hv_ldc_tx_qconf(ldcp->id, NULL, NULL); + (void) hv_ldc_rx_qconf(ldcp->id, NULL, NULL); + mutex_exit(&ldcp->lock); + return (EIO); + } + + /* mark channel in OPEN state */ + ldcp->status = LDC_OPEN; + + /* Read channel state */ + rv = hv_ldc_tx_get_state(ldcp->id, + &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); + if (rv) { + cmn_err(CE_WARN, + "ldc_open: (0x%lx) cannot read channel state\n", + ldcp->id); + (void) i_ldc_unregister_channel(ldcp); + (void) hv_ldc_tx_qconf(ldcp->id, NULL, NULL); + (void) hv_ldc_rx_qconf(ldcp->id, NULL, NULL); + mutex_exit(&ldcp->lock); + return (EIO); + } + + /* + * set the ACKd head to current head location for reliable & + * streaming mode + */ + ldcp->tx_ackd_head = ldcp->tx_head; + + /* mark channel ready if HV report link is UP (peer alloc'd Rx queue) */ + if (ldcp->link_state == LDC_CHANNEL_UP || + ldcp->link_state == LDC_CHANNEL_RESET) { + ldcp->tstate |= TS_LINK_READY; + ldcp->status = LDC_READY; + } + + /* + * if channel is being opened in RAW mode - no handshake is needed + * switch the channel READY and UP state + */ + if (ldcp->mode == LDC_MODE_RAW) { + ldcp->tstate = TS_UP; /* set bits associated with LDC UP */ + ldcp->status = LDC_UP; + } + + mutex_exit(&ldcp->lock); + + /* + * Increment number of open channels + */ + mutex_enter(&ldcssp->lock); + ldcssp->channels_open++; + mutex_exit(&ldcssp->lock); + + D1(ldcp->id, + "ldc_open: (0x%llx) channel (0x%p) open for use (tstate=0x%x)\n", + ldcp->id, ldcp, ldcp->tstate); + + return (0); +} + +/* + * Close the LDC connection. 
It will return EBUSY if there + * are memory segments or descriptor rings either bound to or + * mapped over the channel + */ +int +ldc_close(ldc_handle_t handle) +{ + ldc_chan_t *ldcp; + int rv = 0; + boolean_t chk_done = B_FALSE; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_close: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + /* return error if channel is not open */ + if (ldcp->tstate < TS_OPEN) { + DWARN(ldcp->id, + "ldc_close: (0x%llx) channel is not open\n", ldcp->id); + mutex_exit(&ldcp->lock); + return (EFAULT); + } + + /* if any memory handles, drings, are bound or mapped cannot close */ + if (ldcp->mhdl_list != NULL) { + DWARN(ldcp->id, + "ldc_close: (0x%llx) channel has bound memory handles\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EBUSY); + } + if (ldcp->exp_dring_list != NULL) { + DWARN(ldcp->id, + "ldc_close: (0x%llx) channel has bound descriptor rings\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EBUSY); + } + if (ldcp->imp_dring_list != NULL) { + DWARN(ldcp->id, + "ldc_close: (0x%llx) channel has mapped descriptor rings\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EBUSY); + } + + /* + * Wait for pending transmits to complete i.e Tx queue to drain + * if there are pending pkts - wait 1 ms and retry again + */ + for (;;) { + + rv = hv_ldc_tx_get_state(ldcp->id, + &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); + if (rv) { + cmn_err(CE_WARN, + "ldc_close: (0x%lx) cannot read qptrs\n", ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + if (ldcp->tx_head == ldcp->tx_tail || + ldcp->link_state != LDC_CHANNEL_UP) { + break; + } + + if (chk_done) { + DWARN(ldcp->id, + "ldc_close: (0x%llx) Tx queue drain timeout\n", + ldcp->id); + break; + } + + /* wait for one ms and try again */ + delay(drv_usectohz(1000)); + chk_done = B_TRUE; + } + + /* + * Unregister the channel with the nexus + */ + rv = i_ldc_unregister_channel(ldcp); + if (rv && 
rv != EAGAIN) { + cmn_err(CE_WARN, + "ldc_close: (0x%lx) channel unregister failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (rv); + } + + /* + * Unregister queues + */ + rv = hv_ldc_tx_qconf(ldcp->id, NULL, NULL); + if (rv) { + cmn_err(CE_WARN, + "ldc_close: (0x%lx) channel TX queue unconf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + rv = hv_ldc_rx_qconf(ldcp->id, NULL, NULL); + if (rv) { + cmn_err(CE_WARN, + "ldc_close: (0x%lx) channel RX queue unconf failed\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + + ldcp->tstate &= ~TS_QCONF_RDY; + + /* Reset channel state information */ + i_ldc_reset_state(ldcp); + + /* Mark channel as down and in initialized state */ + ldcp->tx_ackd_head = 0; + ldcp->tx_head = 0; + ldcp->tstate = TS_INIT; + ldcp->status = LDC_INIT; + + mutex_exit(&ldcp->lock); + + /* Decrement number of open channels */ + mutex_enter(&ldcssp->lock); + ldcssp->channels_open--; + mutex_exit(&ldcssp->lock); + + D1(ldcp->id, "ldc_close: (0x%llx) channel closed\n", ldcp->id); + + return (0); +} + +/* + * Register channel callback + */ +int +ldc_reg_callback(ldc_handle_t handle, + uint_t(*cb)(uint64_t event, caddr_t arg), caddr_t arg) +{ + ldc_chan_t *ldcp; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_reg_callback: invalid channel handle\n"); + return (EINVAL); + } + if (((uint64_t)cb) < KERNELBASE) { + DWARN(DBG_ALL_LDCS, "ldc_reg_callback: invalid callback\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + if (ldcp->cb) { + DWARN(ldcp->id, "ldc_reg_callback: (0x%llx) callback exists\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + if (ldcp->cb_inprogress) { + DWARN(ldcp->id, "ldc_reg_callback: (0x%llx) callback active\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EWOULDBLOCK); + } + + ldcp->cb = cb; + ldcp->cb_arg = arg; + ldcp->cb_enabled = B_TRUE; + + D1(ldcp->id, + "ldc_reg_callback: (0x%llx) registered callback for 
channel\n", + ldcp->id); + + mutex_exit(&ldcp->lock); + + return (0); +} + +/* + * Unregister channel callback + */ +int +ldc_unreg_callback(ldc_handle_t handle) +{ + ldc_chan_t *ldcp; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_unreg_callback: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + if (ldcp->cb == NULL) { + DWARN(ldcp->id, + "ldc_unreg_callback: (0x%llx) no callback exists\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + if (ldcp->cb_inprogress) { + DWARN(ldcp->id, + "ldc_unreg_callback: (0x%llx) callback active\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EWOULDBLOCK); + } + + ldcp->cb = NULL; + ldcp->cb_arg = NULL; + ldcp->cb_enabled = B_FALSE; + + D1(ldcp->id, + "ldc_unreg_callback: (0x%llx) unregistered callback for channel\n", + ldcp->id); + + mutex_exit(&ldcp->lock); + + return (0); +} + + +/* + * Bring a channel up by initiating a handshake with the peer + * This call is asynchronous. It will complete at a later point + * in time when the peer responds back with an RTR. 
+ */ +int +ldc_up(ldc_handle_t handle) +{ + int rv; + ldc_chan_t *ldcp; + ldc_msg_t *ldcmsg; + uint64_t tx_tail; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_up: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + if (ldcp->tstate == TS_UP) { + D2(ldcp->id, + "ldc_up: (0x%llx) channel is already in UP state\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (0); + } + + /* if the channel is in RAW mode - mark it as UP, if READY */ + if (ldcp->mode == LDC_MODE_RAW && ldcp->tstate >= TS_READY) { + ldcp->tstate = TS_UP; + mutex_exit(&ldcp->lock); + return (0); + } + + /* Don't start another handshake if there is one in progress */ + if (ldcp->hstate) { + D2(ldcp->id, + "ldc_up: (0x%llx) channel handshake in progress\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (0); + } + + /* get the current tail for the LDC msg */ + rv = i_ldc_get_tx_tail(ldcp, &tx_tail); + if (rv) { + DWARN(ldcp->id, "ldc_up: (0x%llx) cannot initiate handshake\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (ECONNREFUSED); + } + + ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); + ZERO_PKT(ldcmsg); + + ldcmsg->type = LDC_CTRL; + ldcmsg->stype = LDC_INFO; + ldcmsg->ctrl = LDC_VER; + ldcp->next_vidx = 0; + bcopy(&ldc_versions[0], ldcmsg->udata, sizeof (ldc_versions[0])); + + DUMP_LDC_PKT(ldcp, "ldc_up snd ver", (uint64_t)ldcmsg); + + /* initiate the send by calling into HV and set the new tail */ + tx_tail = (tx_tail + LDC_PACKET_SIZE) % + (ldcp->tx_q_entries << LDC_PACKET_SHIFT); + + rv = i_ldc_set_tx_tail(ldcp, tx_tail); + if (rv) { + DWARN(ldcp->id, + "ldc_up: (0x%llx) cannot initiate handshake rv=%d\n", + ldcp->id, rv); + mutex_exit(&ldcp->lock); + return (rv); + } + + ldcp->tx_tail = tx_tail; + D1(ldcp->id, "ldc_up: (0x%llx) channel up initiated\n", ldcp->id); + + mutex_exit(&ldcp->lock); + + return (rv); +} + + +/* + * Reset a channel by re-registering the Rx queues + */ +int +ldc_reset(ldc_handle_t handle) +{ 
+ ldc_chan_t *ldcp; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_reset: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + + return (0); +} + +/* + * Get the current channel status + */ +int +ldc_status(ldc_handle_t handle, ldc_status_t *status) +{ + ldc_chan_t *ldcp; + + if (handle == NULL || status == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_status: invalid argument\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + *status = ((ldc_chan_t *)handle)->status; + + D1(ldcp->id, + "ldc_status: (0x%llx) returned status %d\n", ldcp->id, *status); + return (0); +} + + +/* + * Set the channel's callback mode - enable/disable callbacks + */ +int +ldc_set_cb_mode(ldc_handle_t handle, ldc_cb_mode_t cmode) +{ + ldc_chan_t *ldcp; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_set_intr_mode: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + /* + * Record no callbacks should be invoked + */ + mutex_enter(&ldcp->lock); + + switch (cmode) { + case LDC_CB_DISABLE: + if (!ldcp->cb_enabled) { + DWARN(ldcp->id, + "ldc_set_cb_mode: (0x%llx) callbacks disabled\n", + ldcp->id); + break; + } + ldcp->cb_enabled = B_FALSE; + + D1(ldcp->id, "ldc_set_cb_mode: (0x%llx) disabled callbacks\n", + ldcp->id); + break; + + case LDC_CB_ENABLE: + if (ldcp->cb_enabled) { + DWARN(ldcp->id, + "ldc_set_cb_mode: (0x%llx) callbacks enabled\n", + ldcp->id); + break; + } + ldcp->cb_enabled = B_TRUE; + + D1(ldcp->id, "ldc_set_cb_mode: (0x%llx) enabled callbacks\n", + ldcp->id); + break; + } + + mutex_exit(&ldcp->lock); + + return (0); +} + +/* + * Check to see if there are packets on the incoming queue + * Will return isempty = B_FALSE if there are packets + */ +int +ldc_chkq(ldc_handle_t handle, boolean_t *isempty) +{ + int rv; + uint64_t rx_head, rx_tail; + ldc_chan_t *ldcp; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_chkq: invalid 
channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + *isempty = B_TRUE; + + mutex_enter(&ldcp->lock); + + if (ldcp->tstate != TS_UP) { + D1(ldcp->id, + "ldc_chkq: (0x%llx) channel is not up\n", ldcp->id); + mutex_exit(&ldcp->lock); + return (ECONNRESET); + } + + /* Read packet(s) from the queue */ + rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail, + &ldcp->link_state); + if (rv != 0) { + cmn_err(CE_WARN, + "ldc_chkq: (0x%lx) unable to read queue ptrs", ldcp->id); + mutex_exit(&ldcp->lock); + return (EIO); + } + /* reset the channel state if the channel went down */ + if (ldcp->link_state == LDC_CHANNEL_DOWN || + ldcp->link_state == LDC_CHANNEL_RESET) { + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + return (ECONNRESET); + } + + if (rx_head != rx_tail) { + D1(ldcp->id, "ldc_chkq: (0x%llx) queue has pkt(s)\n", ldcp->id); + *isempty = B_FALSE; + } + + mutex_exit(&ldcp->lock); + + return (0); +} + + +/* + * Read 'size' amount of bytes or less. If incoming buffer + * is more than 'size', ENOBUFS is returned. + * + * On return, size contains the number of bytes read. 
 */
int
ldc_read(ldc_handle_t handle, caddr_t bufp, size_t *sizep)
{
	ldc_chan_t 	*ldcp;
	uint64_t 	rx_head = 0, rx_tail = 0;
	int		rv = 0, exit_val;

	if (handle == NULL) {
		DWARN(DBG_ALL_LDCS, "ldc_read: invalid channel handle\n");
		return (EINVAL);
	}

	ldcp = (ldc_chan_t *)handle;

	/* channel lock */
	mutex_enter(&ldcp->lock);

	if (ldcp->tstate != TS_UP) {
		DWARN(ldcp->id,
		    "ldc_read: (0x%llx) channel is not in UP state\n",
		    ldcp->id);
		exit_val = ECONNRESET;
	} else {
		/* dispatch to the mode-specific read routine (raw/pkt/stream) */
		exit_val = ldcp->read_p(ldcp, bufp, sizep);
	}

	/*
	 * if queue has been drained - clear interrupt
	 */
	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
	    &ldcp->link_state);
	if (exit_val == 0 && rv == 0 && rx_head == rx_tail) {
		i_ldc_clear_intr(ldcp, CNEX_RX_INTR);
	}

	mutex_exit(&ldcp->lock);
	return (exit_val);
}

/*
 * Basic raw mondo read -
 * no interpretation of mondo contents at all.
 *
 * Enter and exit with ldcp->lock held by caller
 */
static int
i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
{
	uint64_t 	q_size_mask;
	ldc_msg_t 	*msgp;
	uint8_t		*msgbufp;
	int		rv = 0, space;
	uint64_t 	rx_head, rx_tail;

	space = *sizep;

	/* in RAW mode reads are always a full fixed-size payload */
	if (space < LDC_PAYLOAD_SIZE_RAW)
		return (ENOBUFS);

	ASSERT(mutex_owned(&ldcp->lock));

	/* compute mask for increment */
	q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT;

	/*
	 * Read packet(s) from the queue
	 */
	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
	    &ldcp->link_state);
	if (rv != 0) {
		cmn_err(CE_WARN,
		    "ldc_read_raw: (0x%lx) unable to read queue ptrs",
		    ldcp->id);
		return (EIO);
	}
	D1(ldcp->id, "ldc_read_raw: (0x%llx) rxh=0x%llx,"
	    " rxt=0x%llx, st=0x%llx\n",
	    ldcp->id, rx_head, rx_tail, ldcp->link_state);

	/* reset the channel state if the channel went down */
	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
		i_ldc_reset(ldcp);
		return (ECONNRESET);
	}

	/*
	 * Check for empty queue
	 */
	if (rx_head == rx_tail) {
		*sizep = 0;
		return (0);
	}

	/* get the message */
	msgp = (ldc_msg_t *)(ldcp->rx_q_va + rx_head);

	/* if channel is in RAW mode, copy data and return */
	msgbufp = (uint8_t *)&(msgp->raw[0]);

	bcopy(msgbufp, target_bufp, LDC_PAYLOAD_SIZE_RAW);

	DUMP_PAYLOAD(ldcp->id, msgbufp);

	*sizep = LDC_PAYLOAD_SIZE_RAW;

	/* consume the packet: advance head past it */
	rx_head = (rx_head + LDC_PACKET_SIZE) & q_size_mask;
	(void) i_ldc_set_rx_head(ldcp, rx_head);

	return (rv);
}

/*
 * Process LDC mondos to build larger packets
 * with either un-reliable or reliable delivery.
 *
 * Enter and exit with ldcp->lock held by caller
 */
static int
i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
{
	int		rv = 0;
	uint64_t 	rx_head = 0, rx_tail = 0;
	uint64_t 	curr_head = 0;
	ldc_msg_t 	*msg;
	caddr_t 	target;
	size_t 		len = 0, bytes_read = 0;
	int 		loop_cnt = 0, chk_cnt = 0;
	uint64_t 	q_size_mask;

	target = target_bufp;

	ASSERT(mutex_owned(&ldcp->lock));

	/* reset first frag to 0 */
	ldcp->first_fragment = 0;

	/* compute mask for increment */
	q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT;

	/*
	 * Read packet(s) from the queue
	 */
	rv = hv_ldc_rx_get_state(ldcp->id, &curr_head, &rx_tail,
	    &ldcp->link_state);
	if (rv != 0) {
		cmn_err(CE_WARN,
		    "ldc_read: (0x%lx) unable to read queue ptrs",
		    ldcp->id);
		return (EIO);
	}
	D1(ldcp->id, "ldc_read: (0x%llx) chd=0x%llx, tl=0x%llx, st=0x%llx\n",
	    ldcp->id, curr_head, rx_tail, ldcp->link_state);

	/* reset the channel state if the channel went down */
	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
		i_ldc_reset(ldcp);
		return (ECONNRESET);
	}

	for (;;) {

		if (curr_head == rx_tail) {
			/* queue looks drained; re-read tail from the HV */
			rv = hv_ldc_rx_get_state(ldcp->id,
			    &rx_head, &rx_tail, &ldcp->link_state);
			if (rv != 0) {
				cmn_err(CE_WARN,
				    "ldc_read: (0x%lx) cannot read queue ptrs",
				    ldcp->id);
				return (EIO);
			}
			/* reset the channel state if the channel went down */
			if (ldcp->link_state == LDC_CHANNEL_DOWN) {
				i_ldc_reset(ldcp);
				return (ECONNRESET);
			}
		}

		if (curr_head == rx_tail) {

			/*
			 * If in the middle of a fragmented xfer, busy-wait
			 * (bounded by LDC_LOOP_CNT * LDC_CHK_CNT iterations)
			 * for the remaining fragments before giving up.
			 */
			if (ldcp->first_fragment != 0) {
				if (++loop_cnt > LDC_LOOP_CNT) {
					loop_cnt = 0;
					++chk_cnt;
				}
				if (chk_cnt < LDC_CHK_CNT) {
					continue;
				} else {
					*sizep = 0;
					/* rewind so the xfer can be resent */
					ldcp->last_msg_rcd =
					    ldcp->first_fragment - 1;
					DWARN(DBG_ALL_LDCS,
					    "ldc_read: (0x%llx) read timeout",
					    ldcp->id);
					return (ETIMEDOUT);
				}
			}
			*sizep = 0;
			break;
		}
		loop_cnt = 0;
		chk_cnt = 0;

		D2(ldcp->id,
		    "ldc_read: (0x%llx) chd=0x%llx, rxhd=0x%llx, rxtl=0x%llx\n",
		    ldcp->id, curr_head, rx_head, rx_tail);

		/* get the message */
		msg = (ldc_msg_t *)(ldcp->rx_q_va + curr_head);

		DUMP_LDC_PKT(ldcp, "ldc_read received pkt",
		    ldcp->rx_q_va + curr_head);

		/* Check the message ID for the message received */
		if ((rv = i_ldc_check_seqid(ldcp, msg)) != 0) {

			DWARN(ldcp->id, "ldc_read: (0x%llx) seqid error, "
			    "q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail);

			/* Reset last_msg_rcd to start of message */
			if (ldcp->first_fragment != 0) {
				ldcp->last_msg_rcd =
				    ldcp->first_fragment - 1;
				ldcp->first_fragment = 0;
			}
			/*
			 * Send a NACK -- invalid seqid
			 * get the current tail for the response
			 */
			rv = i_ldc_send_pkt(ldcp, msg->type, LDC_NACK,
			    (msg->ctrl & LDC_CTRL_MASK));
			if (rv) {
				cmn_err(CE_NOTE,
				    "ldc_read: (0x%lx) err sending "
				    "NACK msg\n", ldcp->id);
			}

			/* purge receive queue */
			(void) i_ldc_set_rx_head(ldcp, rx_tail);

			break;
		}

		/*
		 * Process any messages of type CTRL messages
		 * Future implementations should try to pass these to
		 * LDC transport by resetting the intr state.
		 *
		 * NOTE: not done as a switch() as type can be both ctrl+data
		 */
		if (msg->type & LDC_CTRL) {
			if (rv = i_ldc_ctrlmsg(ldcp, msg)) {
				if (rv == EAGAIN)
					continue;
				(void) i_ldc_set_rx_head(ldcp, rx_tail);
				*sizep = 0;
				bytes_read = 0;
				rv = ECONNRESET;
				break;
			}
		}

		/* process data ACKs */
		if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
			(void) i_ldc_process_data_ACK(ldcp, msg);
		}

		/* process data messages */
		if ((msg->type & LDC_DATA) && (msg->stype & LDC_INFO)) {

			/* reliable/stream modes carry payload in rdata */
			uint8_t *msgbuf = (uint8_t *)(
			    (ldcp->mode == LDC_MODE_RELIABLE ||
			    ldcp->mode == LDC_MODE_STREAM)
			    ? msg->rdata : msg->udata);

			D2(ldcp->id,
			    "ldc_read: (0x%llx) received data msg\n", ldcp->id);

			/* get the packet length */
			len = (msg->env & LDC_LEN_MASK);

			/*
			 * FUTURE OPTIMIZATION:
			 * dont need to set q head for every
			 * packet we read just need to do this when
			 * we are done or need to wait for more
			 * mondos to make a full packet - this is
			 * currently expensive.
			 */

			if (ldcp->first_fragment == 0) {

				/*
				 * first packets should always have the start
				 * bit set (even for a single packet). If not
				 * throw away the packet
				 */
				if (!(msg->env & LDC_FRAG_START)) {

					DWARN(DBG_ALL_LDCS,
					    "ldc_read: (0x%llx) not start - "
					    "frag=%x\n", ldcp->id,
					    (msg->env) & LDC_FRAG_MASK);

					/* toss pkt, inc head, cont reading */
					bytes_read = 0;
					target = target_bufp;
					curr_head =
					    (curr_head + LDC_PACKET_SIZE)
					    & q_size_mask;
					if (rv = i_ldc_set_rx_head(ldcp,
					    curr_head))
						break;

					continue;
				}

				ldcp->first_fragment = msg->seqid;
			} else {
				/* check to see if this is a pkt w/ START bit */
				if (msg->env & LDC_FRAG_START) {
					DWARN(DBG_ALL_LDCS,
					    "ldc_read:(0x%llx) unexpected pkt"
					    " env=0x%x discarding %d bytes,"
					    " lastmsg=%d, currentmsg=%d\n",
					    ldcp->id, msg->env&LDC_FRAG_MASK,
					    bytes_read, ldcp->last_msg_rcd,
					    msg->seqid);

					/* throw data we have read so far */
					bytes_read = 0;
					target = target_bufp;
					ldcp->first_fragment = msg->seqid;

					if (rv = i_ldc_set_rx_head(ldcp,
					    curr_head))
						break;
				}
			}

			/* copy (next) pkt into buffer */
			if (len <= (*sizep - bytes_read)) {
				bcopy(msgbuf, target, len);
				target += len;
				bytes_read += len;
			} else {
				/*
				 * there is not enough space in the buffer to
				 * read this pkt. throw message away & continue
				 * reading data from queue
				 */
				DWARN(DBG_ALL_LDCS,
				    "ldc_read: (0x%llx) buffer too small, "
				    "head=0x%lx, expect=%d, got=%d\n", ldcp->id,
				    curr_head, *sizep, bytes_read+len);

				ldcp->first_fragment = 0;
				target = target_bufp;
				bytes_read = 0;

				/* throw away everything received so far */
				if (rv = i_ldc_set_rx_head(ldcp, curr_head))
					break;

				/* continue reading remaining pkts */
				continue;
			}
		}

		/* set the message id */
		ldcp->last_msg_rcd = msg->seqid;

		/* move the head one position */
		curr_head = (curr_head + LDC_PACKET_SIZE) & q_size_mask;

		if (msg->env & LDC_FRAG_STOP) {

			/*
			 * All pkts that are part of this fragmented transfer
			 * have been read or this was a single pkt read
			 * or there was an error
			 */

			/* set the queue head */
			if (rv = i_ldc_set_rx_head(ldcp, curr_head))
				bytes_read = 0;

			*sizep = bytes_read;

			break;
		}

		/* advance head if it is a DATA ACK */
		if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {

			/* set the queue head */
			if (rv = i_ldc_set_rx_head(ldcp, curr_head)) {
				bytes_read = 0;
				break;
			}

			D2(ldcp->id, "ldc_read: (0x%llx) set ACK qhead 0x%llx",
			    ldcp->id, curr_head);
		}

	} /* for (;;) */


	/*
	 * If useful data was read - Send msg ACK
	 * OPTIMIZE: do not send ACK for all msgs - use some frequency
	 */
	if ((bytes_read > 0) && (ldcp->mode == LDC_MODE_RELIABLE ||
	    ldcp->mode == LDC_MODE_STREAM)) {

		rv = i_ldc_send_pkt(ldcp, LDC_DATA, LDC_ACK, 0);
		if (rv != 0) {
			cmn_err(CE_NOTE,
			    "ldc_read: (0x%lx) cannot send ACK\n", ldcp->id);
			return (0);
		}
	}

	D2(ldcp->id, "ldc_read: (0x%llx) end size=%d", ldcp->id, *sizep);

	return (rv);
}

/*
 * Use underlying reliable packet mechanism to fetch
 * and buffer incoming packets so we can hand them back as
 * a basic byte stream.
+ * + * Enter and exit with ldcp->lock held by caller + */ +static int +i_ldc_read_stream(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) +{ + int rv; + size_t size; + + ASSERT(mutex_owned(&ldcp->lock)); + + D2(ldcp->id, "i_ldc_read_stream: (0x%llx) buffer size=%d", + ldcp->id, *sizep); + + if (ldcp->stream_remains == 0) { + size = ldcp->mtu; + rv = i_ldc_read_packet(ldcp, + (caddr_t)ldcp->stream_bufferp, &size); + D2(ldcp->id, "i_ldc_read_stream: read packet (0x%llx) size=%d", + ldcp->id, size); + + if (rv != 0) + return (rv); + + ldcp->stream_remains = size; + ldcp->stream_offset = 0; + } + + size = MIN(ldcp->stream_remains, *sizep); + + bcopy(ldcp->stream_bufferp + ldcp->stream_offset, target_bufp, size); + ldcp->stream_offset += size; + ldcp->stream_remains -= size; + + D2(ldcp->id, "i_ldc_read_stream: (0x%llx) fill from buffer size=%d", + ldcp->id, size); + + *sizep = size; + return (0); +} + +/* + * Write specified amount of bytes to the channel + * in multiple pkts of pkt_payload size. Each + * packet is tagged with an unique packet ID in + * the case of a reliable transport. + * + * On return, size contains the number of bytes written. 
+ */ +int +ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep) +{ + ldc_chan_t *ldcp; + int rv = 0; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_write: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + /* check if non-zero data to write */ + if (buf == NULL || sizep == NULL) { + DWARN(ldcp->id, "ldc_write: (0x%llx) invalid data write\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EINVAL); + } + + if (*sizep == 0) { + DWARN(ldcp->id, "ldc_write: (0x%llx) write size of zero\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (0); + } + + /* Check if channel is UP for data exchange */ + if (ldcp->tstate != TS_UP) { + DWARN(ldcp->id, + "ldc_write: (0x%llx) channel is not in UP state\n", + ldcp->id); + *sizep = 0; + rv = ECONNRESET; + } else { + rv = ldcp->write_p(ldcp, buf, sizep); + } + + mutex_exit(&ldcp->lock); + + return (rv); +} + +/* + * Write a raw packet to the channel + * On return, size contains the number of bytes written. 
 */
static int
i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
{
	ldc_msg_t 	*ldcmsg;
	uint64_t 	tx_head, tx_tail, new_tail;
	int		rv = 0;
	size_t		size;

	ASSERT(mutex_owned(&ldcp->lock));
	ASSERT(ldcp->mode == LDC_MODE_RAW);

	size = *sizep;

	/*
	 * Check to see if the packet size is less than or
	 * equal to packet size support in raw mode
	 */
	if (size > ldcp->pkt_payload) {
		DWARN(ldcp->id,
		    "ldc_write: (0x%llx) invalid size (0x%llx) for RAW mode\n",
		    ldcp->id, *sizep);
		*sizep = 0;
		return (EMSGSIZE);
	}

	/* get the qptrs for the tx queue */
	rv = hv_ldc_tx_get_state(ldcp->id,
	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
	if (rv != 0) {
		cmn_err(CE_WARN,
		    "ldc_write: (0x%lx) cannot read queue ptrs\n", ldcp->id);
		*sizep = 0;
		return (EIO);
	}

	if (ldcp->link_state == LDC_CHANNEL_DOWN ||
	    ldcp->link_state == LDC_CHANNEL_RESET) {
		DWARN(ldcp->id,
		    "ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
		i_ldc_reset(ldcp);
		*sizep = 0;
		return (ECONNRESET);
	}

	tx_tail = ldcp->tx_tail;
	tx_head = ldcp->tx_head;
	/* ring-wrap: mask is (qsize - pktsize) since entries are pow-of-2 */
	new_tail = (tx_tail + LDC_PACKET_SIZE) &
	    ((ldcp->tx_q_entries-1) << LDC_PACKET_SHIFT);

	if (new_tail == tx_head) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_write: (0x%llx) TX queue is full\n", ldcp->id);
		*sizep = 0;
		return (EWOULDBLOCK);
	}

	D2(ldcp->id, "ldc_write: (0x%llx) start xfer size=%d",
	    ldcp->id, size);

	/* Send the data now */
	ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);

	/* copy the data into pkt */
	bcopy((uint8_t *)buf, ldcmsg, size);

	/* increment tail */
	tx_tail = new_tail;

	/*
	 * All packets have been copied into the TX queue
	 * update the tail ptr in the HV
	 */
	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
	if (rv) {
		if (rv == EWOULDBLOCK) {
			DWARN(ldcp->id, "ldc_write: (0x%llx) write timed out\n",
			    ldcp->id);
			*sizep = 0;
			return (EWOULDBLOCK);
		}

		/* cannot write data - reset channel */
		i_ldc_reset(ldcp);
		*sizep = 0;
		return (ECONNRESET);
	}

	ldcp->tx_tail = tx_tail;
	*sizep = size;

	D2(ldcp->id, "ldc_write: (0x%llx) end xfer size=%d", ldcp->id, size);

	return (rv);
}


/*
 * Write specified amount of bytes to the channel
 * in multiple pkts of pkt_payload size. Each
 * packet is tagged with an unique packet ID in
 * the case of a reliable transport.
 *
 * On return, size contains the number of bytes written.
 * This function needs to ensure that the write size is < MTU size
 */
static int
i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
{
	ldc_msg_t 	*ldcmsg;
	uint64_t 	tx_head, tx_tail, new_tail, start;
	uint64_t 	txq_size_mask, numavail;
	uint8_t 	*msgbuf, *source = (uint8_t *)buf;
	size_t 		len, bytes_written = 0, remaining;
	int		rv;
	uint32_t	curr_seqid;

	ASSERT(mutex_owned(&ldcp->lock));

	ASSERT(ldcp->mode == LDC_MODE_RELIABLE ||
	    ldcp->mode == LDC_MODE_UNRELIABLE ||
	    ldcp->mode == LDC_MODE_STREAM);

	/* compute mask for increment */
	txq_size_mask = (ldcp->tx_q_entries - 1) << LDC_PACKET_SHIFT;

	/* get the qptrs for the tx queue */
	rv = hv_ldc_tx_get_state(ldcp->id,
	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
	if (rv != 0) {
		cmn_err(CE_WARN,
		    "ldc_write: (0x%lx) cannot read queue ptrs\n", ldcp->id);
		*size = 0;
		return (EIO);
	}

	if (ldcp->link_state == LDC_CHANNEL_DOWN ||
	    ldcp->link_state == LDC_CHANNEL_RESET) {
		DWARN(ldcp->id,
		    "ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
		*size = 0;
		i_ldc_reset(ldcp);
		return (ECONNRESET);
	}

	tx_tail = ldcp->tx_tail;
	new_tail = (tx_tail + LDC_PACKET_SIZE) %
	    (ldcp->tx_q_entries << LDC_PACKET_SHIFT);

	/*
	 * Transport mode determines whether we use HV Tx head or the
	 * private protocol head (corresponding to last ACKd pkt) for
	 * determining how much we can write
	 */
	tx_head = (ldcp->mode == LDC_MODE_RELIABLE ||
	    ldcp->mode == LDC_MODE_STREAM)
	    ? ldcp->tx_ackd_head : ldcp->tx_head;
	if (new_tail == tx_head) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_write: (0x%llx) TX queue is full\n", ldcp->id);
		*size = 0;
		return (EWOULDBLOCK);
	}

	/*
	 * Make sure that the LDC Tx queue has enough space
	 */
	numavail = (tx_head >> LDC_PACKET_SHIFT) - (tx_tail >> LDC_PACKET_SHIFT)
	    + ldcp->tx_q_entries - 1;
	numavail %= ldcp->tx_q_entries;

	if (*size > (numavail * ldcp->pkt_payload)) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_write: (0x%llx) TX queue has no space\n", ldcp->id);
		return (EWOULDBLOCK);
	}

	D2(ldcp->id, "ldc_write: (0x%llx) start xfer size=%d",
	    ldcp->id, *size);

	/* Send the data now */
	bytes_written = 0;
	curr_seqid = ldcp->last_msg_snt;
	start = tx_tail;

	while (*size > bytes_written) {

		ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);

		/* reliable/stream modes carry payload in rdata */
		msgbuf = (uint8_t *)((ldcp->mode == LDC_MODE_RELIABLE ||
		    ldcp->mode == LDC_MODE_STREAM)
		    ? ldcmsg->rdata : ldcmsg->udata);

		ldcmsg->type = LDC_DATA;
		ldcmsg->stype = LDC_INFO;
		ldcmsg->ctrl = 0;

		remaining = *size - bytes_written;
		len = min(ldcp->pkt_payload, remaining);
		ldcmsg->env = (uint8_t)len;

		curr_seqid++;
		ldcmsg->seqid = curr_seqid;

		DUMP_LDC_PKT(ldcp, "ldc_write snd data", (uint64_t)ldcmsg);

		/* copy the data into pkt */
		bcopy(source, msgbuf, len);

		source += len;
		bytes_written += len;

		/* increment tail */
		tx_tail = (tx_tail + LDC_PACKET_SIZE) & txq_size_mask;

		ASSERT(tx_tail != tx_head);
	}

	/* Set the start and stop bits */
	ldcmsg->env |= LDC_FRAG_STOP;
	ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + start);
	ldcmsg->env |= LDC_FRAG_START;

	/*
	 * All packets have been copied into the TX queue
	 * update the tail ptr in the HV
	 */
	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
	if (rv == 0) {
		ldcp->tx_tail = tx_tail;
		ldcp->last_msg_snt = curr_seqid;
		*size = bytes_written;
	} else {
		int rv2;

		if (rv != EWOULDBLOCK) {
			/* cannot write data - reset channel */
			i_ldc_reset(ldcp);
			*size = 0;
			return (ECONNRESET);
		}

		DWARN(ldcp->id, "hv_tx_set_tail returns 0x%x (head 0x%x, "
		    "old tail 0x%x, new tail 0x%x, qsize=0x%x)\n",
		    rv, ldcp->tx_head, ldcp->tx_tail, tx_tail,
		    (ldcp->tx_q_entries << LDC_PACKET_SHIFT));

		rv2 = hv_ldc_tx_get_state(ldcp->id,
		    &tx_head, &tx_tail, &ldcp->link_state);

		DWARN(ldcp->id, "hv_ldc_tx_get_state returns 0x%x "
		    "(head 0x%x, tail 0x%x state 0x%x)\n",
		    rv2, tx_head, tx_tail, ldcp->link_state);

		*size = 0;
	}

	D2(ldcp->id, "ldc_write: (0x%llx) end xfer size=%d", ldcp->id, *size);

	return (rv);
}

/*
 * Write specified amount of bytes to the channel
 * in multiple pkts of pkt_payload size. Each
 * packet is tagged with an unique packet ID in
 * the case of a reliable transport.
 *
 * On return, size contains the number of bytes written.
 * This function needs to ensure that the write size is < MTU size
 */
static int
i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
{
	ASSERT(mutex_owned(&ldcp->lock));
	ASSERT(ldcp->mode == LDC_MODE_STREAM);

	/* Truncate packet to max of MTU size */
	if (*sizep > ldcp->mtu) *sizep = ldcp->mtu;
	return (i_ldc_write_packet(ldcp, buf, sizep));
}


/*
 * Interfaces for channel nexus to register/unregister with LDC module
 * The nexus will register functions to be used to register individual
 * channels with the nexus and enable interrupts for the channels
 */
int
ldc_register(ldc_cnex_t *cinfo)
{
	ldc_chan_t	*ldcp;

	if (cinfo == NULL || cinfo->dip == NULL ||
	    cinfo->reg_chan == NULL || cinfo->unreg_chan == NULL ||
	    cinfo->add_intr == NULL || cinfo->rem_intr == NULL ||
	    cinfo->clr_intr == NULL) {

		DWARN(DBG_ALL_LDCS, "ldc_register: invalid nexus info\n");
		return (EINVAL);
	}

	mutex_enter(&ldcssp->lock);

	/* nexus registration */
	ldcssp->cinfo.dip = cinfo->dip;
	ldcssp->cinfo.reg_chan = cinfo->reg_chan;
	ldcssp->cinfo.unreg_chan = cinfo->unreg_chan;
	ldcssp->cinfo.add_intr = cinfo->add_intr;
	ldcssp->cinfo.rem_intr = cinfo->rem_intr;
	ldcssp->cinfo.clr_intr = cinfo->clr_intr;

	/* register any channels that might have been previously initialized */
	ldcp = ldcssp->chan_list;
	while (ldcp) {
		if ((ldcp->tstate & TS_QCONF_RDY) &&
		    (ldcp->tstate & TS_CNEX_RDY) == 0)
			(void) i_ldc_register_channel(ldcp);

		ldcp = ldcp->next;
	}

	mutex_exit(&ldcssp->lock);

	return (0);
}

/*
 * Unregister the channel nexus callbacks; only the nexus that
 * originally registered (matching dip) may unregister.
 */
int
ldc_unregister(ldc_cnex_t *cinfo)
{
	if (cinfo == NULL || cinfo->dip == NULL) {
		DWARN(DBG_ALL_LDCS, "ldc_unregister: invalid nexus info\n");
		return (EINVAL);
	}

	mutex_enter(&ldcssp->lock);

	if (cinfo->dip != ldcssp->cinfo.dip) {
		DWARN(DBG_ALL_LDCS, "ldc_unregister: invalid dip\n");
		mutex_exit(&ldcssp->lock);
		return (EINVAL);
	}

	/* nexus unregister */
	ldcssp->cinfo.dip = NULL;
	ldcssp->cinfo.reg_chan = NULL;
	ldcssp->cinfo.unreg_chan = NULL;
	ldcssp->cinfo.add_intr = NULL;
	ldcssp->cinfo.rem_intr = NULL;
	ldcssp->cinfo.clr_intr = NULL;

	mutex_exit(&ldcssp->lock);

	return (0);
}


/* ------------------------------------------------------------------------- */

/*
 * Allocate a memory handle for the channel and link it into the list
 * Also choose which memory table to use if this is the first handle
 * being assigned to this channel
 */
int
ldc_mem_alloc_handle(ldc_handle_t handle, ldc_mem_handle_t *mhandle)
{
	ldc_chan_t 	*ldcp;
	ldc_mhdl_t	*mhdl;
	int 		rv;

	if (handle == NULL) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_mem_alloc_handle: invalid channel handle\n");
		return (EINVAL);
	}
	ldcp = (ldc_chan_t *)handle;

	mutex_enter(&ldcp->lock);

	/* check to see if channel is initalized */
	if (ldcp->tstate < TS_INIT) {
		DWARN(ldcp->id,
		    "ldc_mem_alloc_handle: (0x%llx) channel not initialized\n",
		    ldcp->id);
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/*
	 * If this channel is allocating a mem handle for the
	 * first time allocate it a memory map table and initialize it
	 */
	if (ldcp->mtbl == NULL) {

		ldc_mtbl_t *mtbl;

		/* Allocate and initialize the map table structure */
		mtbl = kmem_zalloc(sizeof (ldc_mtbl_t), KM_SLEEP);
		mtbl->size = MTBL_MAX_SIZE;
		mtbl->num_entries = mtbl->num_avail =
		    (MTBL_MAX_SIZE/sizeof (ldc_mte_slot_t));
		mtbl->next_entry = NULL;

		/* Allocate the table itself (physically contiguous for HV) */
		mtbl->table = (ldc_mte_slot_t *)
		    contig_mem_alloc_align(mtbl->size, MMU_PAGESIZE);
		if (mtbl->table == NULL) {
			cmn_err(CE_WARN,
			    "ldc_mem_alloc_handle: (0x%lx) error allocating "
			    "table memory", ldcp->id);
			kmem_free(mtbl, sizeof (ldc_mtbl_t));
			mutex_exit(&ldcp->lock);
			return (ENOMEM);
		}

		/* zero out the memory */
		bzero(mtbl->table, mtbl->size);

		/* initialize the lock */
		mutex_init(&mtbl->lock, NULL, MUTEX_DRIVER, NULL);

		/* register table for this channel */
		rv = hv_ldc_set_map_table(ldcp->id,
		    va_to_pa(mtbl->table), mtbl->num_entries);
		if (rv != 0) {
			cmn_err(CE_WARN,
			    "ldc_mem_alloc_handle: (0x%lx) err %d mapping tbl",
			    ldcp->id, rv);
			contig_mem_free(mtbl->table, mtbl->size);
			mutex_destroy(&mtbl->lock);
			kmem_free(mtbl, sizeof (ldc_mtbl_t));
			mutex_exit(&ldcp->lock);
			return (EIO);
		}

		ldcp->mtbl = mtbl;

		D1(ldcp->id,
		    "ldc_mem_alloc_handle: (0x%llx) alloc'd map table 0x%llx\n",
		    ldcp->id, ldcp->mtbl->table);
	}

	/* allocate handle for channel */
	mhdl = kmem_zalloc(sizeof (ldc_mhdl_t), KM_SLEEP);

	/* initialize the lock */
	mutex_init(&mhdl->lock, NULL, MUTEX_DRIVER, NULL);

	mhdl->status = LDC_UNBOUND;
	mhdl->ldcp = ldcp;

	/* insert memory handle (@ head) into list */
	if (ldcp->mhdl_list == NULL) {
		ldcp->mhdl_list = mhdl;
		mhdl->next = NULL;
	} else {
		/* insert @ head */
		mhdl->next = ldcp->mhdl_list;
		ldcp->mhdl_list = mhdl;
	}

	/* return the handle */
	*mhandle = (ldc_mem_handle_t)mhdl;

	mutex_exit(&ldcp->lock);

	D1(ldcp->id, "ldc_mem_alloc_handle: (0x%llx) allocated handle 0x%llx\n",
	    ldcp->id, mhdl);

	return (0);
}

/*
 * Free memory handle for the channel and unlink it from the list
 */
int
ldc_mem_free_handle(ldc_mem_handle_t mhandle)
{
	ldc_mhdl_t 	*mhdl, *phdl;
	ldc_chan_t 	*ldcp;

	if (mhandle == NULL) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_mem_free_handle: invalid memory handle\n");
		return (EINVAL);
	}
	mhdl = (ldc_mhdl_t *)mhandle;

	mutex_enter(&mhdl->lock);

	ldcp = mhdl->ldcp;

	/* a bound or mapped handle must be unbound/unmapped first */
	if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED) {
		DWARN(ldcp->id,
		    "ldc_mem_free_handle: cannot free, 0x%llx hdl bound\n",
		    mhdl);
		mutex_exit(&mhdl->lock);
		return (EINVAL);
	}
	mutex_exit(&mhdl->lock);

	mutex_enter(&ldcp->mlist_lock);

	phdl = ldcp->mhdl_list;

	/* first handle */
	if (phdl == mhdl) {
		ldcp->mhdl_list = mhdl->next;
		mutex_destroy(&mhdl->lock);
		kmem_free(mhdl, sizeof (ldc_mhdl_t));
		D1(ldcp->id,
		    "ldc_mem_free_handle: (0x%llx) freed handle 0x%llx\n",
		    ldcp->id, mhdl);
	} else {
		/* walk the list - unlink and free */
		while (phdl != NULL) {
			if (phdl->next == mhdl) {
				phdl->next = mhdl->next;
				mutex_destroy(&mhdl->lock);
				kmem_free(mhdl, sizeof (ldc_mhdl_t));
				D1(ldcp->id,
				    "ldc_mem_free_handle: (0x%llx) freed "
				    "handle 0x%llx\n", ldcp->id, mhdl);
				break;
			}
			phdl = phdl->next;
		}
	}

	if (phdl == NULL) {
		DWARN(ldcp->id,
		    "ldc_mem_free_handle: invalid handle 0x%llx\n", mhdl);
		mutex_exit(&ldcp->mlist_lock);
		return (EINVAL);
	}

	mutex_exit(&ldcp->mlist_lock);

	return (0);
}

/*
 * Bind a memory handle to a virtual address.
 * The virtual address is converted to the corresponding real addresses.
 * Returns pointer to the first ldc_mem_cookie and the total number
 * of cookies for this virtual address. Other cookies can be obtained
 * using the ldc_mem_nextcookie() call. If the pages are stored in
 * consecutive locations in the table, a single cookie corresponding to
 * the first location is returned. The cookie size spans all the entries.
+ * + * If the VA corresponds to a page that is already being exported, reuse + * the page and do not export it again. Bump the page's use count. + */ +int +ldc_mem_bind_handle(ldc_mem_handle_t mhandle, caddr_t vaddr, size_t len, + uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount) +{ + ldc_mhdl_t *mhdl; + ldc_chan_t *ldcp; + ldc_mtbl_t *mtbl; + ldc_memseg_t *memseg; + ldc_mte_t tmp_mte; + uint64_t index, prev_index = 0; + int64_t cookie_idx; + uintptr_t raddr, ra_aligned; + uint64_t psize, poffset, v_offset; + uint64_t pg_shift, pg_size, pg_size_code, pg_mask; + pgcnt_t npages; + caddr_t v_align, addr; + int i; + + if (mhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_bind_handle: invalid memory handle\n"); + return (EINVAL); + } + mhdl = (ldc_mhdl_t *)mhandle; + ldcp = mhdl->ldcp; + mtbl = ldcp->mtbl; + + /* clear count */ + *ccount = 0; + + mutex_enter(&mhdl->lock); + + if (mhdl->status == LDC_BOUND || mhdl->memseg != NULL) { + DWARN(ldcp->id, + "ldc_mem_bind_handle: (0x%x) handle already bound\n", + mhandle); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + /* Force address and size to be 8-byte aligned */ + if ((((uintptr_t)vaddr | len) & 0x7) != 0) { + DWARN(ldcp->id, + "ldc_mem_bind_handle: addr/size is not 8-byte aligned\n"); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + /* FUTURE: get the page size, pgsz code, and shift */ + pg_size = MMU_PAGESIZE; + pg_size_code = page_szc(pg_size); + pg_shift = page_get_shift(pg_size_code); + pg_mask = ~(pg_size - 1); + + D1(ldcp->id, "ldc_mem_bind_handle: (0x%llx) binding " + "va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n", + ldcp->id, vaddr, pg_size, pg_size_code, pg_shift); + + /* aligned VA and its offset */ + v_align = (caddr_t)(((uintptr_t)vaddr) & ~(pg_size - 1)); + v_offset = ((uintptr_t)vaddr) & (pg_size - 1); + + npages = (len+v_offset)/pg_size; + npages = ((len+v_offset)%pg_size == 0) ? 
npages : npages+1; + + D1(ldcp->id, "ldc_mem_bind_handle: binding " + "(0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n", + ldcp->id, vaddr, v_align, v_offset, npages); + + /* lock the memory table - exclusive access to channel */ + mutex_enter(&mtbl->lock); + + if (npages > mtbl->num_avail) { + DWARN(ldcp->id, + "ldc_mem_bind_handle: (0x%llx) no table entries\n", + ldcp->id); + mutex_exit(&mtbl->lock); + mutex_exit(&mhdl->lock); + return (ENOMEM); + } + + /* Allocate a memseg structure */ + memseg = mhdl->memseg = kmem_zalloc(sizeof (ldc_memseg_t), KM_SLEEP); + + /* Allocate memory to store all pages and cookies */ + memseg->pages = kmem_zalloc((sizeof (ldc_page_t) * npages), KM_SLEEP); + memseg->cookies = + kmem_zalloc((sizeof (ldc_mem_cookie_t) * npages), KM_SLEEP); + + D2(ldcp->id, "ldc_mem_bind_handle: (0x%llx) processing 0x%llx pages\n", + ldcp->id, npages); + + addr = v_align; + + /* + * Table slots are used in a round-robin manner. The algorithm permits + * inserting duplicate entries. Slots allocated earlier will typically + * get freed before we get back to reusing the slot.Inserting duplicate + * entries should be OK as we only lookup entries using the cookie addr + * i.e. tbl index, during export, unexport and copy operation. + * + * One implementation what was tried was to search for a duplicate + * page entry first and reuse it. The search overhead is very high and + * in the vnet case dropped the perf by almost half, 50 to 24 mbps. + * So it does make sense to avoid searching for duplicates. + * + * But during the process of searching for a free slot, if we find a + * duplicate entry we will go ahead and use it, and bump its use count. + */ + + /* index to start searching from */ + index = mtbl->next_entry; + cookie_idx = -1; + + tmp_mte.ll = 0; /* initialise fields to 0 */ + + if (mtype & LDC_DIRECT_MAP) { + tmp_mte.mte_r = (perm & LDC_MEM_R) ? 1 : 0; + tmp_mte.mte_w = (perm & LDC_MEM_W) ? 1 : 0; + tmp_mte.mte_x = (perm & LDC_MEM_X) ? 
1 : 0; + } + + if (mtype & LDC_SHADOW_MAP) { + tmp_mte.mte_cr = (perm & LDC_MEM_R) ? 1 : 0; + tmp_mte.mte_cw = (perm & LDC_MEM_W) ? 1 : 0; + } + + if (mtype & LDC_IO_MAP) { + tmp_mte.mte_ir = (perm & LDC_MEM_R) ? 1 : 0; + tmp_mte.mte_iw = (perm & LDC_MEM_W) ? 1 : 0; + } + + D1(ldcp->id, "ldc_mem_bind_handle mte=0x%llx\n", tmp_mte.ll); + + tmp_mte.mte_pgszc = pg_size_code; + + /* initialize each mem table entry */ + for (i = 0; i < npages; i++) { + + /* check if slot is available in the table */ + while (mtbl->table[index].entry.ll != 0) { + + index = (index + 1) % mtbl->num_entries; + + if (index == mtbl->next_entry) { + /* we have looped around */ + DWARN(DBG_ALL_LDCS, + "ldc_mem_bind_handle: (0x%llx) cannot find " + "entry\n", ldcp->id); + *ccount = 0; + + /* NOTE: free memory, remove previous entries */ + /* this shouldnt happen as num_avail was ok */ + + mutex_exit(&mtbl->lock); + mutex_exit(&mhdl->lock); + return (ENOMEM); + } + } + + /* get the real address */ + raddr = va_to_pa((void *)addr); + ra_aligned = ((uintptr_t)raddr & pg_mask); + + /* build the mte */ + tmp_mte.mte_rpfn = ra_aligned >> pg_shift; + + D1(ldcp->id, "ldc_mem_bind_handle mte=0x%llx\n", tmp_mte.ll); + + /* update entry in table */ + mtbl->table[index].entry = tmp_mte; + + D2(ldcp->id, "ldc_mem_bind_handle: (0x%llx) stored MTE 0x%llx" + " into loc 0x%llx\n", ldcp->id, tmp_mte.ll, index); + + /* calculate the size and offset for this export range */ + if (i == 0) { + /* first page */ + psize = min((pg_size - v_offset), len); + poffset = v_offset; + + } else if (i == (npages - 1)) { + /* last page */ + psize = (((uintptr_t)(vaddr + len)) & + ((uint64_t)(pg_size-1))); + if (psize == 0) + psize = pg_size; + poffset = 0; + + } else { + /* middle pages */ + psize = pg_size; + poffset = 0; + } + + /* store entry for this page */ + memseg->pages[i].index = index; + memseg->pages[i].raddr = raddr; + memseg->pages[i].offset = poffset; + memseg->pages[i].size = psize; + memseg->pages[i].mte = 
&(mtbl->table[index]); + + /* create the cookie */ + if (i == 0 || (index != prev_index + 1)) { + cookie_idx++; + memseg->cookies[cookie_idx].addr = + IDX2COOKIE(index, pg_size_code, pg_shift); + memseg->cookies[cookie_idx].addr |= poffset; + memseg->cookies[cookie_idx].size = psize; + + } else { + memseg->cookies[cookie_idx].size += psize; + } + + D1(ldcp->id, "ldc_mem_bind_handle: bound " + "(0x%llx) va=0x%llx, idx=0x%llx, " + "ra=0x%llx(sz=0x%x,off=0x%x)\n", + ldcp->id, addr, index, raddr, psize, poffset); + + /* decrement number of available entries */ + mtbl->num_avail--; + + /* increment va by page size */ + addr += pg_size; + + /* increment index */ + prev_index = index; + index = (index + 1) % mtbl->num_entries; + + /* save the next slot */ + mtbl->next_entry = index; + } + + mutex_exit(&mtbl->lock); + + /* memory handle = bound */ + mhdl->mtype = mtype; + mhdl->perm = perm; + mhdl->status = LDC_BOUND; + + /* update memseg_t */ + memseg->vaddr = vaddr; + memseg->raddr = memseg->pages[0].raddr; + memseg->size = len; + memseg->npages = npages; + memseg->ncookies = cookie_idx + 1; + memseg->next_cookie = (memseg->ncookies > 1) ? 
	    1 : 0;

	/* return the cookie count and the first cookie to the caller */
	*ccount = memseg->ncookies;
	cookie->addr = memseg->cookies[0].addr;
	cookie->size = memseg->cookies[0].size;

	D1(ldcp->id,
	    "ldc_mem_bind_handle: (0x%llx) bound 0x%llx, va=0x%llx, "
	    "pgs=0x%llx cookies=0x%llx\n",
	    ldcp->id, mhdl, vaddr, npages, memseg->ncookies);

	mutex_exit(&mhdl->lock);
	return (0);
}

/*
 * Return the next cookie associated with the specified memory handle.
 * Cookie 0 was already handed back by ldc_mem_bind_handle(); this hands
 * out cookies in order starting at next_cookie and wraps next_cookie
 * back to 0 after the last one, at which point further calls fail with
 * EINVAL until the handle is rebound.
 */
int
ldc_mem_nextcookie(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie)
{
	ldc_mhdl_t	*mhdl;
	ldc_chan_t	*ldcp;
	ldc_memseg_t	*memseg;

	if (mhandle == NULL) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_mem_nextcookie: invalid memory handle\n");
		return (EINVAL);
	}
	mhdl = (ldc_mhdl_t *)mhandle;

	mutex_enter(&mhdl->lock);

	ldcp = mhdl->ldcp;
	memseg = mhdl->memseg;

	/* cookie arg is checked after the handle so ldcp->id is available */
	if (cookie == 0) {
		DWARN(ldcp->id,
		    "ldc_mem_nextcookie:(0x%llx) invalid cookie arg\n",
		    ldcp->id);
		mutex_exit(&mhdl->lock);
		return (EINVAL);
	}

	if (memseg->next_cookie != 0) {
		cookie->addr = memseg->cookies[memseg->next_cookie].addr;
		cookie->size = memseg->cookies[memseg->next_cookie].size;
		memseg->next_cookie++;
		/* wrap back to zero once the last cookie has been returned */
		if (memseg->next_cookie == memseg->ncookies)
			memseg->next_cookie = 0;

	} else {
		DWARN(ldcp->id,
		    "ldc_mem_nextcookie:(0x%llx) no more cookies\n", ldcp->id);
		cookie->addr = 0;
		cookie->size = 0;
		mutex_exit(&mhdl->lock);
		return (EINVAL);
	}

	D1(ldcp->id,
	    "ldc_mem_nextcookie: (0x%llx) cookie addr=0x%llx,sz=0x%llx\n",
	    ldcp->id, cookie->addr, cookie->size);

	mutex_exit(&mhdl->lock);
	return (0);
}

/*
 * Unbind the virtual memory region associated with the specified
 * memory handle. All associated cookies are freed and the corresponding
 * RA space is no longer exported.
+ */ +int +ldc_mem_unbind_handle(ldc_mem_handle_t mhandle) +{ + ldc_mhdl_t *mhdl; + ldc_chan_t *ldcp; + ldc_mtbl_t *mtbl; + ldc_memseg_t *memseg; + int i; + + if (mhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_unbind_handle: invalid memory handle\n"); + return (EINVAL); + } + mhdl = (ldc_mhdl_t *)mhandle; + + mutex_enter(&mhdl->lock); + + if (mhdl->status == LDC_UNBOUND) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_unbind_handle: (0x%x) handle is not bound\n", + mhandle); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + ldcp = mhdl->ldcp; + mtbl = ldcp->mtbl; + + memseg = mhdl->memseg; + + /* lock the memory table - exclusive access to channel */ + mutex_enter(&mtbl->lock); + + /* undo the pages exported */ + for (i = 0; i < memseg->npages; i++) { + + /* FUTURE: check for mapped pages */ + if (memseg->pages[i].mte->cookie) { + _NOTE(EMPTY) + } + + /* clear the entry from the table */ + memseg->pages[i].mte->entry.ll = 0; + mtbl->num_avail++; + } + mutex_exit(&mtbl->lock); + + /* free the allocated memseg and page structures */ + kmem_free(memseg->pages, (sizeof (ldc_page_t) * memseg->npages)); + kmem_free(memseg->cookies, + (sizeof (ldc_mem_cookie_t) * memseg->npages)); + kmem_free(memseg, sizeof (ldc_memseg_t)); + + /* uninitialize the memory handle */ + mhdl->memseg = NULL; + mhdl->status = LDC_UNBOUND; + + D1(ldcp->id, "ldc_mem_unbind_handle: (0x%llx) unbound handle 0x%llx\n", + ldcp->id, mhdl); + + mutex_exit(&mhdl->lock); + return (0); +} + +/* + * Get information about the dring. The base address of the descriptor + * ring along with the type and permission are returned back. 
+ */ +int +ldc_mem_info(ldc_mem_handle_t mhandle, ldc_mem_info_t *minfo) +{ + ldc_mhdl_t *mhdl; + + if (mhandle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_mem_info: invalid memory handle\n"); + return (EINVAL); + } + mhdl = (ldc_mhdl_t *)mhandle; + + if (minfo == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_mem_info: invalid args\n"); + return (EINVAL); + } + + mutex_enter(&mhdl->lock); + + minfo->status = mhdl->status; + if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED) { + minfo->vaddr = mhdl->memseg->vaddr; + minfo->raddr = mhdl->memseg->raddr; + minfo->mtype = mhdl->mtype; + minfo->perm = mhdl->perm; + } + mutex_exit(&mhdl->lock); + + return (0); +} + +/* + * Copy data either from or to the client specified virtual address + * space to or from the exported memory associated with the cookies. + * The direction argument determines whether the data is read from or + * written to exported memory. + */ +int +ldc_mem_copy(ldc_handle_t handle, caddr_t vaddr, uint64_t off, size_t *size, + ldc_mem_cookie_t *cookies, uint32_t ccount, uint8_t direction) +{ + ldc_chan_t *ldcp; + uint64_t local_voff, local_valign; + uint64_t cookie_addr, cookie_size; + uint64_t pg_shift, pg_size, pg_size_code; + uint64_t export_caddr, export_poff, export_psize, export_size; + uint64_t local_ra, local_poff, local_psize; + uint64_t copy_size, copied_len = 0, total_bal = 0, idx = 0; + pgcnt_t npages; + size_t len = *size; + int i, rv = 0; + + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_mem_copy: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + mutex_enter(&ldcp->lock); + + /* check to see if channel is UP */ + if (ldcp->tstate != TS_UP) { + DWARN(ldcp->id, "ldc_mem_copy: (0x%llx) channel is not UP\n", + ldcp->id); + mutex_exit(&ldcp->lock); + return (EINVAL); + } + + /* Force address and size to be 8-byte aligned */ + if ((((uintptr_t)vaddr | len) & 0x7) != 0) { + DWARN(ldcp->id, + "ldc_mem_copy: addr/sz is not 8-byte aligned\n"); + 
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/* Find the size of the exported memory (sum of all cookie sizes) */
	export_size = 0;
	for (i = 0; i < ccount; i++)
		export_size += cookies[i].size;

	/* check to see if offset is valid */
	if (off > export_size) {
		DWARN(ldcp->id,
		    "ldc_mem_copy: (0x%llx) start offset > export mem size\n",
		    ldcp->id);
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/*
	 * Check to see if the export size is smaller than the size we
	 * are requesting to copy - if so flag an error
	 */
	if ((export_size - off) < *size) {
		DWARN(ldcp->id,
		    "ldc_mem_copy: (0x%llx) copy size > export mem size\n",
		    ldcp->id);
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/* remaining byte count the copy loop below must transfer */
	total_bal = min(export_size, *size);

	/* FUTURE: get the page size, pgsz code, and shift */
	pg_size = MMU_PAGESIZE;
	pg_size_code = page_szc(pg_size);
	pg_shift = page_get_shift(pg_size_code);

	D1(ldcp->id, "ldc_mem_copy: copying data "
	    "(0x%llx) va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n",
	    ldcp->id, vaddr, pg_size, pg_size_code, pg_shift);

	/* aligned VA and its offset */
	local_valign = (((uintptr_t)vaddr) & ~(pg_size - 1));
	local_voff = ((uintptr_t)vaddr) & (pg_size - 1);

	/* number of pages spanned, rounding up any partial trailing page */
	npages = (len+local_voff)/pg_size;
	npages = ((len+local_voff)%pg_size == 0) ?
	    npages : npages+1;

	D1(ldcp->id,
	    "ldc_mem_copy: (0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n",
	    ldcp->id, vaddr, local_valign, local_voff, npages);

	local_ra = va_to_pa((void *)local_valign);
	local_poff = local_voff;
	local_psize = min(len, (pg_size - local_voff));

	len -= local_psize;

	/*
	 * find the first cookie in the list of cookies
	 * if the offset passed in is not zero
	 */
	for (idx = 0; idx < ccount; idx++) {
		cookie_size = cookies[idx].size;
		if (off < cookie_size)
			break;
		off -= cookie_size;
	}

	cookie_addr = cookies[idx].addr + off;
	cookie_size = cookies[idx].size - off;

	export_caddr = cookie_addr & ~(pg_size - 1);
	export_poff = cookie_addr & (pg_size - 1);
	export_psize = min(cookie_size, (pg_size - export_poff));

	/* copy page-by-page until total_bal bytes have been transferred */
	for (;;) {

		copy_size = min(export_psize, local_psize);

		D1(ldcp->id,
		    "ldc_mem_copy:(0x%llx) dir=0x%x, caddr=0x%llx,"
		    " loc_ra=0x%llx, exp_poff=0x%llx, loc_poff=0x%llx,"
		    " exp_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
		    " total_bal=0x%llx\n",
		    ldcp->id, direction, export_caddr, local_ra, export_poff,
		    local_poff, export_psize, local_psize, copy_size,
		    total_bal);

		rv = hv_ldc_copy(ldcp->id, direction,
		    (export_caddr + export_poff), (local_ra + local_poff),
		    copy_size, &copied_len);

		if (rv != 0) {
			cmn_err(CE_WARN,
			    "ldc_mem_copy: (0x%lx) err %d during copy\n",
			    ldcp->id, rv);
			DWARN(DBG_ALL_LDCS,
			    "ldc_mem_copy: (0x%llx) dir=0x%x, caddr=0x%llx, "
			    "loc_ra=0x%llx, exp_poff=0x%llx, loc_poff=0x%llx,"
			    " exp_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
			    " copied_len=0x%llx, total_bal=0x%llx\n",
			    ldcp->id, direction, export_caddr, local_ra,
			    export_poff, local_poff, export_psize, local_psize,
			    copy_size, copied_len, total_bal);

			/* report back how much was actually copied */
			*size = *size - total_bal;
			mutex_exit(&ldcp->lock);
			return (EIO);
		}

		ASSERT(copied_len <= copy_size);

		D2(ldcp->id, "ldc_mem_copy: copied=0x%llx\n", copied_len);
		export_poff += copied_len;
		local_poff += copied_len;
		export_psize -= copied_len;
		local_psize -= copied_len;
		cookie_size -= copied_len;

		total_bal -= copied_len;

		/* the HV may copy less than asked; retry the remainder */
		if (copy_size != copied_len)
			continue;

		/* exhausted the current export page - advance */
		if (export_psize == 0 && total_bal != 0) {

			if (cookie_size == 0) {
				/* current cookie consumed, move to the next */
				idx++;
				cookie_addr = cookies[idx].addr;
				cookie_size = cookies[idx].size;

				export_caddr = cookie_addr & ~(pg_size - 1);
				export_poff = cookie_addr & (pg_size - 1);
				export_psize =
				    min(cookie_size, (pg_size-export_poff));
			} else {
				export_caddr += pg_size;
				export_poff = 0;
				export_psize = min(cookie_size, pg_size);
			}
		}

		/* exhausted the current local page - advance */
		if (local_psize == 0 && total_bal != 0) {
			local_valign += pg_size;
			local_ra = va_to_pa((void *)local_valign);
			local_poff = 0;
			local_psize = min(pg_size, len);
			len -= local_psize;
		}

		/* check if we are all done */
		if (total_bal == 0)
			break;
	}

	mutex_exit(&ldcp->lock);

	D1(ldcp->id,
	    "ldc_mem_copy: (0x%llx) done copying sz=0x%llx\n",
	    ldcp->id, *size);

	return (0);
}

/*
 * Copy data either from or to the client specified virtual address
 * space to or from HV physical memory.
 *
 * The direction argument determines whether the data is read from or
 * written to HV memory.
 direction values are LDC_COPY_IN/OUT similar
 * to the ldc_mem_copy interface
 */
int
ldc_mem_rdwr_pa(ldc_handle_t handle, caddr_t vaddr, size_t *size,
    caddr_t paddr, uint8_t direction)
{
	ldc_chan_t	*ldcp;
	uint64_t	local_voff, local_valign;
	uint64_t	pg_shift, pg_size, pg_size_code;
	uint64_t	target_pa, target_poff, target_psize, target_size;
	uint64_t	local_ra, local_poff, local_psize;
	uint64_t	copy_size, copied_len = 0;
	pgcnt_t		npages;
	size_t		len = *size;
	int		rv = 0;

	if (handle == NULL) {
		DWARN(DBG_ALL_LDCS,
		    "ldc_mem_rdwr_pa: invalid channel handle\n");
		return (EINVAL);
	}
	ldcp = (ldc_chan_t *)handle;

	mutex_enter(&ldcp->lock);

	/* check to see if channel is UP */
	if (ldcp->tstate != TS_UP) {
		DWARN(ldcp->id,
		    "ldc_mem_rdwr_pa: (0x%llx) channel is not UP\n",
		    ldcp->id);
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/* Force address and size to be 8-byte aligned */
	if ((((uintptr_t)vaddr | len) & 0x7) != 0) {
		DWARN(ldcp->id,
		    "ldc_mem_rdwr_pa: addr/size is not 8-byte aligned\n");
		mutex_exit(&ldcp->lock);
		return (EINVAL);
	}

	/* remaining byte count; decremented as the loop below copies */
	target_size = *size;

	/* FUTURE: get the page size, pgsz code, and shift */
	pg_size = MMU_PAGESIZE;
	pg_size_code = page_szc(pg_size);
	pg_shift = page_get_shift(pg_size_code);

	D1(ldcp->id, "ldc_mem_rdwr_pa: copying data "
	    "(0x%llx) va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n",
	    ldcp->id, vaddr, pg_size, pg_size_code, pg_shift);

	/* aligned VA and its offset */
	local_valign = ((uintptr_t)vaddr) & ~(pg_size - 1);
	local_voff = ((uintptr_t)vaddr) & (pg_size - 1);

	/* number of pages spanned, rounding up any partial trailing page */
	npages = (len + local_voff) / pg_size;
	npages = ((len + local_voff) % pg_size == 0) ?
	    npages : npages+1;

	D1(ldcp->id,
	    "ldc_mem_rdwr_pa: (0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n",
	    ldcp->id, vaddr, local_valign, local_voff, npages);

	local_ra = va_to_pa((void *)local_valign);
	local_poff = local_voff;
	local_psize = min(len, (pg_size - local_voff));

	len -= local_psize;

	target_pa = ((uintptr_t)paddr) & ~(pg_size - 1);
	target_poff = ((uintptr_t)paddr) & (pg_size - 1);
	target_psize = pg_size - target_poff;

	/* copy page-by-page until target_size bytes have been transferred */
	for (;;) {

		copy_size = min(target_psize, local_psize);

		D1(ldcp->id,
		    "ldc_mem_rdwr_pa: (0x%llx) dir=0x%x, tar_pa=0x%llx,"
		    " loc_ra=0x%llx, tar_poff=0x%llx, loc_poff=0x%llx,"
		    " tar_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
		    " total_bal=0x%llx\n",
		    ldcp->id, direction, target_pa, local_ra, target_poff,
		    local_poff, target_psize, local_psize, copy_size,
		    target_size);

		rv = hv_ldc_copy(ldcp->id, direction,
		    (target_pa + target_poff), (local_ra + local_poff),
		    copy_size, &copied_len);

		if (rv != 0) {
			cmn_err(CE_WARN,
			    "ldc_mem_rdwr_pa: (0x%lx) err %d during copy\n",
			    ldcp->id, rv);
			/*
			 * NOTE(review): "dir=%lld" does not match the
			 * uint8_t 'direction' argument - confirm/repair
			 * the format specifier.
			 */
			DWARN(DBG_ALL_LDCS,
			    "ldc_mem_rdwr_pa: (0x%llx) dir=%lld,tar_pa=0x%llx, "
			    "loc_ra=0x%llx, tar_poff=0x%llx, loc_poff=0x%llx,"
			    " tar_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
			    " total_bal=0x%llx\n",
			    ldcp->id, direction, target_pa, local_ra,
			    target_poff, local_poff, target_psize, local_psize,
			    copy_size, target_size);

			/* report back how much was actually copied */
			*size = *size - target_size;
			mutex_exit(&ldcp->lock);
			return (i_ldc_h2v_error(rv));
		}

		D2(ldcp->id, "ldc_mem_rdwr_pa: copied=0x%llx\n", copied_len);
		target_poff += copied_len;
		local_poff += copied_len;
		target_psize -= copied_len;
		local_psize -= copied_len;

		target_size -= copied_len;

		/* the HV may copy less than asked; retry the remainder */
		if (copy_size != copied_len)
			continue;

		if (target_psize == 0 && target_size != 0) {
			target_pa += pg_size;
			target_poff = 0;
			target_psize = min(pg_size, target_size);
		}

		if (local_psize == 0 && target_size != 0) {
			local_valign += pg_size;
			local_ra =
va_to_pa((void *)local_valign); + local_poff = 0; + local_psize = min(pg_size, len); + len -= local_psize; + } + + /* check if we are all done */ + if (target_size == 0) + break; + } + + mutex_exit(&ldcp->lock); + + D1(ldcp->id, "ldc_mem_rdwr_pa: (0x%llx) done copying sz=0x%llx\n", + ldcp->id, *size); + + return (0); +} + +/* + * Map an exported memory segment into the local address space. If the + * memory range was exported for direct map access, a HV call is made + * to allocate a RA range. If the map is done via a shadow copy, local + * shadow memory is allocated and the base VA is returned in 'vaddr'. If + * the mapping is a direct map then the RA is returned in 'raddr'. + */ +int +ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie, uint32_t ccount, + uint8_t mtype, caddr_t *vaddr, caddr_t *raddr) +{ + int i, idx; + ldc_chan_t *ldcp; + ldc_mhdl_t *mhdl; + ldc_memseg_t *memseg; + caddr_t shadow_base = NULL, tmpaddr; + uint64_t pg_size, pg_shift, pg_size_code; + uint64_t exp_size = 0, npages; + + if (mhandle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_mem_map: invalid memory handle\n"); + return (EINVAL); + } + mhdl = (ldc_mhdl_t *)mhandle; + + mutex_enter(&mhdl->lock); + + if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED || + mhdl->memseg != NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_map: (0x%llx) handle bound/mapped\n", mhandle); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + ldcp = mhdl->ldcp; + + mutex_enter(&ldcp->lock); + + if (ldcp->tstate != TS_UP) { + DWARN(ldcp->id, + "ldc_mem_dring_map: (0x%llx) channel is not UP\n", + ldcp->id); + mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + if ((mtype & (LDC_SHADOW_MAP|LDC_DIRECT_MAP|LDC_IO_MAP)) == 0) { + DWARN(ldcp->id, "ldc_mem_map: invalid map type\n"); + mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + if (mtype == LDC_SHADOW_MAP && vaddr == NULL) { + DWARN(ldcp->id, + "ldc_mem_map: invalid vaddr arg0x%llx\n", vaddr); + 
mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + if (mtype == LDC_SHADOW_MAP && + (vaddr) && ((uintptr_t)(*vaddr) & MMU_PAGEOFFSET)) { + DWARN(ldcp->id, + "ldc_mem_map: vaddr not page aligned, 0x%llx\n", *vaddr); + mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + D1(ldcp->id, "ldc_mem_map: (0x%llx) cookie = 0x%llx,0x%llx\n", + mhandle, cookie->addr, cookie->size); + + /* FUTURE: get the page size, pgsz code, and shift */ + pg_size = MMU_PAGESIZE; + pg_size_code = page_szc(pg_size); + pg_shift = page_get_shift(pg_size_code); + + /* calculate the number of pages in the exported cookie */ + for (idx = 0; idx < ccount; idx++) { + if (cookie[idx].addr & MMU_PAGEOFFSET || + cookie[idx].size & MMU_PAGEOFFSET) { + DWARN(ldcp->id, + "ldc_mem_map: cookie addr/size not page aligned, " + "0x%llx\n", cookie[idx].addr); + mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + exp_size += cookie[idx].size; + } + npages = (exp_size >> pg_shift); + + /* Allocate memseg structure */ + memseg = mhdl->memseg = kmem_zalloc(sizeof (ldc_memseg_t), KM_SLEEP); + + /* Allocate memory to store all pages and cookies */ + memseg->pages = kmem_zalloc((sizeof (ldc_page_t) * npages), KM_SLEEP); + memseg->cookies = + kmem_zalloc((sizeof (ldc_mem_cookie_t) * ccount), KM_SLEEP); + + D2(ldcp->id, "ldc_mem_map: (0x%llx) processing 0x%llx pages\n", + ldcp->id, npages); + + /* Check to see if the client is requesting direct or shadow map */ + if (mtype == LDC_SHADOW_MAP) { + if (*vaddr == NULL) { + shadow_base = + contig_mem_alloc_align(exp_size, PAGESIZE); + if (shadow_base == NULL) { + cmn_err(CE_WARN, "ldc_mem_map: shadow memory " + "allocation failed\n"); + kmem_free(memseg->cookies, + (sizeof (ldc_mem_cookie_t) * ccount)); + kmem_free(memseg->pages, + (sizeof (ldc_page_t) * npages)); + kmem_free(memseg, sizeof (ldc_memseg_t)); + mutex_exit(&ldcp->lock); + mutex_exit(&mhdl->lock); + return (ENOMEM); + } + + 
			bzero(shadow_base, exp_size);
			mhdl->myshadow = B_TRUE;

			D1(ldcp->id, "ldc_mem_map: (0x%llx) allocated "
			    "shadow page va=0x%llx\n", ldcp->id, shadow_base);
		} else {
			/*
			 * Use client supplied memory for shadow_base
			 * WARNING: assuming that client mem is >= exp_size
			 */
			shadow_base = *vaddr;
		}
	} else if (mtype == LDC_DIRECT_MAP) {
		/* FUTURE: Do a direct map by calling into HV */
		_NOTE(EMPTY)
	}

	/*
	 * Save all page and cookie information.
	 * NOTE(review): for LDC_DIRECT_MAP, shadow_base is still NULL here,
	 * so the per-page raddr values below are derived from a NULL base
	 * until the direct-map path is implemented - confirm intent.
	 */
	for (i = 0, tmpaddr = shadow_base; i < npages; i++) {
		memseg->pages[i].raddr = va_to_pa(tmpaddr);
		memseg->pages[i].size = pg_size;
		memseg->pages[i].index = 0;
		memseg->pages[i].offset = 0;
		memseg->pages[i].mte = NULL;
		tmpaddr += pg_size;
	}
	for (i = 0; i < ccount; i++) {
		memseg->cookies[i].addr = cookie[i].addr;
		memseg->cookies[i].size = cookie[i].size;
	}

	/* update memseg_t */
	memseg->vaddr = shadow_base;
	memseg->raddr = memseg->pages[0].raddr;
	memseg->size = exp_size;
	memseg->npages = npages;
	memseg->ncookies = ccount;
	memseg->next_cookie = 0;

	/* memory handle = mapped */
	mhdl->mtype = mtype;
	mhdl->perm = 0;
	mhdl->status = LDC_MAPPED;

	D1(ldcp->id, "ldc_mem_map: (0x%llx) mapped 0x%llx, ra=0x%llx, "
	    "va=0x%llx, pgs=0x%llx cookies=0x%llx\n",
	    ldcp->id, mhdl, memseg->raddr, memseg->vaddr,
	    memseg->npages, memseg->ncookies);

	/* hand back the RA and/or VA if the caller asked for them */
	if (raddr)
		*raddr = (caddr_t)memseg->raddr;
	if (vaddr)
		*vaddr = memseg->vaddr;

	mutex_exit(&ldcp->lock);
	mutex_exit(&mhdl->lock);
	return (0);
}

/*
 * Unmap a memory segment. Free shadow memory (if any).
+ */ +int +ldc_mem_unmap(ldc_mem_handle_t mhandle) +{ + ldc_mhdl_t *mhdl = (ldc_mhdl_t *)mhandle; + ldc_chan_t *ldcp; + ldc_memseg_t *memseg; + + if (mhdl == 0 || mhdl->status != LDC_MAPPED) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_unmap: (0x%llx) handle is not mapped\n", + mhandle); + return (EINVAL); + } + + mutex_enter(&mhdl->lock); + + ldcp = mhdl->ldcp; + memseg = mhdl->memseg; + + D1(ldcp->id, "ldc_mem_unmap: (0x%llx) unmapping handle 0x%llx\n", + ldcp->id, mhdl); + + /* if we allocated shadow memory - free it */ + if (mhdl->mtype == LDC_SHADOW_MAP && mhdl->myshadow) { + contig_mem_free(memseg->vaddr, memseg->size); + } + + /* free the allocated memseg and page structures */ + kmem_free(memseg->pages, (sizeof (ldc_page_t) * memseg->npages)); + kmem_free(memseg->cookies, + (sizeof (ldc_mem_cookie_t) * memseg->ncookies)); + kmem_free(memseg, sizeof (ldc_memseg_t)); + + /* uninitialize the memory handle */ + mhdl->memseg = NULL; + mhdl->status = LDC_UNBOUND; + + D1(ldcp->id, "ldc_mem_unmap: (0x%llx) unmapped handle 0x%llx\n", + ldcp->id, mhdl); + + mutex_exit(&mhdl->lock); + return (0); +} + +/* + * Internal entry point for LDC mapped memory entry consistency + * semantics. Acquire copies the contents of the remote memory + * into the local shadow copy. The release operation copies the local + * contents into the remote memory. The offset and size specify the + * bounds for the memory range being synchronized. 
+ */ +static int +i_ldc_mem_acquire_release(ldc_mem_handle_t mhandle, uint8_t direction, + uint64_t offset, size_t size) +{ + int err; + ldc_mhdl_t *mhdl; + ldc_chan_t *ldcp; + ldc_memseg_t *memseg; + caddr_t local_vaddr; + size_t copy_size; + + if (mhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "i_ldc_mem_acquire_release: invalid memory handle\n"); + return (EINVAL); + } + mhdl = (ldc_mhdl_t *)mhandle; + + mutex_enter(&mhdl->lock); + + if (mhdl->status != LDC_MAPPED || mhdl->ldcp == NULL) { + DWARN(DBG_ALL_LDCS, + "i_ldc_mem_acquire_release: not mapped memory\n"); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + if (offset >= mhdl->memseg->size || + (offset + size) > mhdl->memseg->size) { + DWARN(DBG_ALL_LDCS, + "i_ldc_mem_acquire_release: memory out of range\n"); + mutex_exit(&mhdl->lock); + return (EINVAL); + } + + /* get the channel handle and memory segment */ + ldcp = mhdl->ldcp; + memseg = mhdl->memseg; + + if (mhdl->mtype == LDC_SHADOW_MAP) { + + local_vaddr = memseg->vaddr + offset; + copy_size = size; + + /* copy to/from remote from/to local memory */ + err = ldc_mem_copy((ldc_handle_t)ldcp, local_vaddr, offset, + ©_size, memseg->cookies, memseg->ncookies, + direction); + if (err || copy_size != size) { + cmn_err(CE_WARN, + "i_ldc_mem_acquire_release: copy failed\n"); + mutex_exit(&mhdl->lock); + return (err); + } + } + + mutex_exit(&mhdl->lock); + + return (0); +} + +/* + * Ensure that the contents in the remote memory seg are consistent + * with the contents if of local segment + */ +int +ldc_mem_acquire(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size) +{ + return (i_ldc_mem_acquire_release(mhandle, LDC_COPY_IN, offset, size)); +} + + +/* + * Ensure that the contents in the local memory seg are consistent + * with the contents if of remote segment + */ +int +ldc_mem_release(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size) +{ + return (i_ldc_mem_acquire_release(mhandle, LDC_COPY_OUT, offset, size)); +} + +/* + * Allocate a descriptor 
ring. The size of each each descriptor + * must be 8-byte aligned and the entire ring should be a multiple + * of MMU_PAGESIZE. + */ +int +ldc_mem_dring_create(uint32_t len, uint32_t dsize, ldc_dring_handle_t *dhandle) +{ + ldc_dring_t *dringp; + size_t size = (dsize * len); + + D1(DBG_ALL_LDCS, "ldc_mem_dring_create: len=0x%x, size=0x%x\n", + len, dsize); + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid dhandle\n"); + return (EINVAL); + } + + if (len == 0) { + DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid length\n"); + return (EINVAL); + } + + /* descriptor size should be 8-byte aligned */ + if (dsize == 0 || (dsize & 0x7)) { + DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid size\n"); + return (EINVAL); + } + + *dhandle = 0; + + /* Allocate a desc ring structure */ + dringp = kmem_zalloc(sizeof (ldc_dring_t), KM_SLEEP); + + /* Initialize dring */ + dringp->length = len; + dringp->dsize = dsize; + + /* round off to multiple of pagesize */ + dringp->size = (size & MMU_PAGEMASK); + if (size & MMU_PAGEOFFSET) + dringp->size += MMU_PAGESIZE; + + dringp->status = LDC_UNBOUND; + + /* allocate descriptor ring memory */ + dringp->base = contig_mem_alloc_align(dringp->size, PAGESIZE); + if (dringp->base == NULL) { + cmn_err(CE_WARN, + "ldc_mem_dring_create: unable to alloc desc\n"); + kmem_free(dringp, sizeof (ldc_dring_t)); + return (ENOMEM); + } + + bzero(dringp->base, dringp->size); + + /* initialize the desc ring lock */ + mutex_init(&dringp->lock, NULL, MUTEX_DRIVER, NULL); + + /* Add descriptor ring to the head of global list */ + mutex_enter(&ldcssp->lock); + dringp->next = ldcssp->dring_list; + ldcssp->dring_list = dringp; + mutex_exit(&ldcssp->lock); + + *dhandle = (ldc_dring_handle_t)dringp; + + D1(DBG_ALL_LDCS, "ldc_mem_dring_create: dring allocated\n"); + + return (0); +} + + +/* + * Destroy a descriptor ring. 
+ */ +int +ldc_mem_dring_destroy(ldc_dring_handle_t dhandle) +{ + ldc_dring_t *dringp; + ldc_dring_t *tmp_dringp; + + D1(DBG_ALL_LDCS, "ldc_mem_dring_destroy: entered\n"); + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_destroy: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + + if (dringp->status == LDC_BOUND) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_destroy: desc ring is bound\n"); + return (EACCES); + } + + mutex_enter(&dringp->lock); + mutex_enter(&ldcssp->lock); + + /* remove from linked list - if not bound */ + tmp_dringp = ldcssp->dring_list; + if (tmp_dringp == dringp) { + ldcssp->dring_list = dringp->next; + dringp->next = NULL; + + } else { + while (tmp_dringp != NULL) { + if (tmp_dringp->next == dringp) { + tmp_dringp->next = dringp->next; + dringp->next = NULL; + break; + } + tmp_dringp = tmp_dringp->next; + } + if (tmp_dringp == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_destroy: invalid descriptor\n"); + mutex_exit(&ldcssp->lock); + mutex_exit(&dringp->lock); + return (EINVAL); + } + } + + mutex_exit(&ldcssp->lock); + + /* free the descriptor ring */ + contig_mem_free((caddr_t)dringp->base, dringp->size); + + mutex_exit(&dringp->lock); + + /* destroy dring lock */ + mutex_destroy(&dringp->lock); + + /* free desc ring object */ + kmem_free(dringp, sizeof (ldc_dring_t)); + + return (0); +} + +/* + * Bind a previously allocated dring to a channel. The channel should + * be OPEN in order to bind the ring to the channel. Returns back a + * descriptor ring cookie. The descriptor ring is exported for remote + * access by the client at the other end of the channel. An entry for + * dring pages is stored in map table (via call to ldc_mem_bind_handle). 
+ */ +int +ldc_mem_dring_bind(ldc_handle_t handle, ldc_dring_handle_t dhandle, + uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount) +{ + int err; + ldc_chan_t *ldcp; + ldc_dring_t *dringp; + ldc_mem_handle_t mhandle; + + /* check to see if channel is initalized */ + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + + if (cookie == NULL) { + DWARN(ldcp->id, + "ldc_mem_dring_bind: invalid cookie arg\n"); + return (EINVAL); + } + + mutex_enter(&dringp->lock); + + if (dringp->status == LDC_BOUND) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: (0x%llx) descriptor ring is bound\n", + ldcp->id); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + if ((perm & LDC_MEM_RW) == 0) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: invalid permissions\n"); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + if ((mtype & (LDC_SHADOW_MAP|LDC_DIRECT_MAP|LDC_IO_MAP)) == 0) { + DWARN(DBG_ALL_LDCS, "ldc_mem_dring_bind: invalid type\n"); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + dringp->ldcp = ldcp; + + /* create an memory handle */ + err = ldc_mem_alloc_handle(handle, &mhandle); + if (err || mhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: (0x%llx) error allocating mhandle\n", + ldcp->id); + mutex_exit(&dringp->lock); + return (err); + } + dringp->mhdl = mhandle; + + /* bind the descriptor ring to channel */ + err = ldc_mem_bind_handle(mhandle, dringp->base, dringp->size, + mtype, perm, cookie, ccount); + if (err) { + DWARN(ldcp->id, + "ldc_mem_dring_bind: (0x%llx) error binding mhandle\n", + ldcp->id); + mutex_exit(&dringp->lock); + return (err); + } + + /* + * For now return error if we get more than one cookie + * FUTURE: Return multiple cookies .. 
+ */ + if (*ccount > 1) { + (void) ldc_mem_unbind_handle(mhandle); + (void) ldc_mem_free_handle(mhandle); + + dringp->ldcp = NULL; + dringp->mhdl = NULL; + *ccount = 0; + + mutex_exit(&dringp->lock); + return (EAGAIN); + } + + /* Add descriptor ring to channel's exported dring list */ + mutex_enter(&ldcp->exp_dlist_lock); + dringp->ch_next = ldcp->exp_dring_list; + ldcp->exp_dring_list = dringp; + mutex_exit(&ldcp->exp_dlist_lock); + + dringp->status = LDC_BOUND; + + mutex_exit(&dringp->lock); + + return (0); +} + +/* + * Return the next cookie associated with the specified dring handle + */ +int +ldc_mem_dring_nextcookie(ldc_dring_handle_t dhandle, ldc_mem_cookie_t *cookie) +{ + int rv = 0; + ldc_dring_t *dringp; + ldc_chan_t *ldcp; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_nextcookie: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + mutex_enter(&dringp->lock); + + if (dringp->status != LDC_BOUND) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_nextcookie: descriptor ring 0x%llx " + "is not bound\n", dringp); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + ldcp = dringp->ldcp; + + if (cookie == NULL) { + DWARN(ldcp->id, + "ldc_mem_dring_nextcookie:(0x%llx) invalid cookie arg\n", + ldcp->id); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + rv = ldc_mem_nextcookie((ldc_mem_handle_t)dringp->mhdl, cookie); + mutex_exit(&dringp->lock); + + return (rv); +} +/* + * Unbind a previously bound dring from a channel. 
+ */ +int +ldc_mem_dring_unbind(ldc_dring_handle_t dhandle) +{ + ldc_dring_t *dringp; + ldc_dring_t *tmp_dringp; + ldc_chan_t *ldcp; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_unbind: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + + mutex_enter(&dringp->lock); + + if (dringp->status == LDC_UNBOUND) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_bind: descriptor ring 0x%llx is unbound\n", + dringp); + mutex_exit(&dringp->lock); + return (EINVAL); + } + ldcp = dringp->ldcp; + + mutex_enter(&ldcp->exp_dlist_lock); + + tmp_dringp = ldcp->exp_dring_list; + if (tmp_dringp == dringp) { + ldcp->exp_dring_list = dringp->ch_next; + dringp->ch_next = NULL; + + } else { + while (tmp_dringp != NULL) { + if (tmp_dringp->ch_next == dringp) { + tmp_dringp->ch_next = dringp->ch_next; + dringp->ch_next = NULL; + break; + } + tmp_dringp = tmp_dringp->ch_next; + } + if (tmp_dringp == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_unbind: invalid descriptor\n"); + mutex_exit(&ldcp->exp_dlist_lock); + mutex_exit(&dringp->lock); + return (EINVAL); + } + } + + mutex_exit(&ldcp->exp_dlist_lock); + + (void) ldc_mem_unbind_handle((ldc_mem_handle_t)dringp->mhdl); + (void) ldc_mem_free_handle((ldc_mem_handle_t)dringp->mhdl); + + dringp->ldcp = NULL; + dringp->mhdl = NULL; + dringp->status = LDC_UNBOUND; + + mutex_exit(&dringp->lock); + + return (0); +} + +/* + * Get information about the dring. The base address of the descriptor + * ring along with the type and permission are returned back. 
+ */ +int +ldc_mem_dring_info(ldc_dring_handle_t dhandle, ldc_mem_info_t *minfo) +{ + ldc_dring_t *dringp; + int rv; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_info: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + + mutex_enter(&dringp->lock); + + if (dringp->mhdl) { + rv = ldc_mem_info(dringp->mhdl, minfo); + if (rv) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_info: error reading mem info\n"); + mutex_exit(&dringp->lock); + return (rv); + } + } else { + minfo->vaddr = dringp->base; + minfo->raddr = NULL; + minfo->status = dringp->status; + } + + mutex_exit(&dringp->lock); + + return (0); +} + +/* + * Map an exported descriptor ring into the local address space. If the + * descriptor ring was exported for direct map access, a HV call is made + * to allocate a RA range. If the map is done via a shadow copy, local + * shadow memory is allocated. + */ +int +ldc_mem_dring_map(ldc_handle_t handle, ldc_mem_cookie_t *cookie, + uint32_t ccount, uint32_t len, uint32_t dsize, uint8_t mtype, + ldc_dring_handle_t *dhandle) +{ + int err; + ldc_chan_t *ldcp = (ldc_chan_t *)handle; + ldc_mem_handle_t mhandle; + ldc_dring_t *dringp; + size_t dring_size; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_map: invalid dhandle\n"); + return (EINVAL); + } + + /* check to see if channel is initalized */ + if (handle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_map: invalid channel handle\n"); + return (EINVAL); + } + ldcp = (ldc_chan_t *)handle; + + if (cookie == NULL) { + DWARN(ldcp->id, + "ldc_mem_dring_map: (0x%llx) invalid cookie\n", + ldcp->id); + return (EINVAL); + } + + /* FUTURE: For now we support only one cookie per dring */ + ASSERT(ccount == 1); + + if (cookie->size < (dsize * len)) { + DWARN(ldcp->id, + "ldc_mem_dring_map: (0x%llx) invalid dsize/len\n", + ldcp->id); + return (EINVAL); + } + + *dhandle = 0; + + /* Allocate an dring structure */ + dringp = kmem_zalloc(sizeof (ldc_dring_t), 
KM_SLEEP); + + D1(ldcp->id, + "ldc_mem_dring_map: 0x%x,0x%x,0x%x,0x%llx,0x%llx\n", + mtype, len, dsize, cookie->addr, cookie->size); + + /* Initialize dring */ + dringp->length = len; + dringp->dsize = dsize; + + /* round of to multiple of page size */ + dring_size = len * dsize; + dringp->size = (dring_size & MMU_PAGEMASK); + if (dring_size & MMU_PAGEOFFSET) + dringp->size += MMU_PAGESIZE; + + dringp->ldcp = ldcp; + + /* create an memory handle */ + err = ldc_mem_alloc_handle(handle, &mhandle); + if (err || mhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_map: cannot alloc hdl err=%d\n", + err); + kmem_free(dringp, sizeof (ldc_dring_t)); + return (ENOMEM); + } + + dringp->mhdl = mhandle; + dringp->base = NULL; + + /* map the dring into local memory */ + err = ldc_mem_map(mhandle, cookie, ccount, mtype, + &(dringp->base), NULL); + if (err || dringp->base == NULL) { + cmn_err(CE_WARN, + "ldc_mem_dring_map: cannot map desc ring err=%d\n", err); + (void) ldc_mem_free_handle(mhandle); + kmem_free(dringp, sizeof (ldc_dring_t)); + return (ENOMEM); + } + + /* initialize the desc ring lock */ + mutex_init(&dringp->lock, NULL, MUTEX_DRIVER, NULL); + + /* Add descriptor ring to channel's imported dring list */ + mutex_enter(&ldcp->imp_dlist_lock); + dringp->ch_next = ldcp->imp_dring_list; + ldcp->imp_dring_list = dringp; + mutex_exit(&ldcp->imp_dlist_lock); + + dringp->status = LDC_MAPPED; + + *dhandle = (ldc_dring_handle_t)dringp; + + return (0); +} + +/* + * Unmap a descriptor ring. Free shadow memory (if any). 
+ */ +int +ldc_mem_dring_unmap(ldc_dring_handle_t dhandle) +{ + ldc_dring_t *dringp; + ldc_dring_t *tmp_dringp; + ldc_chan_t *ldcp; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_unmap: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + + if (dringp->status != LDC_MAPPED) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_unmap: not a mapped desc ring\n"); + return (EINVAL); + } + + mutex_enter(&dringp->lock); + + ldcp = dringp->ldcp; + + mutex_enter(&ldcp->imp_dlist_lock); + + /* find and unlink the desc ring from channel import list */ + tmp_dringp = ldcp->imp_dring_list; + if (tmp_dringp == dringp) { + ldcp->imp_dring_list = dringp->ch_next; + dringp->ch_next = NULL; + + } else { + while (tmp_dringp != NULL) { + if (tmp_dringp->ch_next == dringp) { + tmp_dringp->ch_next = dringp->ch_next; + dringp->ch_next = NULL; + break; + } + tmp_dringp = tmp_dringp->ch_next; + } + if (tmp_dringp == NULL) { + DWARN(DBG_ALL_LDCS, + "ldc_mem_dring_unmap: invalid descriptor\n"); + mutex_exit(&ldcp->imp_dlist_lock); + mutex_exit(&dringp->lock); + return (EINVAL); + } + } + + mutex_exit(&ldcp->imp_dlist_lock); + + /* do a LDC memory handle unmap and free */ + (void) ldc_mem_unmap(dringp->mhdl); + (void) ldc_mem_free_handle((ldc_mem_handle_t)dringp->mhdl); + + dringp->status = 0; + dringp->ldcp = NULL; + + mutex_exit(&dringp->lock); + + /* destroy dring lock */ + mutex_destroy(&dringp->lock); + + /* free desc ring object */ + kmem_free(dringp, sizeof (ldc_dring_t)); + + return (0); +} + +/* + * Internal entry point for descriptor ring access entry consistency + * semantics. Acquire copies the contents of the remote descriptor ring + * into the local shadow copy. The release operation copies the local + * contents into the remote dring. The start and end locations specify + * bounds for the entries being synchronized. 
+ */ +static int +i_ldc_dring_acquire_release(ldc_dring_handle_t dhandle, + uint8_t direction, uint64_t start, uint64_t end) +{ + int err; + ldc_dring_t *dringp; + ldc_chan_t *ldcp; + uint64_t soff; + size_t copy_size; + + if (dhandle == NULL) { + DWARN(DBG_ALL_LDCS, + "i_ldc_dring_acquire_release: invalid desc ring handle\n"); + return (EINVAL); + } + dringp = (ldc_dring_t *)dhandle; + mutex_enter(&dringp->lock); + + if (dringp->status != LDC_MAPPED || dringp->ldcp == NULL) { + DWARN(DBG_ALL_LDCS, + "i_ldc_dring_acquire_release: not a mapped desc ring\n"); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + if (start >= dringp->length || end >= dringp->length) { + DWARN(DBG_ALL_LDCS, + "i_ldc_dring_acquire_release: index out of range\n"); + mutex_exit(&dringp->lock); + return (EINVAL); + } + + /* get the channel handle */ + ldcp = dringp->ldcp; + + copy_size = (start <= end) ? (((end - start) + 1) * dringp->dsize) : + ((dringp->length - start) * dringp->dsize); + + /* Calculate the relative offset for the first desc */ + soff = (start * dringp->dsize); + + /* copy to/from remote from/to local memory */ + D1(ldcp->id, "i_ldc_dring_acquire_release: c1 off=0x%llx sz=0x%llx\n", + soff, copy_size); + err = i_ldc_mem_acquire_release((ldc_mem_handle_t)dringp->mhdl, + direction, soff, copy_size); + if (err) { + DWARN(ldcp->id, + "i_ldc_dring_acquire_release: copy failed\n"); + mutex_exit(&dringp->lock); + return (err); + } + + /* do the balance */ + if (start > end) { + copy_size = ((end + 1) * dringp->dsize); + soff = 0; + + /* copy to/from remote from/to local memory */ + D1(ldcp->id, "i_ldc_dring_acquire_release: c2 " + "off=0x%llx sz=0x%llx\n", soff, copy_size); + err = i_ldc_mem_acquire_release((ldc_mem_handle_t)dringp->mhdl, + direction, soff, copy_size); + if (err) { + DWARN(ldcp->id, + "i_ldc_dring_acquire_release: copy failed\n"); + mutex_exit(&dringp->lock); + return (err); + } + } + + mutex_exit(&dringp->lock); + + return (0); +} + +/* + * Ensure that the 
contents in the local dring are consistent + * with the contents if of remote dring + */ +int +ldc_mem_dring_acquire(ldc_dring_handle_t dhandle, uint64_t start, uint64_t end) +{ + return (i_ldc_dring_acquire_release(dhandle, LDC_COPY_IN, start, end)); +} + +/* + * Ensure that the contents in the remote dring are consistent + * with the contents if of local dring + */ +int +ldc_mem_dring_release(ldc_dring_handle_t dhandle, uint64_t start, uint64_t end) +{ + return (i_ldc_dring_acquire_release(dhandle, LDC_COPY_OUT, start, end)); +} + + +/* ------------------------------------------------------------------------- */ diff --git a/usr/src/uts/sun4v/io/mdeg.c b/usr/src/uts/sun4v/io/mdeg.c new file mode 100644 index 0000000000..879f8b9725 --- /dev/null +++ b/usr/src/uts/sun4v/io/mdeg.c @@ -0,0 +1,914 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * MD Event Generator (MDEG) Module + */ + +#include <sys/machsystm.h> +#include <sys/taskq.h> +#include <sys/disp.h> +#include <sys/cmn_err.h> +#include <sys/note.h> + +#include <sys/mdeg.h> +#include <sys/mach_descrip.h> +#include <sys/mdesc.h> + +/* + * A single client registration + */ +typedef struct mdeg_clnt { + boolean_t valid; /* structure is in active use */ + mdeg_node_match_t *nmatch; /* node match filter */ + mdeg_node_spec_t *pspec; /* parent match filter */ + mdeg_cb_t cb; /* the client callback */ + caddr_t cb_arg; /* argument to the callback */ + uint64_t magic; /* sanity checking magic */ + mdeg_handle_t hdl; /* handle assigned by MDEG */ +} mdeg_clnt_t; + +/* + * Global MDEG data + * + * Locking Strategy: + * + * mdeg.lock - lock used to sychronize system wide MD updates. An + * MD update must be treated as an atomic event. The lock is + * taken when notification that a new MD is available and held + * until all clients have been notified. + * + * mdeg.rwlock - lock used to sychronize access to the table of + * registered clients. The reader lock must be held when looking + * up client information in the table. The writer lock must be + * held when modifying any client information. 
+ */ +static struct mdeg { + taskq_t *taskq; /* for internal processing */ + boolean_t enabled; /* enable/disable taskq processing */ + kmutex_t lock; /* synchronize MD updates */ + md_t *md_prev; /* previous MD */ + md_t *md_curr; /* current MD */ + mdeg_clnt_t *tbl; /* table of registered clients */ + krwlock_t rwlock; /* client table lock */ + uint_t maxclnts; /* client table size */ + uint_t nclnts; /* current number of clients */ +} mdeg; + +/* + * Debugging routines + */ +#ifdef DEBUG +uint_t mdeg_debug = 0x0; + +static void mdeg_dump_clnt(mdeg_clnt_t *clnt); +static void mdeg_dump_table(void); + +#define MDEG_DBG if (mdeg_debug) printf +#define MDEG_DUMP_CLNT mdeg_dump_clnt +#define MDEG_DUMP_TABLE mdeg_dump_table + +#else /* DEBUG */ + +#define MDEG_DBG _NOTE(CONSTCOND) if (0) printf +#define MDEG_DUMP_CLNT +#define MDEG_DUMP_TABLE() + +#endif /* DEBUG */ + +/* + * Global constants + */ +#define MDEG_MAX_TASKQ_THR 512 /* maximum number of taskq threads */ +#define MDEG_MAX_CLNTS_INIT 64 /* initial client table size */ + +#define MDEG_MAGIC 0x4D4445475F48444Cull /* 'MDEG_HDL' */ + +/* + * A client handle is a 64 bit value with two pieces of + * information encoded in it. The upper 32 bits are the + * index into the table of a particular client structure. + * The lower 32 bits are a counter that is incremented + * each time a client structure is reused. + */ +#define MDEG_IDX_SHIFT 32 +#define MDEG_COUNT_MASK 0xfffffffful + +#define MDEG_ALLOC_HDL(_idx, _count) (((uint64_t)_idx << MDEG_IDX_SHIFT) | \ + ((uint64_t)(_count + 1) & \ + MDEG_COUNT_MASK)) +#define MDEG_HDL2IDX(hdl) (hdl >> MDEG_IDX_SHIFT) +#define MDEG_HDL2COUNT(hdl) (hdl & MDEG_COUNT_MASK) + +static const char trunc_str[] = " ... 
}"; + +/* + * Utility routines + */ +static mdeg_clnt_t *mdeg_alloc_clnt(void); +static void mdeg_notify_client(void *); +static mde_cookie_t mdeg_find_start_node(md_t *, mdeg_node_spec_t *); +static boolean_t mdeg_node_spec_match(md_t *, mde_cookie_t, mdeg_node_spec_t *); +static void mdeg_get_diff_results(md_diff_cookie_t, mdeg_result_t *); + +int +mdeg_init(void) +{ + int tblsz; + + /* + * Grab the current MD + */ + if ((mdeg.md_curr = md_get_handle()) == NULL) { + cmn_err(CE_WARN, "unable to cache snapshot of MD"); + return (-1); + } + + /* + * Initialize table of registered clients + */ + mdeg.maxclnts = MDEG_MAX_CLNTS_INIT; + + tblsz = mdeg.maxclnts * sizeof (mdeg_clnt_t); + mdeg.tbl = kmem_zalloc(tblsz, KM_SLEEP); + + rw_init(&mdeg.rwlock, NULL, RW_DRIVER, NULL); + + mdeg.nclnts = 0; + + /* + * Initialize global lock + */ + mutex_init(&mdeg.lock, NULL, MUTEX_DRIVER, NULL); + + /* + * Initialize the task queue + */ + mdeg.taskq = taskq_create("mdeg_taskq", 1, minclsyspri, 1, + MDEG_MAX_TASKQ_THR, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + + /* ready to begin handling clients */ + mdeg.enabled = B_TRUE; + + return (0); +} + +void +mdeg_fini(void) +{ + /* + * Flip the enabled switch off to make sure that + * no events get dispatched while things are being + * torn down. + */ + mdeg.enabled = B_FALSE; + + /* destroy the task queue */ + taskq_destroy(mdeg.taskq); + + /* + * Deallocate the table of registered clients + */ + kmem_free(mdeg.tbl, mdeg.maxclnts * sizeof (mdeg_clnt_t)); + rw_destroy(&mdeg.rwlock); + + /* + * Free up the cached MDs. 
+ */ + if (mdeg.md_curr) + (void) md_fini_handle(mdeg.md_curr); + + if (mdeg.md_prev) + (void) md_fini_handle(mdeg.md_prev); + + mutex_destroy(&mdeg.lock); +} + +static mdeg_clnt_t * +mdeg_alloc_clnt(void) +{ + mdeg_clnt_t *clnt; + int idx; + mdeg_clnt_t *newtbl; + uint_t newmaxclnts; + uint_t newtblsz; + uint_t oldtblsz; + + ASSERT(RW_WRITE_HELD(&mdeg.rwlock)); + + /* search for an unused slot in the table */ + for (idx = 0; idx < mdeg.maxclnts; idx++) { + clnt = &mdeg.tbl[idx]; + if (!clnt->valid) { + break; + } + } + + /* found any empty slot */ + if (idx != mdeg.maxclnts) { + goto found; + } + + /* + * There was no free space in the table. Grow + * the table to double its current size. + */ + + MDEG_DBG("client table full:\n"); + MDEG_DUMP_TABLE(); + + newmaxclnts = mdeg.maxclnts * 2; + newtblsz = newmaxclnts * sizeof (mdeg_clnt_t); + + newtbl = kmem_zalloc(newtblsz, KM_SLEEP); + + /* copy old table data to the new table */ + oldtblsz = mdeg.maxclnts * sizeof (mdeg_clnt_t); + bcopy(mdeg.tbl, newtbl, oldtblsz); + + /* + * Since the old table was full, the first free entry + * will be just past the end of the old table. 
+ */ + clnt = &mdeg.tbl[mdeg.maxclnts]; + + /* clean up the old table */ + kmem_free(mdeg.tbl, oldtblsz); + mdeg.tbl = newtbl; + mdeg.maxclnts = newmaxclnts; + +found: + ASSERT(clnt->valid == 0); + + clnt->hdl = MDEG_ALLOC_HDL(idx, MDEG_HDL2COUNT(clnt->hdl)); + + return (clnt); +} + +static mdeg_clnt_t * +mdeg_get_client(mdeg_handle_t hdl) +{ + int idx; + mdeg_clnt_t *clnt; + + idx = MDEG_HDL2IDX(hdl); + + /* check if index is out of bounds */ + if ((idx < 0) || (idx >= mdeg.maxclnts)) { + MDEG_DBG("mdeg_get_client: index out of bounds\n"); + return (NULL); + } + + clnt = &mdeg.tbl[idx]; + + /* check for a valid client */ + if (!clnt->valid) { + MDEG_DBG("mdeg_get_client: client is not valid\n"); + return (NULL); + } + + /* make sure the handle is an exact match */ + if (clnt->hdl != hdl) { + MDEG_DBG("mdeg_get_client: bad handle\n"); + return (NULL); + } + + if (clnt->magic != MDEG_MAGIC) { + MDEG_DBG("mdeg_get_client: bad magic\n"); + return (NULL); + } + + return (clnt); +} + +/* + * Send a notification to a client immediately after it registers. + * The result_t is a list of all the nodes that match their specified + * nodes of interest, all returned on the added list. This serves + * as a base of reference to the client. All future MD updates are + * relative to this list. + */ +static int +mdeg_notify_client_reg(mdeg_clnt_t *clnt) +{ + md_t *mdp = NULL; + mde_str_cookie_t nname; + mde_str_cookie_t aname; + mde_cookie_t startnode; + int nnodes; + int nodechk; + mde_cookie_t *listp = NULL; + mdeg_result_t *mdeg_res = NULL; + int rv = MDEG_SUCCESS; + + mutex_enter(&mdeg.lock); + + /* + * Handle the special case where the node specification + * is NULL. In this case, call the client callback without + * any results. All processing is left to the client. 
+ */ + if (clnt->pspec == NULL) { + /* call the client callback */ + (*clnt->cb)(clnt->cb_arg, NULL); + goto done; + } + + if ((mdp = md_get_handle()) == NULL) { + cmn_err(CE_WARN, "unable to retrieve current MD"); + rv = MDEG_FAILURE; + goto done; + } + + startnode = mdeg_find_start_node(mdp, clnt->pspec); + if (startnode == MDE_INVAL_ELEM_COOKIE) { + /* not much we can do */ + cmn_err(CE_WARN, "unable to match node specifier"); + rv = MDEG_FAILURE; + goto done; + } + + /* + * Use zalloc to provide correct default values for the + * unused removed, match_prev, and match_curr lists. + */ + mdeg_res = kmem_zalloc(sizeof (mdeg_result_t), KM_SLEEP); + + nname = md_find_name(mdp, clnt->nmatch->namep); + aname = md_find_name(mdp, "fwd"); + + nnodes = md_scan_dag(mdp, startnode, nname, aname, NULL); + + if (nnodes == 0) { + MDEG_DBG("mdeg_notify_client_reg: no nodes of interest\n"); + rv = MDEG_SUCCESS; + goto done; + } else if (nnodes == -1) { + MDEG_DBG("error scanning DAG\n"); + rv = MDEG_FAILURE; + goto done; + } + + MDEG_DBG("mdeg_notify_client_reg: %d node%s of interest\n", + nnodes, (nnodes == 1) ? "" : "s"); + + /* get the list of nodes of interest */ + listp = kmem_alloc(sizeof (mde_cookie_t) * nnodes, KM_SLEEP); + nodechk = md_scan_dag(mdp, startnode, nname, aname, listp); + + ASSERT(nodechk == nnodes); + + mdeg_res->added.mdp = mdp; + mdeg_res->added.mdep = listp; + mdeg_res->added.nelem = nnodes; + + /* call the client callback */ + (*clnt->cb)(clnt->cb_arg, mdeg_res); + +done: + mutex_exit(&mdeg.lock); + + if (mdp) + (void) md_fini_handle(mdp); + + if (listp) + kmem_free(listp, sizeof (mde_cookie_t) * nnodes); + + if (mdeg_res) + kmem_free(mdeg_res, sizeof (mdeg_result_t)); + + return (rv); +} + +/* + * Register to receive an event notification when the system + * machine description is updated. + * + * Passing NULL for the node specification parameter is valid + * as long as the match specification is also NULL. 
In this + * case, the client will receive a notification when the MD + * has been updated, but the callback will not include any + * information. The client is then responsible for obtaining + * its own copy of the system MD and performing any processing + * manually. + */ +int +mdeg_register(mdeg_node_spec_t *pspecp, mdeg_node_match_t *nmatchp, + mdeg_cb_t cb, void *cb_arg, mdeg_handle_t *hdlp) +{ + mdeg_clnt_t *clnt; + + /* + * If the RW lock is held, a client is calling + * register from its own callback. + */ + if (RW_LOCK_HELD(&mdeg.rwlock)) { + MDEG_DBG("mdeg_register: rwlock already held\n"); + return (MDEG_FAILURE); + } + + /* node spec and node match must both be valid, or both NULL */ + if (((pspecp != NULL) && (nmatchp == NULL)) || + ((pspecp == NULL) && (nmatchp != NULL))) { + MDEG_DBG("mdeg_register: invalid parameters\n"); + return (MDEG_FAILURE); + } + + rw_enter(&mdeg.rwlock, RW_WRITER); + + clnt = mdeg_alloc_clnt(); + + ASSERT(clnt); + + /* + * Fill in the rest of the data + */ + clnt->nmatch = nmatchp; + clnt->pspec = pspecp; + clnt->cb = cb; + clnt->cb_arg = cb_arg; + clnt->magic = MDEG_MAGIC; + + /* do this last */ + clnt->valid = B_TRUE; + + MDEG_DBG("client registered (0x%lx):\n", clnt->hdl); + MDEG_DUMP_CLNT(clnt); + + mdeg.nclnts++; + + if (mdeg_notify_client_reg(clnt) != MDEG_SUCCESS) { + bzero(clnt, sizeof (mdeg_clnt_t)); + rw_exit(&mdeg.rwlock); + return (MDEG_FAILURE); + } + + rw_exit(&mdeg.rwlock); + + *hdlp = clnt->hdl; + + return (MDEG_SUCCESS); +} + +int +mdeg_unregister(mdeg_handle_t hdl) +{ + mdeg_clnt_t *clnt; + mdeg_handle_t mdh; + + /* + * If the RW lock is held, a client is calling + * unregister from its own callback. 
+ */ + if (RW_LOCK_HELD(&mdeg.rwlock)) { + MDEG_DBG("mdeg_unregister: rwlock already held\n"); + return (MDEG_FAILURE); + } + + /* lookup the client */ + if ((clnt = mdeg_get_client(hdl)) == NULL) { + return (MDEG_FAILURE); + } + + rw_enter(&mdeg.rwlock, RW_WRITER); + + MDEG_DBG("client unregistered (0x%lx):\n", hdl); + MDEG_DUMP_CLNT(clnt); + + /* save the handle to prevent reuse */ + mdh = clnt->hdl; + bzero(clnt, sizeof (mdeg_clnt_t)); + + clnt->hdl = mdh; + + mdeg.nclnts--; + + rw_exit(&mdeg.rwlock); + + return (MDEG_SUCCESS); +} + +/* + * Simple algorithm for now, grab the global lock and let all + * the clients update themselves in parallel. There is a lot of + * room for improvement here. We could eliminate some scans of + * the DAG by imcrementally scanning at lower levels of the DAG + * rather than having each client start its own scan from the root. + */ +void +mdeg_notify_clients(void) +{ + md_t *md_new; + mdeg_clnt_t *clnt; + int idx; + int nclnt; + + rw_enter(&mdeg.rwlock, RW_READER); + mutex_enter(&mdeg.lock); + + /* + * Rotate the MDs + */ + if ((md_new = md_get_handle()) == NULL) { + cmn_err(CE_WARN, "unable to retrieve new MD"); + goto done; + } + + if (mdeg.md_prev) { + (void) md_fini_handle(mdeg.md_prev); + } + + mdeg.md_prev = mdeg.md_curr; + mdeg.md_curr = md_new; + + if (mdeg.nclnts == 0) { + MDEG_DBG("mdeg_notify_clients: no clients registered\n"); + goto done; + } + + /* dispatch the update notification to all clients */ + for (idx = 0, nclnt = 0; idx < mdeg.maxclnts; idx++) { + clnt = &mdeg.tbl[idx]; + + if (!clnt->valid) + continue; + + MDEG_DBG("notifying client 0x%lx (%d/%d)\n", clnt->hdl, + ++nclnt, mdeg.nclnts); + + (void) taskq_dispatch(mdeg.taskq, mdeg_notify_client, + (void *)clnt, TQ_SLEEP); + } + + taskq_wait(mdeg.taskq); + +done: + mutex_exit(&mdeg.lock); + rw_exit(&mdeg.rwlock); +} + +static void +mdeg_notify_client(void *arg) +{ + mdeg_clnt_t *clnt = (mdeg_clnt_t *)arg; + md_diff_cookie_t mdd = MD_INVAL_DIFF_COOKIE; + 
mdeg_result_t mdeg_res; + mde_cookie_t md_prev_start; + mde_cookie_t md_curr_start; + + rw_enter(&mdeg.rwlock, RW_READER); + + if (!mdeg.enabled) { + /* trying to shutdown */ + MDEG_DBG("mdeg_notify_client: mdeg disabled, aborting\n"); + goto cleanup; + } + + /* + * Handle the special case where the node specification + * is NULL. In this case, call the client callback without + * any results. All processing is left to the client. + */ + if (clnt->pspec == NULL) { + /* call the client callback */ + (*clnt->cb)(clnt->cb_arg, NULL); + + MDEG_DBG("MDEG client callback done\n"); + goto cleanup; + } + + /* find our start nodes */ + md_prev_start = mdeg_find_start_node(mdeg.md_prev, clnt->pspec); + if (md_prev_start == MDE_INVAL_ELEM_COOKIE) { + goto cleanup; + } + + md_curr_start = mdeg_find_start_node(mdeg.md_curr, clnt->pspec); + if (md_curr_start == MDE_INVAL_ELEM_COOKIE) { + goto cleanup; + } + + /* diff the MDs */ + mdd = md_diff_init(mdeg.md_prev, md_prev_start, mdeg.md_curr, + md_curr_start, clnt->nmatch->namep, clnt->nmatch->matchp); + + if (mdd == MD_INVAL_DIFF_COOKIE) { + MDEG_DBG("unable to diff MDs\n"); + goto cleanup; + } + + /* + * Cache the results of the diff + */ + mdeg_get_diff_results(mdd, &mdeg_res); + + /* call the client callback */ + (*clnt->cb)(clnt->cb_arg, &mdeg_res); + + MDEG_DBG("MDEG client callback done\n"); + +cleanup: + rw_exit(&mdeg.rwlock); + + if (mdd != MD_INVAL_DIFF_COOKIE) + (void) md_diff_fini(mdd); +} + +static mde_cookie_t +mdeg_find_start_node(md_t *md, mdeg_node_spec_t *nspec) +{ + mde_cookie_t *nodesp; + mde_str_cookie_t nname; + mde_str_cookie_t aname; + int nnodes; + int idx; + + if ((md == NULL) || (nspec == NULL)) + return (MDE_INVAL_ELEM_COOKIE); + + nname = md_find_name(md, nspec->namep); + aname = md_find_name(md, "fwd"); + + nnodes = md_scan_dag(md, NULL, nname, aname, NULL); + if (nnodes == 0) + return (MDE_INVAL_ELEM_COOKIE); + + nodesp = kmem_alloc(sizeof (mde_cookie_t) * nnodes, KM_SLEEP); + + (void) 
md_scan_dag(md, NULL, nname, aname, nodesp); + + for (idx = 0; idx < nnodes; idx++) { + + if (mdeg_node_spec_match(md, nodesp[idx], nspec)) { + mde_cookie_t res = nodesp[idx]; + + kmem_free(nodesp, sizeof (mde_cookie_t) * nnodes); + return (res); + } + } + + kmem_free(nodesp, sizeof (mde_cookie_t) * nnodes); + return (MDE_INVAL_ELEM_COOKIE); +} + +static boolean_t +mdeg_node_spec_match(md_t *md, mde_cookie_t node, mdeg_node_spec_t *nspec) +{ + mdeg_prop_spec_t *prop; + + ASSERT(md && nspec); + ASSERT(node != MDE_INVAL_ELEM_COOKIE); + + prop = nspec->specp; + + while (prop->type != MDET_LIST_END) { + + switch (prop->type) { + case MDET_PROP_VAL: { + uint64_t val; + + if (md_get_prop_val(md, node, prop->namep, &val) != 0) + return (B_FALSE); + + if (prop->ps_val != val) + return (B_FALSE); + + break; + } + case MDET_PROP_STR: { + char *str; + + if (md_get_prop_str(md, node, prop->namep, &str) != 0) + return (B_FALSE); + + if (strcmp(prop->ps_str, str) != 0) + return (B_FALSE); + + break; + } + + default: + return (B_FALSE); + } + + prop++; + } + + return (B_TRUE); +} + +static void +mdeg_get_diff_results(md_diff_cookie_t mdd, mdeg_result_t *res) +{ + /* + * Cache added nodes. + */ + res->added.mdp = mdeg.md_curr; + res->added.nelem = md_diff_added(mdd, &(res->added.mdep)); + + if (res->added.nelem == -1) { + bzero(&(res->added), sizeof (mdeg_diff_t)); + } + + /* + * Cache removed nodes. + */ + res->removed.mdp = mdeg.md_prev; + res->removed.nelem = md_diff_removed(mdd, &(res->removed.mdep)); + + if (res->removed.nelem == -1) { + bzero(&(res->removed), sizeof (mdeg_diff_t)); + } + + /* + * Cache matching node pairs. 
+ */ + res->match_curr.mdp = mdeg.md_curr; + res->match_prev.mdp = mdeg.md_prev; + res->match_curr.nelem = md_diff_matched(mdd, &(res->match_prev.mdep), + &(res->match_curr.mdep)); + res->match_prev.nelem = res->match_curr.nelem; + + if (res->match_prev.nelem == -1) { + bzero(&(res->match_prev), sizeof (mdeg_diff_t)); + bzero(&(res->match_curr), sizeof (mdeg_diff_t)); + } +} + +#ifdef DEBUG +/* + * Generate a string that represents the node specifier + * structure. Clamp the string length if the specifier + * structure contains too much information. + * + * General form: + * + * <nodename>:{<propname>=<propval>,...} + * e.g. + * vdevice:{name=vsw,reg=0x0} + */ +static void +mdeg_spec_str(mdeg_node_spec_t *spec, char *buf, int len) +{ + mdeg_prop_spec_t *prop; + int offset; + boolean_t first = B_TRUE; + char *end = buf + len; + + offset = snprintf(buf, len, "%s:{", spec->namep); + + buf += offset; + len -= offset; + if (len <= 0) + goto trunc; + + prop = spec->specp; + + while (prop->type != MDET_LIST_END) { + + switch (prop->type) { + case MDET_PROP_VAL: + offset = snprintf(buf, len, "%s%s=0x%lx", + (first) ? "" : ",", prop->namep, prop->ps_val); + buf += offset; + len -= offset; + if (len <= 0) + goto trunc; + break; + + case MDET_PROP_STR: + offset = snprintf(buf, len, "%s%s=%s", + (first) ? "" : ",", prop->namep, prop->ps_str); + buf += offset; + len -= offset; + if (len <= 0) + goto trunc; + break; + + default: + (void) snprintf(buf, len, "}"); + return; + } + + if (first) + first = B_FALSE; + prop++; + } + + (void) snprintf(buf, len, "}"); + return; + +trunc: + /* string too long, truncate it */ + buf = end - (strlen(trunc_str) + 1); + (void) sprintf(buf, trunc_str); +} + +/* + * Generate a string that represents the match structure. + * Clamp the string length if the match structure contains + * too much information. + * + * General form: + * + * <nodename>:{<propname>,...} + * e.g. 
+ * nmatch=vport:{reg} + */ +static void +mdeg_match_str(mdeg_node_match_t *match, char *buf, int len) +{ + md_prop_match_t *prop; + int offset; + boolean_t first = B_TRUE; + char *end = buf + len; + + offset = snprintf(buf, len, "%s:{", match->namep); + + buf += offset; + len -= offset; + if (len <= 0) + goto trunc; + + prop = match->matchp; + + while (prop->type != MDET_LIST_END) { + offset = snprintf(buf, len, "%s%s", (first) ? "" : ",", + prop->namep); + buf += offset; + len -= offset; + if (len <= 0) + goto trunc; + + if (first) + first = B_FALSE; + prop++; + } + + (void) snprintf(buf, len, "}"); + return; + +trunc: + /* string too long, truncate it */ + buf = end - (strlen(trunc_str) + 1); + (void) sprintf(buf, trunc_str); +} + +#define MAX_FIELD_STR 80 + +static void +mdeg_dump_clnt(mdeg_clnt_t *clnt) +{ + char str[MAX_FIELD_STR]; + + if (!clnt->valid) { + MDEG_DBG(" valid=B_FALSE\n"); + return; + } + + mdeg_spec_str(clnt->pspec, str, MAX_FIELD_STR); + MDEG_DBG(" pspecp=%s\n", str); + + mdeg_match_str(clnt->nmatch, str, MAX_FIELD_STR); + MDEG_DBG(" nmatch=%s\n", str); +} + +static void +mdeg_dump_table(void) +{ + int idx; + mdeg_clnt_t *clnt; + + for (idx = 0; idx < mdeg.maxclnts; idx++) { + clnt = &(mdeg.tbl[idx]); + + MDEG_DBG("client %d (0x%lx):\n", idx, clnt->hdl); + mdeg_dump_clnt(clnt); + } +} +#endif /* DEBUG */ diff --git a/usr/src/uts/sun4v/io/mdesc.c b/usr/src/uts/sun4v/io/mdesc.c index 84dc13fdc0..6aca5946fc 100644 --- a/usr/src/uts/sun4v/io/mdesc.c +++ b/usr/src/uts/sun4v/io/mdesc.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,23 +54,29 @@ * Operational state flags */ -#define MDESC_DIDMINOR 0x2 /* Created minors */ -#define MDESC_DIDMUTEX 0x8 /* Created mutex */ -#define MDESC_DIDCV 0x10 /* Created cv */ -#define MDESC_BUSY 0x20 /* Device is busy */ +#define MDESC_GOT_HANDLE 0x10 /* Got mdesc handle */ +#define MDESC_BUSY 0x20 /* Device is busy */ -static void *mdesc_state_head; +static void *mdesc_state_head; +static vmem_t *mdesc_minor; +static uint16_t mdesc_max_opens = 256; +static uint16_t mdesc_opens = 0; +static int mdesc_attached = 0; +static dev_info_t *mdesc_devi; +static kmutex_t mdesc_lock; struct mdesc_state { int instance; - dev_info_t *devi; + dev_t dev; kmutex_t lock; kcondvar_t cv; size_t mdesc_len; - uint8_t *mdesc; + md_t *mdesc; int flags; }; +typedef struct mdesc_state mdesc_state_t; + static int mdesc_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int mdesc_attach(dev_info_t *, ddi_attach_cmd_t); static int mdesc_detach(dev_info_t *, ddi_detach_cmd_t); @@ -129,19 +134,13 @@ static struct modlinkage modlinkage = { }; - - - - - - int _init(void) { int retval; if ((retval = ddi_soft_state_init(&mdesc_state_head, - sizeof (struct mdesc_state), 1)) != 0) + sizeof (struct mdesc_state), mdesc_max_opens)) != 0) return (retval); if ((retval = mod_install(&modlinkage)) != 0) { ddi_soft_state_fini(&mdesc_state_head); @@ -189,9 +188,10 @@ mdesc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) switch (cmd) { case DDI_INFO_DEVT2DEVINFO: - if ((mdsp = ddi_get_soft_state(mdesc_state_head, - getminor((dev_t)arg))) != NULL) { - *resultp = mdsp->devi; + mdsp = ddi_get_soft_state(mdesc_state_head, + getminor((dev_t)arg)); 
+ if (mdsp != NULL) { + *resultp = mdesc_devi; retval = DDI_SUCCESS; } else *resultp = NULL; @@ -212,47 +212,23 @@ static int mdesc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int instance = ddi_get_instance(dip); - struct mdesc_state *mdsp; switch (cmd) { case DDI_ATTACH: - if (ddi_soft_state_zalloc(mdesc_state_head, instance) != - DDI_SUCCESS) { - cmn_err(CE_WARN, "%s@%d: Unable to allocate state", - MDESC_NAME, instance); - return (DDI_FAILURE); - } - if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) == - NULL) { - cmn_err(CE_WARN, "%s@%d: Unable to obtain state", - MDESC_NAME, instance); - ddi_soft_state_free(dip, instance); - return (DDI_FAILURE); - } + if (ddi_create_minor_node(dip, MDESC_NAME, S_IFCHR, instance, DDI_PSEUDO, 0) != DDI_SUCCESS) { cmn_err(CE_WARN, "%s@%d: Unable to create minor node", MDESC_NAME, instance); - (void) mdesc_detach(dip, DDI_DETACH); return (DDI_FAILURE); } - mdsp->flags |= MDESC_DIDMINOR; - - mdsp->instance = instance; - mdsp->devi = dip; - - mutex_init(&mdsp->lock, NULL, MUTEX_DRIVER, NULL); - mdsp->flags |= MDESC_DIDMUTEX; - - cv_init(&mdsp->cv, NULL, CV_DRIVER, NULL); - mdsp->flags |= MDESC_DIDCV; - - /* point the driver at the kernel's copy of the data */ - mdsp->mdesc = (uint8_t *)machine_descrip.va; - mdsp->mdesc_len = (machine_descrip.va != NULL) ? 
- machine_descrip.size : 0; - ddi_report_dev(dip); + mdesc_devi = dip; + mdesc_minor = vmem_create("mdesc_minor", (void *) 1, + mdesc_max_opens, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + mutex_init(&mdesc_lock, NULL, MUTEX_DRIVER, NULL); + mdesc_attached = 1; return (DDI_SUCCESS); case DDI_RESUME: return (DDI_SUCCESS); @@ -261,27 +237,16 @@ mdesc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } - - +/*ARGSUSED*/ static int mdesc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - int instance = ddi_get_instance(dip); - struct mdesc_state *mdsp; - switch (cmd) { case DDI_DETACH: - mdsp = ddi_get_soft_state(mdesc_state_head, instance); - if (mdsp != NULL) { - ASSERT(!(mdsp->flags & MDESC_BUSY)); - if (mdsp->flags & MDESC_DIDCV) - cv_destroy(&mdsp->cv); - if (mdsp->flags & MDESC_DIDMUTEX) - mutex_destroy(&mdsp->lock); - if (mdsp->flags & MDESC_DIDMINOR) - ddi_remove_minor_node(dip, NULL); - } - ddi_soft_state_free(mdesc_state_head, instance); + mutex_destroy(&mdesc_lock); + vmem_destroy(mdesc_minor); + ddi_remove_minor_node(mdesc_devi, NULL); + mdesc_attached = 0; return (DDI_SUCCESS); case DDI_SUSPEND: @@ -292,28 +257,107 @@ mdesc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } } +static void +mdesc_destroy_state(mdesc_state_t *mdsp) +{ + minor_t minor = getminor(mdsp->dev); + + if (mdsp->flags & MDESC_GOT_HANDLE) + (void) md_fini_handle(mdsp->mdesc); + + cv_destroy(&mdsp->cv); + mutex_destroy(&mdsp->lock); + ddi_soft_state_free(mdesc_state_head, minor); + vmem_free(mdesc_minor, (void *)(uintptr_t)minor, 1); +} + +static mdesc_state_t * +mdesc_create_state(dev_t *devp) +{ + major_t major; + minor_t minor; + mdesc_state_t *mdsp; + + minor = (minor_t)(uintptr_t)vmem_alloc(mdesc_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(mdesc_state_head, minor) != + DDI_SUCCESS) { + cmn_err(CE_WARN, "%s@%d: Unable to allocate state", + MDESC_NAME, minor); + vmem_free(mdesc_minor, (void *)(uintptr_t)minor, 1); + return (NULL); + } + + mdsp = 
ddi_get_soft_state(mdesc_state_head, minor); + + if (devp != NULL) { + major = getemajor(*devp); + } else { + major = ddi_driver_major(mdesc_devi); + } + + mdsp->dev = makedevice(major, minor); + + if (devp != NULL) + *devp = mdsp->dev; + + mdsp->instance = minor; + + mutex_init(&mdsp->lock, NULL, MUTEX_DRIVER, NULL); + + cv_init(&mdsp->cv, NULL, CV_DRIVER, NULL); + + mdsp->mdesc = md_get_handle(); + + if (mdsp->mdesc == NULL) { + mdesc_destroy_state(mdsp); + return (NULL); + } + mdsp->flags |= MDESC_GOT_HANDLE; + + mdsp->mdesc_len = md_get_bin_size(mdsp->mdesc); + + if (mdsp->mdesc_len == 0) { + mdesc_destroy_state(mdsp); + mdsp = NULL; + } + + return (mdsp); +} /*ARGSUSED*/ static int mdesc_open(dev_t *devp, int flag, int otyp, cred_t *credp) { - int instance = getminor(*devp); struct mdesc_state *mdsp; - if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) == NULL) + if (otyp != OTYP_CHR) + return (EINVAL); + if (!mdesc_attached) return (ENXIO); - ASSERT(mdsp->instance == instance); + mutex_enter(&mdesc_lock); - if (otyp != OTYP_CHR) - return (EINVAL); + if (mdesc_opens >= mdesc_max_opens) { + mutex_exit(&mdesc_lock); + return (ENXIO); + } - return (0); -} + mdsp = mdesc_create_state(devp); + + if (mdsp == NULL) { + mutex_exit(&mdesc_lock); + return (ENXIO); + } + mdesc_opens++; + mutex_exit(&mdesc_lock); + return (0); +} /*ARGSUSED*/ static int @@ -322,13 +366,25 @@ mdesc_close(dev_t dev, int flag, int otyp, cred_t *credp) struct mdesc_state *mdsp; int instance = getminor(dev); + if (otyp != OTYP_CHR) + return (EINVAL); + + mutex_enter(&mdesc_lock); + if (mdesc_opens == 0) { + mutex_exit(&mdesc_lock); + return (0); + } + mutex_exit(&mdesc_lock); + if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) == NULL) return (ENXIO); ASSERT(mdsp->instance == instance); - if (otyp != OTYP_CHR) - return (EINVAL); + mdesc_destroy_state(mdsp); + mutex_enter(&mdesc_lock); + mdesc_opens--; + mutex_exit(&mdesc_lock); return (0); } @@ -363,6 +419,7 @@ 
mdesc_rw(dev_t dev, struct uio *uiop, enum uio_rw rw) int instance = getminor(dev); size_t len; int retval; + caddr_t buf; len = uiop->uio_resid; @@ -400,7 +457,11 @@ mdesc_rw(dev_t dev, struct uio *uiop, enum uio_rw rw) mdsp->flags |= MDESC_BUSY; mutex_exit(&mdsp->lock); - retval = uiomove((void *)(mdsp->mdesc + uiop->uio_offset), + buf = md_get_md_raw(mdsp->mdesc); + if (buf == NULL) + return (ENXIO); + + retval = uiomove((void *)(buf + uiop->uio_offset), len, rw, uiop); mutex_enter(&mdsp->lock); diff --git a/usr/src/uts/sun4v/io/platsvc.c b/usr/src/uts/sun4v/io/platsvc.c new file mode 100644 index 0000000000..5970a7252f --- /dev/null +++ b/usr/src/uts/sun4v/io/platsvc.c @@ -0,0 +1,371 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v Platform Services Module + */ + +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/machsystm.h> +#include <sys/note.h> +#include <sys/uadmin.h> +#include <sys/ds.h> +#include <sys/platsvc.h> + +/* + * Debugging routines + */ +#ifdef DEBUG +uint_t ps_debug = 0x0; +#define DBG if (ps_debug) printf +#else /* DEBUG */ +#define DBG _NOTE(CONSTCOND) if (0) printf +#endif /* DEBUG */ + +/* + * Time resolution conversions. + */ +#define MS2NANO(x) ((x) * MICROSEC) +#define MS2SEC(x) ((x) / MILLISEC) +#define MS2MIN(x) (MS2SEC(x) / 60) + +/* + * Domains Services interaction + */ +static ds_svc_hdl_t ds_md_handle; +static ds_svc_hdl_t ds_shutdown_handle; +static ds_svc_hdl_t ds_panic_handle; + +static ds_ver_t ps_vers[] = {{ 1, 0 }}; +#define PS_NVERS (sizeof (ps_vers) / sizeof (ps_vers[0])) + +static ds_capability_t ps_md_cap = { + "md-update", /* svc_id */ + ps_vers, /* vers */ + PS_NVERS /* nvers */ +}; + +static ds_capability_t ps_shutdown_cap = { + "domain-shutdown", /* svc_id */ + ps_vers, /* vers */ + PS_NVERS /* nvers */ +}; + +static ds_capability_t ps_panic_cap = { + "domain-panic", /* svc_id */ + ps_vers, /* vers */ + PS_NVERS /* nvers */ +}; + +static void ps_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl); +static void ps_unreg_handler(ds_cb_arg_t arg); + +static void ps_md_data_handler(ds_cb_arg_t arg, void * buf, size_t buflen); +static void ps_shutdown_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen); +static void ps_panic_data_handler(ds_cb_arg_t arg, void * buf, size_t buflen); + +static ds_clnt_ops_t ps_md_ops = { + ps_reg_handler, /* ds_reg_cb */ + ps_unreg_handler, /* ds_unreg_cb */ + ps_md_data_handler, /* ds_data_cb */ + &ds_md_handle /* cb_arg */ +}; + +static ds_clnt_ops_t ps_shutdown_ops = { + ps_reg_handler, /* ds_reg_cb */ + ps_unreg_handler, /* ds_unreg_cb */ + ps_shutdown_data_handler, /* ds_data_cb */ + &ds_shutdown_handle /* cb_arg */ +}; + +static 
ds_clnt_ops_t ps_panic_ops = { + ps_reg_handler, /* ds_reg_cb */ + ps_unreg_handler, /* ds_unreg_cb */ + ps_panic_data_handler, /* ds_data_cb */ + &ds_panic_handle /* cb_arg */ +}; + +static int ps_init(void); +static void ps_fini(void); + +/* + * Powerdown timeout value of 5 minutes. + */ +#define PLATSVC_POWERDOWN_DELAY 1200 + +static struct modlmisc modlmisc = { + &mod_miscops, + "sun4v Platform Services %I%" +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlmisc, + NULL +}; + +int +_init(void) +{ + int rv; + + if ((rv = ps_init()) != 0) + return (rv); + + if ((rv = mod_install(&modlinkage)) != 0) + ps_fini(); + + return (rv); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int platsvc_allow_unload; + +int +_fini(void) +{ + int status; + + if (platsvc_allow_unload == 0) + return (EBUSY); + + if ((status = mod_remove(&modlinkage)) == 0) + ps_fini(); + + return (status); +} + +static int +ps_init(void) +{ + int rv; + extern int mdeg_init(void); + + /* register with domain services framework */ + rv = ds_cap_init(&ps_md_cap, &ps_md_ops); + if (rv != 0) { + cmn_err(CE_WARN, "ds_cap_init md-update failed: %d", rv); + return (rv); + } + + rv = ds_cap_init(&ps_shutdown_cap, &ps_shutdown_ops); + if (rv != 0) { + cmn_err(CE_WARN, "ds_cap_init domain-shutdown failed: %d", rv); + (void) ds_cap_fini(&ps_md_cap); + return (rv); + } + + rv = ds_cap_init(&ps_panic_cap, &ps_panic_ops); + if (rv != 0) { + cmn_err(CE_WARN, "ds_cap_init domain-panic failed: %d", rv); + (void) ds_cap_fini(&ps_md_cap); + (void) ds_cap_fini(&ps_shutdown_cap); + return (rv); + } + + rv = mdeg_init(); + + return (rv); +} + +static void +ps_fini(void) +{ + extern void mdeg_fini(void); + + /* + * Stop incoming requests from Zeus + */ + (void) ds_cap_fini(&ps_md_cap); + (void) ds_cap_fini(&ps_shutdown_cap); + (void) ds_cap_fini(&ps_panic_cap); + + mdeg_fini(); +} + +static void +ps_md_data_handler(ds_cb_arg_t arg, void *buf, size_t 
buflen) +{ + extern int mach_descrip_update(void); + extern void mdeg_notify_clients(void); + + ds_svc_hdl_t ds_handle; + platsvc_md_update_req_t *msg = buf; + platsvc_md_update_resp_t resp_msg; + uint_t rv; + + if (arg == NULL) + return; + + ds_handle = ds_md_handle; + + if (msg == NULL || buflen != sizeof (platsvc_md_update_req_t)) { + resp_msg.req_num = 0; + resp_msg.result = MD_UPDATE_INVALID_MSG; + if ((rv = ds_cap_send(ds_handle, &resp_msg, + sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "md ds_cap_send failed (%d)", rv); + } + return; + } + + DBG("MD Reload...\n"); + if (mach_descrip_update()) { + cmn_err(CE_WARN, "MD reload failed\n"); + return; + } + + /* + * notify registered clients that MD has + * been updated + */ + mdeg_notify_clients(); + + resp_msg.req_num = msg->req_num; + resp_msg.result = MD_UPDATE_SUCCESS; + if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "md ds_cap_send resp failed (%d)", rv); + } +} + +static void +ps_shutdown_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + ds_svc_hdl_t ds_handle; + platsvc_shutdown_req_t *msg = buf; + platsvc_shutdown_resp_t resp_msg; + uint_t rv; + hrtime_t start; + + if (arg == NULL) + return; + + ds_handle = ds_shutdown_handle; + + if (msg == NULL || buflen != sizeof (platsvc_shutdown_req_t)) { + resp_msg.req_num = 0; + resp_msg.result = DOMAIN_SHUTDOWN_INVALID_MSG; + resp_msg.reason[0] = '\0'; + if ((rv = ds_cap_send(ds_handle, &resp_msg, + sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "shutdown ds_cap_send failed (%d)", + rv); + } + return; + } + + resp_msg.req_num = msg->req_num; + resp_msg.result = DOMAIN_SHUTDOWN_SUCCESS; + resp_msg.reason[0] = '\0'; + + if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "shutdown ds_cap_send resp failed (%d)", rv); + } + + /* + * Honor the ldoms manager's shutdown delay requirement. 
+ */ + cmn_err(CE_NOTE, "shutdown requested by ldom manager, " + "system shutdown in %d minutes", MS2MIN(msg->delay)); + + start = gethrtime(); + while (gethrtime() - start < MS2NANO(msg->delay)) + ; + + (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); +} + + +static void +ps_panic_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + ds_svc_hdl_t ds_handle; + platsvc_panic_req_t *msg = buf; + platsvc_panic_resp_t resp_msg; + uint_t rv; + + if (arg == NULL) + return; + + ds_handle = ds_panic_handle; + + if (msg == NULL || buflen != sizeof (platsvc_panic_req_t)) { + resp_msg.req_num = 0; + resp_msg.result = DOMAIN_PANIC_INVALID_MSG; + resp_msg.reason[0] = '\0'; + if ((rv = ds_cap_send(ds_handle, &resp_msg, + sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "panic ds_cap_send resp failed (%d)", + rv); + } + return; + } + + resp_msg.req_num = msg->req_num; + resp_msg.result = DOMAIN_PANIC_SUCCESS; + resp_msg.reason[0] = '\0'; + if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) { + cmn_err(CE_NOTE, "panic ds_cap_send resp failed (%d)", rv); + } + + cmn_err(CE_PANIC, "Panic forced by ldom manager"); + _NOTE(NOTREACHED) +} + +static void +ps_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl) +{ + DBG("ps_reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", + arg, ver->major, ver->minor, hdl); + + if ((ds_svc_hdl_t *)arg == &ds_md_handle) + ds_md_handle = hdl; + if ((ds_svc_hdl_t *)arg == &ds_shutdown_handle) + ds_shutdown_handle = hdl; + if ((ds_svc_hdl_t *)arg == &ds_panic_handle) + ds_panic_handle = hdl; +} + +static void +ps_unreg_handler(ds_cb_arg_t arg) +{ + DBG("ps_unreg_handler: arg=0x%p\n", arg); + + if ((ds_svc_hdl_t *)arg == &ds_md_handle) + ds_md_handle = DS_INVALID_HDL; + if ((ds_svc_hdl_t *)arg == &ds_shutdown_handle) + ds_shutdown_handle = DS_INVALID_HDL; + if ((ds_svc_hdl_t *)arg == &ds_panic_handle) + ds_panic_handle = DS_INVALID_HDL; +} diff --git a/usr/src/uts/sun4v/io/qcn.c b/usr/src/uts/sun4v/io/qcn.c index 
e68e1bde53..63b3c0b5fb 100644 --- a/usr/src/uts/sun4v/io/qcn.c +++ b/usr/src/uts/sun4v/io/qcn.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,7 +87,8 @@ static cyc_handler_t qcn_poll_cychandler = { }; static cyclic_id_t qcn_poll_cycid = CYCLIC_NONE; static uint64_t qcn_poll_interval = 5; /* milli sec */ -static uint64_t sb_interval = 0; +static uint64_t sb_interval = 0; +uint_t qcn_force_polling = 0; #endif #define QCN_MI_IDNUM 0xABCE @@ -338,7 +339,8 @@ qcn_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * the console to work on older firmware releases. 
*/ binding_name = ddi_binding_name(qcn_state->qcn_dip); - if (strcmp(binding_name, "qcn") == 0) + if ((strcmp(binding_name, "qcn") == 0) || + (qcn_force_polling)) qcn_state->qcn_polling = 1; if (qcn_state->qcn_polling) { @@ -802,7 +804,7 @@ qcn_transmit(queue_t *q, mblk_t *mp) buf = (caddr_t)bp->b_rptr; for (i = 0; i < len; i++) { - if (hv_cnputchar(buf[i]) == -1) + if (hv_cnputchar(buf[i]) == H_EWOULDBLOCK) break; } if (i != len) { diff --git a/usr/src/uts/sun4v/io/vcc.c b/usr/src/uts/sun4v/io/vcc.c new file mode 100644 index 0000000000..124db0d05b --- /dev/null +++ b/usr/src/uts/sun4v/io/vcc.c @@ -0,0 +1,2406 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */ +#include <sys/debug.h> +#include <sys/promif.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cyclic.h> +#include <sys/termio.h> +#include <sys/intr.h> +#include <sys/ivintr.h> +#include <sys/note.h> +#include <sys/stat.h> +#include <sys/fcntl.h> +#include <sys/sysmacros.h> + +#include <sys/ldc.h> +#include <sys/mdeg.h> +#include <sys/vcc_impl.h> + +/* + * Function prototypes. + */ + +/* DDI entrypoints */ +static int vcc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int vcc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int vcc_open(dev_t *devp, int flag, int otyp, cred_t *cred); +static int vcc_close(dev_t dev, int flag, int otyp, cred_t *cred); +static int vcc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp); +static int vcc_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int vcc_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int vcc_chpoll(dev_t dev, short events, int anyyet, + short *reventsp, struct pollhead **phpp); +static int vcc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, + void *arg, void **resultp); + +/* callback functions */ +static uint_t vcc_ldc_cb(uint64_t event, caddr_t arg); +static int vcc_mdeg_cb(void *cb_argp, mdeg_result_t *resp); + +/* Internal functions */ +static int i_vcc_ldc_init(vcc_t *vccp, vcc_port_t *vport); +static int i_vcc_add_port(vcc_t *vccp, char *group_name, uint64_t tcp_port, + uint_t portno, char *domain_name); +static int i_vcc_config_port(vcc_t *vccp, uint_t portno, uint64_t ldc_id); +static int i_vcc_reset_events(vcc_t *vccp); +static int i_vcc_cons_tbl(vcc_t *vccp, uint_t 
num_ports, + caddr_t buf, int mode); +static int i_vcc_del_cons_ok(vcc_t *vccp, caddr_t buf, int mode); +static int i_vcc_close_port(vcc_port_t *vport); +static int i_vcc_write_ldc(vcc_port_t *vport, vcc_msg_t *buf); + +static void *vcc_ssp; + +static struct cb_ops vcc_cb_ops = { + vcc_open, /* open */ + vcc_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + vcc_read, /* read */ + vcc_write, /* write */ + vcc_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + ddi_segmap, /* segmap */ + vcc_chpoll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* stream */ + D_NEW | D_MP /* flags */ +}; + + +static struct dev_ops vcc_ops = { + DEVO_REV, /* rev */ + 0, /* ref count */ + vcc_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + vcc_attach, /* attach */ + vcc_detach, /* detach */ + nodev, /* reset */ + &vcc_cb_ops, /* cb_ops */ + (struct bus_ops *)NULL /* bus_ops */ +}; + +extern struct mod_ops mod_driverops; + +#define VCC_CHANNEL_ENDPOINT "channel-endpoint" +#define VCC_ID_PROP "id" + +/* + * This is the string displayed by modinfo(1m). + */ +static char vcc_ident[] = "sun4v Virtual Console Concentrator Driver v%I%"; + +static struct modldrv md = { + &mod_driverops, /* Type - it is a driver */ + vcc_ident, /* Name of the module */ + &vcc_ops, /* driver specfic opts */ +}; + +static struct modlinkage ml = { + MODREV_1, + &md, + NULL +}; + +/* + * Matching criteria passed to the MDEG to register interest + * in changes to 'virtual-device-port' nodes identified by their + * 'id' property. + */ +static md_prop_match_t vcc_port_prop_match[] = { + { MDET_PROP_VAL, "id" }, + { MDET_LIST_END, NULL } +}; + +static mdeg_node_match_t vcc_port_match = {"virtual-device-port", + vcc_port_prop_match}; + +/* + * Specification of an MD node passed to the MDEG to filter any + * 'virtual-device-port' nodes that do not belong to the specified node. 
+ * This template is copied for each vldc instance and filled in with + * the appropriate 'cfg-handle' value before being passed to the MDEG. + */ +static mdeg_prop_spec_t vcc_prop_template[] = { + { MDET_PROP_STR, "name", "virtual-console-concentrator" }, + { MDET_PROP_VAL, "cfg-handle", NULL }, + { MDET_LIST_END, NULL, NULL } +}; + +#define VCC_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); + + +#ifdef DEBUG + +/* + * Print debug messages + * + * set vldcdbg to 0xf to enable all messages + * + * 0x8 - Errors + * 0x4 - Warnings + * 0x2 - All debug messages (most verbose) + * 0x1 - Minimal debug messages + */ + +int vccdbg = 0x8; + +static void +vccdebug(const char *fmt, ...) +{ + char buf[512]; + va_list ap; + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + cmn_err(CE_CONT, "%s\n", buf); +} + +#define D1 \ +if (vccdbg & 0x01) \ + vccdebug + +#define D2 \ +if (vccdbg & 0x02) \ + vccdebug + +#define DWARN \ +if (vccdbg & 0x04) \ + vccdebug + +#else + +#define D1 +#define D2 +#define DWARN + +#endif + +/* _init(9E): initialize the loadable module */ +int +_init(void) +{ + int error; + + /* init the soft state structure */ + error = ddi_soft_state_init(&vcc_ssp, sizeof (vcc_t), 1); + if (error != 0) { + return (error); + } + + /* Link the driver into the system */ + error = mod_install(&ml); + + return (error); + +} + +/* _info(9E): return information about the loadable module */ +int +_info(struct modinfo *modinfop) +{ + /* Report status of the dynamically loadable driver module */ + return (mod_info(&ml, modinfop)); +} + +/* _fini(9E): prepare the module for unloading. */ +int +_fini(void) +{ + int error; + + /* Unlink the driver module from the system */ + if ((error = mod_remove(&ml)) == 0) { + /* + * We have successfully "removed" the driver. 
+ * destroy soft state + */ + ddi_soft_state_fini(&vcc_ssp); + } + + return (error); +} + +/* getinfo(9E) */ +static int +vcc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + _NOTE(ARGUNUSED(dip)) + + int instance = VCCINST(getminor((dev_t)arg)); + vcc_t *vccp = NULL; + + switch (cmd) { + + case DDI_INFO_DEVT2DEVINFO: + if ((vccp = ddi_get_soft_state(vcc_ssp, instance)) == NULL) { + *resultp = NULL; + return (DDI_FAILURE); + } + *resultp = vccp->dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + + default: + *resultp = NULL; + return (DDI_FAILURE); + } +} + +/* + * There are two cases that need special blocking. One of them is to block + * a minor node without a port and another is to block application other + * than vntsd. + * + * A minor node can exist in the file system without associated with a port + * because when a port is deleted, ddi_remove_minor does not unlink it. + * Clients might try to open a minor node even after the corresponding port + * node has been removed. To identify and block these calls, + * we need to validate the association between a port and its minor node. + * + * An application other than vntsd can access a console port as long + * as vntsd is not using the port. A port opened by an application other + * than vntsd will be closed when vntsd wants to use the port. + * However, other application could use same file descriptor + * access vcc cb_ops. So we need to identify and block caller other + * than vntsd, when vntsd is using the port. 
+ */ +static int +i_vcc_can_use_port(vcc_minor_t *minorp, vcc_port_t *vport) +{ + if (vport->minorp != minorp) { + /* port config changed */ + return (ENXIO); + } + + if (vport->valid_pid == VCC_NO_PID_BLOCKING) { + /* no blocking needed */ + return (0); + } + + if (vport->valid_pid != ddi_get_pid()) { + return (EIO); + } + + return (0); +} + + +/* Syncronization between thread using cv_wait */ +static int +i_vcc_wait_port_status(vcc_port_t *vport, kcondvar_t *cv, uint32_t status) +{ + + int rv; + + ASSERT(mutex_owned(&vport->lock)); + + for (; ; ) { + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + /* port has been deleted */ + D1("i_vcc_wait_port_status: port%d deleted\n", + vport->number); + return (EIO); + } + + if ((vport->status & VCC_PORT_OPEN) == 0) { + D1("i_vcc_wait_port_status: port%d is closed \n", + vport->number); + return (EIO); + } + + if (vport->status & VCC_PORT_LDC_LINK_DOWN) { + return (EIO); + } + + if ((vport->valid_pid != VCC_NO_PID_BLOCKING) && + (vport->valid_pid != ddi_get_pid())) { + return (EIO); + } + + if ((vport->status & status) == status) { + return (0); + } + + if (!ddi_can_receive_sig()) { + return (EIO); + } + + rv = cv_wait_sig(cv, &vport->lock); + if (rv == 0) { + D1("i_vcc_wait_port_status: port%d get intr \n", + vport->number); + /* got signal */ + return (EINTR); + } + } + +} + +/* Syncronization between threads, signal state change */ +static void +i_vcc_set_port_status(vcc_port_t *vport, kcondvar_t *cv, uint32_t status) +{ + + mutex_enter(&vport->lock); + vport->status |= status; + cv_broadcast(cv); + mutex_exit(&vport->lock); +} + +/* initialize a ldc channel */ +static int +i_vcc_ldc_init(vcc_t *vccp, vcc_port_t *vport) +{ + ldc_attr_t attr; + int rv = EIO; + + ASSERT(mutex_owned(&vport->lock)); + ASSERT(vport->ldc_id != VCC_INVALID_CHANNEL); + + /* initialize the channel */ + attr.devclass = LDC_DEV_SERIAL; + attr.instance = ddi_get_instance(vccp->dip); + attr.qlen = VCC_QUEUE_LEN; + attr.mode = LDC_MODE_RAW; + + if 
((rv = ldc_init(vport->ldc_id, &attr, &(vport->ldc_handle))) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_init: port %d inv channel 0x%lx\n", + vport->number, vport->ldc_id); + vport->ldc_id = VCC_INVALID_CHANNEL; + return (rv); + } + + /* register it */ + if ((rv = ldc_reg_callback(vport->ldc_handle, vcc_ldc_cb, + (caddr_t)vport)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d ldc_register_cb" + "failed\n", vport->number); + (void) ldc_fini(vport->ldc_handle); + vport->ldc_id = VCC_INVALID_CHANNEL; + return (rv); + } + + /* open and bring channel up */ + if ((rv = ldc_open(vport->ldc_handle)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d inv channel 0x%lx\n", + vport->number, vport->ldc_id); + (void) ldc_unreg_callback(vport->ldc_handle); + (void) ldc_fini(vport->ldc_handle); + vport->ldc_id = VCC_INVALID_CHANNEL; + return (rv); + } + + /* init the channel status */ + if ((rv = ldc_status(vport->ldc_handle, &vport->ldc_status)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d ldc_status failed\n", + vport->number); + (void) ldc_close(vport->ldc_handle); + (void) ldc_unreg_callback(vport->ldc_handle); + (void) ldc_fini(vport->ldc_handle); + vport->ldc_id = VCC_INVALID_CHANNEL; + return (rv); + } + + return (0); +} + +/* release a ldc channel */ +static int +i_vcc_ldc_fini(vcc_port_t *vport) +{ + int rv = EIO; + vcc_msg_t buf; + + D1("i_vcc_ldc_fini: port@%lld, ldc_id%%llx\n", vport->number, + vport->ldc_id); + + ASSERT(mutex_owned(&vport->lock)); + + /* wait for write available */ + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_USE_WRITE_LDC); + if (rv) { + return (rv); + } + vport->status &= ~VCC_PORT_USE_WRITE_LDC; + /* send a HUP message */ + buf.type = LDC_CONSOLE_CTRL; + buf.ctrl_msg = LDC_CONSOLE_HUP; + buf.size = 0; + + /* in case of error, we still want to clean up ldc channel */ + (void) i_vcc_write_ldc(vport, &buf); + + mutex_exit(&vport->lock); + i_vcc_set_port_status(vport, &vport->write_cv, VCC_PORT_USE_WRITE_LDC); + 
mutex_enter(&vport->lock); + + (void) ldc_set_cb_mode(vport->ldc_handle, LDC_CB_DISABLE); + if ((rv = ldc_close(vport->ldc_handle)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_fini: cannot close channel %ld\n", + vport->ldc_id); + return (rv); + } + + if ((rv = ldc_unreg_callback(vport->ldc_handle)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_fini: port@%d ldc_unreg_callback" + "failed\n", vport->number); + return (rv); + } + + if ((rv = ldc_fini(vport->ldc_handle)) != 0) { + cmn_err(CE_CONT, "i_vcc_ldc_fini: cannot finilize channel" + "%ld\n", vport->ldc_id); + return (rv); + } + + return (0); +} + +/* read data from ldc channel */ + +static int +i_vcc_read_ldc(vcc_port_t *vport, char *data_buf, size_t *sz) +{ + + int rv; + size_t size; + size_t space_left = *sz; + vcc_msg_t buf; + int i; + + + + + /* make sure holding read lock */ + ASSERT((vport->status & VCC_PORT_USE_READ_LDC) == 0); + ASSERT(space_left >= VCC_MTU_SZ); + + *sz = 0; + while (space_left >= VCC_MTU_SZ) { + size = sizeof (buf); + + rv = ldc_read(vport->ldc_handle, (caddr_t)&buf, &size); + + if (rv) { + return (rv); + } + + + /* + * FIXME: ldc_read should not reaturn 0 with + * either size == 0, buf.size == 0 or size < VCC_HDR_SZ + */ + if (size == 0) { + if (*sz > 0) { + return (0); + } + return (EAGAIN); + } + + if (size < VCC_HDR_SZ) { + return (EIO); + } + + /* + * only data is expected from console - otherwise + * return error + */ + if (buf.type != LDC_CONSOLE_DATA) { + return (EIO); + } + + if (buf.size == 0) { + if (*sz > 0) { + return (0); + } + return (EAGAIN); + } + + /* copy data */ + for (i = 0; i < buf.size; i++, (*sz)++) { + data_buf[*sz] = buf.data[i]; + } + + space_left -= buf.size; + } + + return (0); +} + +/* callback from ldc */ +static uint_t +vcc_ldc_cb(uint64_t event, caddr_t arg) +{ + + vcc_port_t *vport = (vcc_port_t *)arg; + boolean_t isempty; + + /* + * do not need to hold lock because if ldc calls back, the + * ldc_handle must be valid. 
+ */ + D2("vcc_ldc_cb: callback invoked port=%d events=%llx\n", + vport->number, event); + + /* check event from ldc */ + if (event & LDC_EVT_WRITE) { + /* channel has space for write */ + + i_vcc_set_port_status(vport, &vport->write_cv, + VCC_PORT_LDC_WRITE_READY); + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_READ) { + + /* channel has data for read */ + (void) ldc_chkq(vport->ldc_handle, &isempty); + if (isempty) { + /* data already read */ + return (LDC_SUCCESS); + } + + i_vcc_set_port_status(vport, &vport->read_cv, + VCC_PORT_LDC_DATA_READY); + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_DOWN) { + /* channel is down */ + i_vcc_set_port_status(vport, &vport->write_cv, + VCC_PORT_LDC_LINK_DOWN); + cv_broadcast(&vport->read_cv); + + } + + return (LDC_SUCCESS); + +} + + +/* configure a vcc port with ldc channel */ +static int +i_vcc_config_port(vcc_t *vccp, uint_t portno, uint64_t ldc_id) +{ + int rv = EIO; + vcc_port_t *vport; + + if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) { + cmn_err(CE_CONT, "i_vcc_config_port: invalid port number %d\n", + portno); + return (EINVAL); + } + + vport = &(vccp->port[portno]); + if ((vport->status & VCC_PORT_AVAIL) == 0) { + cmn_err(CE_CONT, "i_vcc_config_port: port@%d does not exist\n", + portno); + return (EINVAL); + } + + + if (vport->ldc_id != VCC_INVALID_CHANNEL) { + cmn_err(CE_CONT, "i_vcc_config_port: port@%d channel already" + "configured\n", portno); + return (EINVAL); + } + + mutex_enter(&vport->lock); + + /* store the ldc ID */ + vport->ldc_id = ldc_id; + /* check if someone has already opened this port */ + if (vport->status & VCC_PORT_OPEN) { + + if ((rv = i_vcc_ldc_init(vccp, vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + /* mark port as ready */ + vport->status |= VCC_PORT_LDC_CHANNEL_READY; + cv_broadcast(&vport->read_cv); + cv_broadcast(&vport->write_cv); + } + + mutex_exit(&vport->lock); + + D1("i_vcc_config_port: port@%d ldc=%d, domain=%s", + vport->number, 
vport->ldc_id, vport->minorp->domain_name); + + return (0); +} + +/* add a vcc console port */ +static int +i_vcc_add_port(vcc_t *vccp, char *group_name, uint64_t tcp_port, + uint_t portno, char *domain_name) +{ + int instance; + int rv = MDEG_FAILURE; + minor_t minor; + vcc_port_t *vport; + uint_t minor_idx; + char name[MAXPATHLEN]; + + if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) { + DWARN("i_vcc_add_port: invalid port number %d\n", portno); + return (MDEG_FAILURE); + } + + vport = &(vccp->port[portno]); + if (vport->status & VCC_PORT_AVAIL) { + /* this port already exists */ + cmn_err(CE_CONT, "i_vcc_add_port: invalid port - port@%d " + "exists\n", portno); + return (MDEG_FAILURE); + } + + vport->number = portno; + vport->ldc_id = VCC_INVALID_CHANNEL; + + if (domain_name == NULL) { + cmn_err(CE_CONT, "i_vcc_add_port: invalid domain name\n"); + return (MDEG_FAILURE); + } + + if (group_name == NULL) { + cmn_err(CE_CONT, "i_vcc_add_port: invalid group name\n"); + return (MDEG_FAILURE); + } + + /* look up minor number */ + for (minor_idx = 0; minor_idx < vccp->minors_assigned; minor_idx++) { + if (strcmp(vccp->minor_tbl[minor_idx].domain_name, + domain_name) == 0) { + /* found previous assigned minor number */ + break; + } + } + + if (minor_idx == vccp->minors_assigned) { + /* end of lookup - assign new minor number */ + if (minor_idx == VCC_MAX_PORTS) { + cmn_err(CE_CONT, "i_vcc_add_port:" + "too many minornodes (%d)\n", + minor_idx); + return (MDEG_FAILURE); + } + + (void) strlcpy(vccp->minor_tbl[minor_idx].domain_name, + domain_name, MAXPATHLEN); + + vccp->minors_assigned++; + } + + vport->minorp = &vccp->minor_tbl[minor_idx]; + vccp->minor_tbl[minor_idx].portno = portno; + + (void) strlcpy(vport->group_name, group_name, MAXPATHLEN); + + vport->tcp_port = tcp_port; + D1("i_vcc_add_port:@%d domain=%s, group=%s, tcp=%lld", + vport->number, vport->minorp->domain_name, + vport->group_name, vport->tcp_port); + + + /* + * Create a minor node. 
The minor number is + * (instance << VCC_INST_SHIFT) | minor_idx + */ + instance = ddi_get_instance(vccp->dip); + + minor = (instance << VCC_INST_SHIFT) | (minor_idx); + + (void) snprintf(name, MAXPATHLEN - 1, "%s%s", VCC_MINOR_NAME_PREFIX, + domain_name); + + rv = ddi_create_minor_node(vccp->dip, name, S_IFCHR, minor, + DDI_NT_SERIAL, 0); + + if (rv != DDI_SUCCESS) { + vccp->minors_assigned--; + return (MDEG_FAILURE); + } + + mutex_enter(&vport->lock); + vport->status = VCC_PORT_AVAIL | VCC_PORT_ADDED; + mutex_exit(&vport->lock); + + + return (MDEG_SUCCESS); +} + +/* delete a port */ +static int +i_vcc_delete_port(vcc_t *vccp, vcc_port_t *vport) +{ + + char name[MAXPATHLEN]; + int rv; + + + ASSERT(mutex_owned(&vport->lock)); + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + D1("vcc_del_port port already deleted \n"); + return (0); + } + + if (vport->status & VCC_PORT_OPEN) { + /* do not block mdeg callback */ + vport->valid_pid = VCC_NO_PID_BLOCKING; + rv = i_vcc_close_port(vport); + } + + /* remove minor node */ + (void) snprintf(name, MAXPATHLEN-1, "%s%s", VCC_MINOR_NAME_PREFIX, + vport->minorp->domain_name); + + ddi_remove_minor_node(vccp->dip, name); + + /* let read and write thread know */ + cv_broadcast(&vport->read_cv); + cv_broadcast(&vport->write_cv); + vport->status = 0; + return (rv); + + +} + +/* register callback to MDEG */ +static int +i_vcc_mdeg_register(vcc_t *vccp, int instance) +{ + mdeg_prop_spec_t *pspecp; + mdeg_node_spec_t *ispecp; + mdeg_handle_t mdeg_hdl; + int sz; + int rv; + + /* + * Allocate and initialize a per-instance copy + * of the global property spec array that will + * uniquely identify this vcc instance. 
+ */ + sz = sizeof (vcc_prop_template); + pspecp = kmem_alloc(sz, KM_SLEEP); + + bcopy(vcc_prop_template, pspecp, sz); + + VCC_SET_MDEG_PROP_INST(pspecp, instance); + + /* initialize the complete prop spec structure */ + ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); + ispecp->namep = "virtual-device"; + ispecp->specp = pspecp; + + /* perform the registration */ + rv = mdeg_register(ispecp, &vcc_port_match, vcc_mdeg_cb, + vccp, &mdeg_hdl); + + if (rv != MDEG_SUCCESS) { + cmn_err(CE_CONT, "i_vcc_mdeg_register:" + "mdeg_register failed (%d)\n", rv); + kmem_free(ispecp, sizeof (mdeg_node_spec_t)); + kmem_free(pspecp, sz); + return (DDI_FAILURE); + } + + /* save off data that will be needed later */ + vccp->md_ispecp = (void *)ispecp; + vccp->mdeg_hdl = mdeg_hdl; + + return (0); +} + +/* destroy all mutex from port table */ +static void +i_vcc_cleanup_port_table(vcc_t *vccp) +{ + int i; + vcc_port_t *vport; + + for (i = 0; i < VCC_MAX_PORTS; i++) { + vport = &(vccp->port[i]); + mutex_destroy(&vport->lock); + cv_destroy(&vport->read_cv); + cv_destroy(&vport->write_cv); + } +} + +/* + * attach(9E): attach a device to the system. + * called once for each instance of the device on the system. 
+ */ +static int +vcc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i, instance, inst; + int rv = DDI_FAILURE; + vcc_t *vccp; + minor_t minor; + vcc_port_t *vport; + + switch (cmd) { + + case DDI_ATTACH: + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(vcc_ssp, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + ddi_soft_state_free(vccp, instance); + return (ENXIO); + } + + D1("vcc_attach: DDI_ATTACH instance=%d\n", instance); + + /* initialize the mutex */ + mutex_init(&vccp->lock, NULL, MUTEX_DRIVER, NULL); + + mutex_enter(&vccp->lock); + + vccp->dip = dip; + + for (i = 0; i < VCC_MAX_PORTS; i++) { + vport = &(vccp->port[i]); + mutex_init(&vport->lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vport->read_cv, NULL, CV_DRIVER, NULL); + cv_init(&vport->write_cv, NULL, CV_DRIVER, NULL); + vport->valid_pid = VCC_NO_PID_BLOCKING; + } + + vport = &vccp->port[VCC_CONTROL_PORT]; + mutex_enter(&vport->lock); + + vport->minorp = &vccp->minor_tbl[VCC_CONTROL_MINOR_IDX]; + vport->status |= VCC_PORT_AVAIL; + + /* create a minor node for vcc control */ + minor = (instance << VCC_INST_SHIFT) | VCC_CONTROL_MINOR_IDX; + + vccp->minor_tbl[VCC_CONTROL_PORT].portno = + VCC_CONTROL_MINOR_IDX; + + + rv = ddi_create_minor_node(vccp->dip, "ctl", S_IFCHR, minor, + DDI_NT_SERIAL, 0); + + mutex_exit(&vport->lock); + + if (rv != DDI_SUCCESS) { + cmn_err(CE_CONT, "vcc_attach: error" + "creating control minor node\n"); + + i_vcc_cleanup_port_table(vccp); + + mutex_exit(&vccp->lock); + /* clean up soft state */ + ddi_soft_state_free(vccp, instance); + + return (DDI_FAILURE); + } + + /* get the instance number by reading 'reg' property */ + inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", -1); + if (inst == -1) { + cmn_err(CE_CONT, "vcc_attach: vcc%d has no " + "'reg' property\n", + ddi_get_instance(dip)); + + i_vcc_cleanup_port_table(vccp); + + /* remove minor */ + 
ddi_remove_minor_node(vccp->dip, NULL); + + /* clean up soft state */ + mutex_exit(&vccp->lock); + ddi_soft_state_free(vccp, instance); + + return (DDI_FAILURE); + } + + /* + * Mdeg might invoke callback in the same call sequence + * if there is a domain port at the time of registration. + * Since the callback also grabs vcc->lock mutex, to avoid + * mutex reentry error, release the lock before registration + */ + mutex_exit(&vccp->lock); + + /* register for notifications from Zeus */ + rv = i_vcc_mdeg_register(vccp, inst); + if (rv != MDEG_SUCCESS) { + cmn_err(CE_CONT, "vcc_attach: error register to MD\n"); + + i_vcc_cleanup_port_table(vccp); + + /* remove minor */ + ddi_remove_minor_node(vccp->dip, NULL); + + /* clean up soft state */ + ddi_soft_state_free(vccp, instance); + + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + + case DDI_RESUME: + + return (DDI_SUCCESS); + + default: + + return (DDI_FAILURE); + } +} + +/* + * detach(9E): detach a device from the system. + */ +static int +vcc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int i, instance; + vcc_t *vccp; + mdeg_node_spec_t *ispecp; + vcc_port_t *vport; + + switch (cmd) { + + case DDI_DETACH: + + instance = ddi_get_instance(dip); + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) + return (ENXIO); + + D1("vcc_detach: DDI_DETACH instance=%d\n", instance); + + mutex_enter(&vccp->lock); + + /* unregister from MD event generator */ + + ASSERT(vccp->mdeg_hdl); + (void) mdeg_unregister(vccp->mdeg_hdl); + + ispecp = (mdeg_node_spec_t *)vccp->md_ispecp; + ASSERT(ispecp); + + kmem_free(ispecp->specp, sizeof (vcc_prop_template)); + kmem_free(ispecp, sizeof (mdeg_node_spec_t)); + + /* remove minor nodes */ + ddi_remove_minor_node(vccp->dip, NULL); + mutex_exit(&vccp->lock); + + for (i = 0; i < VCC_MAX_PORTS; i++) { + + vport = &vccp->port[i]; + mutex_enter(&vport->lock); + if (i == VCC_CONTROL_PORT) { + if (vport->status & VCC_PORT_OPEN) { + (void) i_vcc_close_port(vport); + } + } + + 
if ((vccp->port[i].status & VCC_PORT_AVAIL) && + (i != VCC_CONTROL_PORT)) { + D1("vcc_detach: removing port port@%d\n", i); + (void) i_vcc_delete_port(vccp, vport); + } + mutex_exit(&vport->lock); + cv_destroy(&vport->read_cv); + cv_destroy(&vport->write_cv); + mutex_destroy(&vport->lock); + } + + + + /* destroy mutex and free the soft state */ + mutex_destroy(&vccp->lock); + ddi_soft_state_free(vcc_ssp, instance); + + return (DDI_SUCCESS); + + case DDI_SUSPEND: + + return (DDI_SUCCESS); + + default: + + return (DDI_FAILURE); + } +} + +/* cb_open */ +static int +vcc_open(dev_t *devp, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(otyp, cred)) + + int instance; + int rv = EIO; + minor_t minor; + uint_t portno; + vcc_t *vccp; + vcc_port_t *vport; + + minor = getminor(*devp); + instance = VCCINST(minor); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + vport = &(vccp->port[portno]); + + mutex_enter(&vport->lock); + + if (vport->status & VCC_PORT_OPEN) { + /* only one open per port */ + cmn_err(CE_CONT, "vcc_open: virtual-console-concentrator@%d:%d " + "is already open\n", instance, portno); + mutex_exit(&vport->lock); + return (EAGAIN); + } + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + if (portno == VCC_CONTROL_PORT) { + vport->status |= VCC_PORT_OPEN; + mutex_exit(&vport->lock); + return (0); + } + + + /* check if channel has been initialized */ + if ((vport->status & VCC_PORT_LDC_CHANNEL_READY) == 0) { + rv = i_vcc_ldc_init(vccp, vport); + if (rv) { + mutex_exit(&vport->lock); + return (EIO); + } + + /* mark port as ready */ + vport->status |= VCC_PORT_LDC_CHANNEL_READY; + } + + vport->status |= VCC_PORT_USE_READ_LDC | VCC_PORT_USE_WRITE_LDC| + VCC_PORT_TERM_RD|VCC_PORT_TERM_WR|VCC_PORT_OPEN; + + if ((flag & O_NONBLOCK) || (flag & O_NDELAY)) { + vport->status |= 
VCC_PORT_NONBLOCK; + } + + mutex_exit(&vport->lock); + + return (0); +} + +/* close port */ +static int +i_vcc_close_port(vcc_port_t *vport) +{ + int rv = EIO; + + if ((vport->status & VCC_PORT_OPEN) == 0) { + return (0); + } + + ASSERT(mutex_owned(&vport->lock)); + + if (vport->status & VCC_PORT_LDC_CHANNEL_READY) { + /* clean up ldc channel */ + if ((rv = i_vcc_ldc_fini(vport)) != 0) { + return (rv); + } + vport->status &= ~VCC_PORT_LDC_CHANNEL_READY; + } + + /* reset rd/wr suspends */ + vport->status |= VCC_PORT_TERM_RD | VCC_PORT_TERM_WR; + vport->status &= ~VCC_PORT_NONBLOCK; + vport->status &= ~VCC_PORT_OPEN; + vport->valid_pid = VCC_NO_PID_BLOCKING; + + /* signal any blocked read and write thread */ + cv_broadcast(&vport->read_cv); + cv_broadcast(&vport->write_cv); + + return (0); +} + +/* cb_close */ +static int +vcc_close(dev_t dev, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(flag, otyp, cred)) + + int instance; + minor_t minor; + int rv = EIO; + uint_t portno; + vcc_t *vccp; + vcc_port_t *vport; + + minor = getminor(dev); + + instance = VCCINST(minor); + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + D1("vcc_close: closing virtual-console-concentrator@%d:%d\n", + instance, portno); + + vport = &(vccp->port[portno]); + + + if ((vport->status & VCC_PORT_OPEN) == 0) { + return (0); + } + + if (portno == VCC_CONTROL_PORT) { + /* + * vntsd closes control port before it exits. There + * could be events still pending for vntsd. + */ + rv = i_vcc_reset_events(vccp); + return (0); + } + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + rv = i_vcc_close_port(vport); + mutex_exit(&vport->lock); + + return (rv); +} + +/* + * ioctl VCC_CONS_TBL - vntsd allocates buffer according to return of + * VCC_NUM_PORTS. 
However, when vntsd requests for the console table, console + * ports could be deleted or added. parameter num_ports is number of structures + * that vntsd allocated for the table. If there are more ports than + * num_ports, set up to wakeup vntsd to add ports. + * If there less ports than num_ports, fill (-1) for cons_no to tell vntsd. + */ +static int +i_vcc_cons_tbl(vcc_t *vccp, uint_t num_ports, caddr_t buf, int mode) +{ + vcc_console_t cons; + int i; + vcc_port_t *vport; + boolean_t notify_vntsd = B_FALSE; + char pathname[MAXPATHLEN]; + + + (void) ddi_pathname(vccp->dip, pathname); + for (i = 0; i < VCC_MAX_PORTS; i++) { + + vport = &vccp->port[i]; + + if (i == VCC_CONTROL_PORT) { + continue; + } + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + continue; + } + + /* a port exists before vntsd becomes online */ + mutex_enter(&vport->lock); + + if (num_ports == 0) { + /* more ports than vntsd's buffer can hold */ + vport->status |= VCC_PORT_ADDED; + notify_vntsd = B_TRUE; + mutex_exit(&vport->lock); + continue; + } + + bzero(&cons, sizeof (vcc_console_t)); + + /* construct console buffer */ + cons.cons_no = vport->number; + cons.tcp_port = vport->tcp_port; + (void) memcpy(cons.domain_name, + vport->minorp->domain_name, MAXPATHLEN); + + (void) memcpy(cons.group_name, vport->group_name, + MAXPATHLEN); + vport->status &= ~VCC_PORT_ADDED; + mutex_exit(&vport->lock); + + (void) snprintf(cons.dev_name, MAXPATHLEN-1, "%s:%s%s", + pathname, VCC_MINOR_NAME_PREFIX, cons.domain_name); + + /* copy out data */ + if (ddi_copyout(&cons, (void *)buf, + sizeof (vcc_console_t), mode)) { + mutex_exit(&vport->lock); + return (EFAULT); + } + buf += sizeof (vcc_console_t); + + num_ports--; + + } + + if (num_ports == 0) { + /* vntsd's buffer is full */ + + if (notify_vntsd) { + /* more ports need to notify vntsd */ + vport = &vccp->port[VCC_CONTROL_PORT]; + mutex_enter(&vport->lock); + vport->pollevent |= VCC_POLL_ADD_PORT; + mutex_exit(&vport->lock); + } + + return (0); + } + + /* 
less ports than vntsd expected */ + bzero(&cons, sizeof (vcc_console_t)); + cons.cons_no = -1; + + while (num_ports > 0) { + /* fill vntsd buffer with no console */ + if (ddi_copyout(&cons, (void *)buf, + sizeof (vcc_console_t), mode) != 0) { + mutex_exit(&vport->lock); + return (EFAULT); + } + D1("i_vcc_cons_tbl: a port is deleted\n"); + buf += sizeof (vcc_console_t) +MAXPATHLEN; + num_ports--; + } + + return (0); +} + + +/* turn off event flag if there is no more change */ +static void +i_vcc_turn_off_event(vcc_t *vccp, uint32_t port_status, uint32_t event) +{ + + vcc_port_t *vport; + int i; + + for (i = 0; i < VCC_MAX_PORTS; i++) { + + vport = &(vccp->port[i]); + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + continue; + } + + + if (vport->status & port_status) { + /* more port changes status */ + return; + } + + } + + /* no more changed port */ + vport = &vccp->port[VCC_CONTROL_PORT]; + + /* turn off event */ + mutex_enter(&vport->lock); + vport->pollevent &= ~event; + mutex_exit(&vport->lock); +} + +/* ioctl VCC_CONS_INFO */ +static int +i_vcc_cons_info(vcc_t *vccp, caddr_t buf, int mode) +{ + vcc_console_t cons; + uint_t portno; + vcc_port_t *vport; + char pathname[MAXPATHLEN]; + + /* read in portno */ + if (ddi_copyin((void*)buf, &portno, sizeof (uint_t), mode)) { + return (EFAULT); + } + + D1("i_vcc_cons_info@%d:\n", portno); + + if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) { + return (EINVAL); + } + + vport = &vccp->port[portno]; + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + return (EINVAL); + } + + mutex_enter(&vport->lock); + vport->status &= ~VCC_PORT_ADDED; + + /* construct configruation data */ + bzero(&cons, sizeof (vcc_console_t)); + + cons.cons_no = vport->number; + cons.tcp_port = vport->tcp_port; + + (void) memcpy(cons.domain_name, vport->minorp->domain_name, MAXPATHLEN); + + (void) memcpy(cons.group_name, vport->group_name, MAXPATHLEN); + + mutex_exit(&vport->lock); + + (void) ddi_pathname(vccp->dip, pathname), + + /* 
copy device name */ + (void) snprintf(cons.dev_name, MAXPATHLEN-1, "%s:%s%s", + pathname, VCC_MINOR_NAME_PREFIX, cons.domain_name); + /* copy data */ + if (ddi_copyout(&cons, (void *)buf, + sizeof (vcc_console_t), mode) != 0) { + mutex_exit(&vport->lock); + return (EFAULT); + } + + D1("i_vcc_cons_info@%d:domain:%s serv:%s tcp@%lld %s\n", + cons.cons_no, cons.domain_name, + cons.group_name, cons.tcp_port, cons.dev_name); + + i_vcc_turn_off_event(vccp, VCC_PORT_ADDED, VCC_POLL_ADD_PORT); + + return (0); +} + + +/* response to vntsd inquiry ioctl call */ +static int +i_vcc_inquiry(vcc_t *vccp, caddr_t buf, int mode) +{ + vcc_port_t *vport; + uint_t i; + vcc_response_t msg; + + vport = &(vccp->port[VCC_CONTROL_PORT]); + + if ((vport->pollevent & VCC_POLL_ADD_PORT) == 0) { + return (EINVAL); + } + + /* an added port */ + + D1("i_vcc_inquiry\n"); + + for (i = 0; i < VCC_MAX_PORTS; i++) { + if ((vccp->port[i].status & VCC_PORT_AVAIL) == 0) { + continue; + } + + if (vccp->port[i].status & VCC_PORT_ADDED) { + /* port added */ + msg.reason = VCC_CONS_ADDED; + msg.cons_no = i; + + if (ddi_copyout((void *)&msg, (void *)buf, + sizeof (msg), mode) == -1) { + cmn_err(CE_CONT, "i_vcc_find_changed_port:" + "ddi_copyout" + " failed\n"); + return (EFAULT); + } + return (0); + } + } + + return (EINVAL); +} + +/* clean up events after vntsd exits */ +static int +i_vcc_reset_events(vcc_t *vccp) +{ + uint_t i; + vcc_port_t *vport; + + for (i = 0; i < VCC_MAX_PORTS; i++) { + vport = &(vccp->port[i]); + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + continue; + } + + ASSERT(!mutex_owned(&vport->lock)); + + if (i == VCC_CONTROL_PORT) { + /* close control port */ + mutex_enter(&vport->lock); + vport->status &= ~VCC_PORT_OPEN; + + /* clean up poll events */ + vport->pollevent = 0; + vport->pollflag = 0; + mutex_exit(&vport->lock); + continue; + } + if (vport->status & VCC_PORT_ADDED) { + /* pending added port event to vntsd */ + mutex_enter(&vport->lock); + vport->status &= ~VCC_PORT_ADDED; 
+ mutex_exit(&vport->lock); + } + + } + + vport = &vccp->port[VCC_CONTROL_PORT]; + + return (0); +} + +/* ioctl VCC_FORCE_CLOSE */ +static int +i_vcc_force_close(vcc_t *vccp, caddr_t buf, int mode) +{ + uint_t portno; + vcc_port_t *vport; + int rv; + + /* read in portno */ + if (ddi_copyin((void*)buf, &portno, sizeof (uint_t), mode)) { + return (EFAULT); + } + + D1("i_vcc_force_close@%d:\n", portno); + + if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) { + return (EINVAL); + } + + vport = &vccp->port[portno]; + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + return (EINVAL); + } + + mutex_enter(&vport->lock); + + rv = i_vcc_close_port(vport); + + /* block callers other than vntsd */ + vport->valid_pid = ddi_get_pid(); + + mutex_exit(&vport->lock); + return (rv); + +} + +/* ioctl VCC_CONS_STATUS */ +static int +i_vcc_cons_status(vcc_t *vccp, caddr_t buf, int mode) +{ + vcc_console_t console; + vcc_port_t *vport; + + /* read in portno */ + if (ddi_copyin((void*)buf, &console, sizeof (console), mode)) { + return (EFAULT); + } + + D1("i_vcc_cons_status@%d:\n", console.cons_no); + + if ((console.cons_no >= VCC_MAX_PORTS) || + (console.cons_no == VCC_CONTROL_PORT)) { + return (EINVAL); + } + + + vport = &vccp->port[console.cons_no]; + if ((vport->status & VCC_PORT_AVAIL) == 0) { + console.cons_no = -1; + } else if (strncmp(console.domain_name, vport->minorp->domain_name, + MAXPATHLEN)) { + console.cons_no = -1; + } else if (strncmp(console.group_name, vport->group_name, + MAXPATHLEN)) { + console.cons_no = -1; + } else if (console.tcp_port != vport->tcp_port) { + console.cons_no = -1; + } + + D1("i_vcc_cons_status@%d: %s %s %llx\n", console.cons_no, + console.group_name, console.domain_name, console.tcp_port); + if (ddi_copyout(&console, (void *)buf, sizeof (console), mode) == -1) { + cmn_err(CE_CONT, "i_vcc_cons_status ddi_copyout failed\n"); + return (EFAULT); + } + + return (0); +} + +/* cb_ioctl handler for vcc control port */ +static int 
+i_vcc_ctrl_ioctl(vcc_t *vccp, int cmd, void* arg, int mode) +{ + + static uint_t num_ports; + + + switch (cmd) { + + case VCC_NUM_CONSOLE: + + mutex_enter(&vccp->lock); + num_ports = vccp->num_ports; + mutex_exit(&vccp->lock); + /* number of consoles */ + + return (ddi_copyout((void *)&num_ports, arg, + sizeof (int), mode)); + case VCC_CONS_TBL: + + /* console config table */ + return (i_vcc_cons_tbl(vccp, num_ports, (caddr_t)arg, mode)); + + case VCC_INQUIRY: + + /* reason for wakeup */ + return (i_vcc_inquiry(vccp, (caddr_t)arg, mode)); + + case VCC_CONS_INFO: + /* a console config */ + return (i_vcc_cons_info(vccp, (caddr_t)arg, mode)); + + case VCC_FORCE_CLOSE: + /* force to close a console */ + return (i_vcc_force_close(vccp, (caddr_t)arg, mode)); + + case VCC_CONS_STATUS: + /* console status */ + return (i_vcc_cons_status(vccp, (caddr_t)arg, mode)); + + default: + + /* unknown command */ + return (ENODEV); + } + + +} + +/* write data to ldc. may block if channel has no space for write */ +static int +i_vcc_write_ldc(vcc_port_t *vport, vcc_msg_t *buf) +{ + int rv = EIO; + size_t size; + + ASSERT(mutex_owned(&vport->lock)); + ASSERT((vport->status & VCC_PORT_USE_WRITE_LDC) == 0); + + for (; ; ) { + + size = VCC_HDR_SZ + buf->size; + rv = ldc_write(vport->ldc_handle, (caddr_t)buf, &size); + + D1("i_vcc_write_ldc: port@%d: err=%d %d bytes\n", + vport->number, rv, size); + + if (rv == 0) { + return (rv); + } + + if (rv != EWOULDBLOCK) { + return (EIO); + } + + if (vport->status & VCC_PORT_NONBLOCK) { + return (EAGAIN); + } + + /* block util ldc has more space */ + + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_LDC_WRITE_READY); + + if (rv) { + return (rv); + } + + vport->status &= ~VCC_PORT_LDC_WRITE_READY; + + } + +} + + + +/* cb_ioctl handler for port ioctl */ +static int +i_vcc_port_ioctl(vcc_t *vccp, minor_t minor, int portno, int cmd, void *arg, + int mode) +{ + + vcc_port_t *vport; + struct termios term; + vcc_msg_t buf; + int rv; + + 
D1("i_vcc_port_ioctl@%d cmd %d\n", portno, cmd); + + vport = &(vccp->port[portno]); + + if ((vport->status & VCC_PORT_AVAIL) == 0) { + return (EIO); + } + + + switch (cmd) { + + /* terminal support */ + case TCGETA: + case TCGETS: + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + (void) memcpy(&term, &vport->term, sizeof (term)); + mutex_exit(&vport->lock); + + return (ddi_copyout(&term, arg, sizeof (term), mode)); + + case TCSETS: + case TCSETA: + case TCSETAW: + case TCSETAF: + + if (ddi_copyin(arg, &term, sizeof (term), mode) != 0) { + return (EFAULT); + } + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + (void) memcpy(&vport->term, &term, sizeof (term)); + mutex_exit(&vport->lock); + return (0); + + + case TCSBRK: + + /* send break to console */ + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + /* wait for write available */ + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_LDC_CHANNEL_READY| VCC_PORT_USE_WRITE_LDC); + if (rv) { + mutex_exit(&vport->lock); + return (rv); + } + + vport->status &= ~VCC_PORT_USE_WRITE_LDC; + + buf.type = LDC_CONSOLE_CTRL; + buf.ctrl_msg = LDC_CONSOLE_BREAK; + buf.size = 0; + + rv = i_vcc_write_ldc(vport, &buf); + + mutex_exit(&vport->lock); + + i_vcc_set_port_status(vport, &vport->write_cv, + VCC_PORT_USE_WRITE_LDC); + return (0); + + case TCXONC: + /* suspend read or write */ + if (ddi_copyin(arg, &cmd, sizeof (int), mode) != 0) { + return (EFAULT); + } + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + 
mutex_exit(&vport->lock); + return (rv); + } + + + switch (cmd) { + + case 0: + vport->status |= VCC_PORT_TERM_WR; + cv_broadcast(&vport->write_cv); + break; + case 1: + /* get write lock */ + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_USE_WRITE_LDC); + if (rv) { + mutex_exit(&vport->lock); + return (rv); + } + vport->status &= ~VCC_PORT_TERM_WR; + cv_broadcast(&vport->write_cv); + break; + case 2: + vport->status |= VCC_PORT_TERM_RD; + cv_broadcast(&vport->read_cv); + break; + case 3: + /* get read lock */ + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_USE_READ_LDC); + if (rv) { + mutex_exit(&vport->lock); + return (rv); + } + vport->status &= ~VCC_PORT_TERM_RD; + cv_broadcast(&vport->read_cv); + break; + + default: + break; + } + + mutex_exit(&vport->lock); + return (0); + + case TCFLSH: + return (0); + + default: + return (ENODEV); + } + +} + +/* cb_ioctl */ +static int +vcc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp) +{ + _NOTE(ARGUNUSED(credp, rvalp)) + + int instance; + minor_t minor; + int portno; + vcc_t *vccp; + + minor = getminor(dev); + + instance = VCCINST(minor); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + D1("vcc_ioctl: virtual-console-concentrator@%d:%d\n", instance, portno); + + if (portno >= VCC_MAX_PORTS) { + cmn_err(CE_CONT, "vcc_ioctl:virtual-console-concentrator@%d" + " invalid portno\n", portno); + return (EINVAL); + } + + D1("vcc_ioctl: virtual-console-concentrator@%d:%d ioctl cmd=%d\n", + instance, portno, cmd); + + if (portno == VCC_CONTROL_PORT) { + /* control ioctl */ + return (i_vcc_ctrl_ioctl(vccp, cmd, (void *)arg, mode)); + } + + /* data port ioctl */ + return (i_vcc_port_ioctl(vccp, minor, portno, cmd, (void *)arg, mode)); +} + +/* cb_read */ +static int +vcc_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + _NOTE(ARGUNUSED(credp)) + + int instance; + minor_t minor; + 
uint_t portno; + vcc_t *vccp; + vcc_port_t *vport; + int rv = EIO; /* by default fail ! */ + char *buf; + size_t uio_size; + size_t size; + + minor = getminor(dev); + + instance = VCCINST(minor); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + /* no read for control port */ + if (portno == VCC_CONTROL_PORT) { + return (EIO); + } + + /* temp buf to hold ldc data */ + uio_size = uiop->uio_resid; + + if (uio_size < VCC_MTU_SZ) { + return (EINVAL); + } + + vport = &(vccp->port[portno]); + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + + rv = i_vcc_wait_port_status(vport, &vport->read_cv, + VCC_PORT_TERM_RD|VCC_PORT_LDC_CHANNEL_READY| + VCC_PORT_USE_READ_LDC); + if (rv) { + mutex_exit(&vport->lock); + return (rv); + } + + buf = kmem_alloc(uio_size, KM_SLEEP); + + vport->status &= ~VCC_PORT_USE_READ_LDC; + + for (; ; ) { + + size = uio_size; + rv = i_vcc_read_ldc(vport, buf, &size); + + + if (rv == EAGAIN) { + /* should block? 
*/ + if (vport->status & VCC_PORT_NONBLOCK) { + break; + } + + } else if (rv) { + /* error */ + break; + } + + if (size > 0) { + /* got data */ + break; + } + + /* wait for data from ldc */ + vport->status &= ~VCC_PORT_LDC_DATA_READY; + rv = i_vcc_wait_port_status(vport, &vport->read_cv, + VCC_PORT_LDC_DATA_READY); + if (rv) { + break; + } + } + + mutex_exit(&vport->lock); + + if ((rv == 0) && (size > 0)) { + /* data is in buf */ + rv = uiomove(buf, size, UIO_READ, uiop); + } + + kmem_free(buf, uio_size); + i_vcc_set_port_status(vport, &vport->read_cv, VCC_PORT_USE_READ_LDC); + + return (rv); +} + + +/* cb_write */ +static int +vcc_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + _NOTE(ARGUNUSED(credp)) + + int instance; + minor_t minor; + size_t size; + size_t bytes; + uint_t portno; + vcc_t *vccp; + + vcc_port_t *vport; + int rv = EIO; + + vcc_msg_t buf; + + minor = getminor(dev); + + instance = VCCINST(minor); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + /* no write for control port */ + if (portno == VCC_CONTROL_PORT) { + return (EIO); + } + vport = &(vccp->port[portno]); + + /* + * check if the channel has been configured, + * if write has been suspend and grab write lock. 
+ */ + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + rv = i_vcc_wait_port_status(vport, &vport->write_cv, + VCC_PORT_TERM_WR|VCC_PORT_LDC_CHANNEL_READY| + VCC_PORT_USE_WRITE_LDC); + if (rv) { + mutex_exit(&vport->lock); + return (rv); + } + + vport->status &= ~VCC_PORT_USE_WRITE_LDC; + mutex_exit(&vport->lock); + size = uiop->uio_resid; + + D2("vcc_write: virtual-console-concentrator@%d:%d writing %d bytes\n", + instance, portno, size); + + + + buf.type = LDC_CONSOLE_DATA; + + while (size) { + + bytes = MIN(size, VCC_MTU_SZ); + /* move data */ + rv = uiomove(&(buf.data), bytes, UIO_WRITE, uiop); + + if (rv) { + break; + } + + /* write to ldc */ + buf.size = bytes; + + mutex_enter(&vport->lock); + + /* check minor no and pid */ + if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor), + vport)) != 0) { + mutex_exit(&vport->lock); + return (rv); + } + + rv = i_vcc_write_ldc(vport, &buf); + + mutex_exit(&vport->lock); + + if (rv) { + break; + } + + size -= bytes; + + } + + i_vcc_set_port_status(vport, &vport->write_cv, VCC_PORT_USE_WRITE_LDC); + return (rv); +} + +/* mdeg callback for a removed port */ +static int +i_vcc_md_remove_port(md_t *mdp, mde_cookie_t mdep, vcc_t *vccp) +{ + uint64_t portno; /* md requires 64bit for port number */ + int rv = MDEG_FAILURE; + vcc_port_t *vport; + + if (md_get_prop_val(mdp, mdep, "id", &portno)) { + cmn_err(CE_CONT, "vcc_mdeg_cb: port has no 'id' property\n"); + return (MDEG_FAILURE); + } + + if ((portno >= VCC_MAX_PORTS) || (portno < 0)) { + cmn_err(CE_CONT, "i_vcc_md_remove_port@%ld invalid port no\n", + portno); + return (MDEG_FAILURE); + } + + if (portno == VCC_CONTROL_PORT) { + cmn_err(CE_CONT, "i_vcc_md_remove_port@%ld can not remove" + "control port\n", + portno); + return (MDEG_FAILURE); + } + + vport = &(vccp->port[portno]); + + /* delete the port */ + mutex_enter(&vport->lock); + rv = 
i_vcc_delete_port(vccp, vport); + mutex_exit(&vport->lock); + + mutex_enter(&vccp->lock); + vccp->num_ports--; + mutex_exit(&vccp->lock); + + return (rv ? MDEG_FAILURE : MDEG_SUCCESS); +} + +static int +i_vcc_get_ldc_id(md_t *md, mde_cookie_t mdep, uint64_t *ldc_id) +{ + int num_nodes; + size_t size; + mde_cookie_t *channel; + int num_channels; + + + if ((num_nodes = md_node_count(md)) <= 0) { + cmn_err(CE_CONT, "i_vcc_get_ldc_channel_id:" + " Invalid node count in Machine Description subtree"); + return (-1); + } + size = num_nodes*(sizeof (*channel)); + channel = kmem_zalloc(size, KM_SLEEP); + ASSERT(channel != NULL); /* because KM_SLEEP */ + + + /* Look for channel endpoint child(ren) of the vdisk MD node */ + if ((num_channels = md_scan_dag(md, mdep, + md_find_name(md, "channel-endpoint"), + md_find_name(md, "fwd"), channel)) <= 0) { + cmn_err(CE_CONT, "i_vcc_get_ldc_id: No 'channel-endpoint'" + " found for vcc"); + kmem_free(channel, size); + return (-1); + } + + /* Get the "id" value for the first channel endpoint node */ + if (md_get_prop_val(md, channel[0], "id", ldc_id) != 0) { + cmn_err(CE_CONT, "i_vcc_get_ldc: No id property found " + "for channel-endpoint of vcc"); + kmem_free(channel, size); + return (-1); + } + + if (num_channels > 1) { + cmn_err(CE_CONT, "i_vcc_get_ldc: Warning: Using ID of first" + " of multiple channels for this vcc"); + } + + kmem_free(channel, size); + return (0); +} +/* mdeg callback for an added port */ +static int +i_vcc_md_add_port(md_t *mdp, mde_cookie_t mdep, vcc_t *vccp) +{ + uint64_t portno; /* md requires 64 bit */ + char *domain_name; + char *group_name; + uint64_t ldc_id; + uint64_t tcp_port; + vcc_port_t *vport; + + /* read in the port's reg property */ + if (md_get_prop_val(mdp, mdep, "id", &portno)) { + cmn_err(CE_CONT, "i_vcc_md_add_port_: port has no 'id' " + "property\n"); + return (MDEG_FAILURE); + } + + /* read in the port's "vcc-doman-name" property */ + if (md_get_prop_str(mdp, mdep, "vcc-domain-name", 
&domain_name)) { + cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has " + "no 'vcc-domain-name' property\n", portno); + return (MDEG_FAILURE); + } + + + /* read in the port's "vcc-group-name" property */ + if (md_get_prop_str(mdp, mdep, "vcc-group-name", &group_name)) { + cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has no " + "'vcc-group-name'property\n", portno); + return (MDEG_FAILURE); + } + + + /* read in the port's "vcc-tcp-port" property */ + if (md_get_prop_val(mdp, mdep, "vcc-tcp-port", &tcp_port)) { + cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has no" + "'vcc-tcp-port' property\n", portno); + return (MDEG_FAILURE); + } + + D1("i_vcc_md_add_port: port@%d domain-name=%s group-name=%s" + " tcp-port=%lld\n", portno, domain_name, group_name, tcp_port); + + /* add the port */ + if (i_vcc_add_port(vccp, group_name, tcp_port, portno, domain_name)) { + return (MDEG_FAILURE); + } + + vport = &vccp->port[portno]; + if (i_vcc_get_ldc_id(mdp, mdep, &ldc_id)) { + mutex_enter(&vport->lock); + (void) i_vcc_delete_port(vccp, vport); + mutex_exit(&vport->lock); + return (MDEG_FAILURE); + } + + /* configure the port */ + if (i_vcc_config_port(vccp, portno, ldc_id)) { + mutex_enter(&vport->lock); + (void) i_vcc_delete_port(vccp, vport); + mutex_exit(&vport->lock); + return (MDEG_FAILURE); + } + + mutex_enter(&vccp->lock); + vccp->num_ports++; + mutex_exit(&vccp->lock); + + vport = &vccp->port[VCC_CONTROL_PORT]; + + if (vport->pollflag & VCC_POLL_CONFIG) { + /* wakeup vntsd */ + mutex_enter(&vport->lock); + vport->pollevent |= VCC_POLL_ADD_PORT; + mutex_exit(&vport->lock); + pollwakeup(&vport->poll, POLLIN); + } + + return (MDEG_SUCCESS); +} + +/* mdeg callback */ +static int +vcc_mdeg_cb(void *cb_argp, mdeg_result_t *resp) +{ + int idx; + vcc_t *vccp; + int rv; + + vccp = (vcc_t *)cb_argp; + ASSERT(vccp); + + if (resp == NULL) { + return (MDEG_FAILURE); + } + + /* added port */ + D1("vcc_mdeg_cb: added %d port(s)\n", resp->added.nelem); + + for (idx = 0; idx < 
resp->added.nelem; idx++) { + rv = i_vcc_md_add_port(resp->added.mdp, + resp->added.mdep[idx], vccp); + + if (rv != MDEG_SUCCESS) { + return (rv); + } + } + + /* removed port */ + D1("vcc_mdeg_cb: removed %d port(s)\n", resp->removed.nelem); + + for (idx = 0; idx < resp->removed.nelem; idx++) { + rv = i_vcc_md_remove_port(resp->removed.mdp, + resp->removed.mdep[idx], vccp); + + if (rv != MDEG_SUCCESS) { + return (rv); + } + } + + /* + * XXX - Currently no support for updating already active + * ports. So, ignore the match_curr and match_prev arrays + * for now. + */ + + + return (MDEG_SUCCESS); +} + + +/* cb_chpoll */ +static int +vcc_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int instance; + minor_t minor; + uint_t portno; + vcc_t *vccp; + vcc_port_t *vport; + + minor = getminor(dev); + + instance = VCCINST(minor); + + vccp = ddi_get_soft_state(vcc_ssp, instance); + if (vccp == NULL) { + return (ENXIO); + } + + portno = VCCPORT(vccp, minor); + + vport = &(vccp->port[portno]); + + D1("vcc_chpoll: virtual-console-concentrator@%d events 0x%x\n", + portno, events); + + *reventsp = 0; + + if (portno != VCC_CONTROL_PORT) { + return (ENXIO); + } + + /* poll for config change */ + if (vport->pollevent) { + *reventsp |= (events & POLLIN); + } + + if (((*reventsp) == 0) && (!anyyet)) { + *phpp = &vport->poll; + if (events & POLLIN) { + mutex_enter(&vport->lock); + vport->pollflag |= VCC_POLL_CONFIG; + mutex_exit(&vport->lock); + } else { + return (ENXIO); + } + } + + D1("vcc_chpoll: virtual-console-concentrator@%d:%d ev=0x%x, " + "rev=0x%x pev=0x%x, flag=0x%x\n", + instance, portno, events, (*reventsp), + vport->pollevent, vport->pollflag); + + + return (0); +} diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c new file mode 100644 index 0000000000..8a3d5c3444 --- /dev/null +++ b/usr/src/uts/sun4v/io/vdc.c @@ -0,0 +1,3560 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms 
of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * LDoms virtual disk client (vdc) device driver + * + * This driver runs on a guest logical domain and communicates with the virtual + * disk server (vds) driver running on the service domain which is exporting + * virtualized "disks" to the guest logical domain. + * + * The driver can be divided into four sections: + * + * 1) generic device driver housekeeping + * _init, _fini, attach, detach, ops structures, etc. + * + * 2) communication channel setup + * Setup the communications link over the LDC channel that vdc uses to + * talk to the vDisk server. Initialise the descriptor ring which + * allows the LDC clients to transfer data via memory mappings. + * + * 3) Support exported to upper layers (filesystems, etc) + * The upper layers call into vdc via strategy(9E) and DKIO(7I) + * ioctl calls. vdc will copy the data to be written to the descriptor + * ring or maps the buffer to store the data read by the vDisk + * server into the descriptor ring. It then sends a message to the + * vDisk server requesting it to complete the operation. 
+ * + * 4) Handling responses from vDisk server. + * The vDisk server will ACK some or all of the messages vdc sends to it + * (this is configured during the handshake). Upon receipt of an ACK + * vdc will check the descriptor ring and signal to the upper layer + * code waiting on the IO. + */ + +#include <sys/conf.h> +#include <sys/disp.h> +#include <sys/ddi.h> +#include <sys/dkio.h> +#include <sys/efi_partition.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/mach_descrip.h> +#include <sys/modctl.h> +#include <sys/mdeg.h> +#include <sys/note.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/types.h> +#include <sys/promif.h> +#include <sys/vtoc.h> +#include <sys/archsystm.h> +#include <sys/sysmacros.h> + +#include <sys/cdio.h> +#include <sys/dktp/cm.h> +#include <sys/dktp/fdisk.h> +#include <sys/scsi/generic/sense.h> +#include <sys/scsi/impl/uscsi.h> /* Needed for defn of USCSICMD ioctl */ +#include <sys/scsi/targets/sddef.h> + +#include <sys/ldoms.h> +#include <sys/ldc.h> +#include <sys/vio_common.h> +#include <sys/vio_mailbox.h> +#include <sys/vdsk_common.h> +#include <sys/vdsk_mailbox.h> +#include <sys/vdc.h> + +/* + * function prototypes + */ + +/* standard driver functions */ +static int vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred); +static int vdc_close(dev_t dev, int flag, int otyp, cred_t *cred); +static int vdc_strategy(struct buf *buf); +static int vdc_print(dev_t dev, char *str); +static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk); +static int vdc_read(dev_t dev, struct uio *uio, cred_t *cred); +static int vdc_write(dev_t dev, struct uio *uio, cred_t *cred); +static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp); +static int vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred); +static int vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred); + +static int vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, + void *arg, void 
**resultp); +static int vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +/* setup */ +static int vdc_send(ldc_handle_t ldc_handle, caddr_t pkt, size_t *msglen); +static int vdc_do_ldc_init(vdc_t *vdc); +static int vdc_start_ldc_connection(vdc_t *vdc); +static int vdc_create_device_nodes(vdc_t *vdc); +static int vdc_create_device_nodes_props(vdc_t *vdc); +static int vdc_get_ldc_id(dev_info_t *dip, uint64_t *ldc_id); +static void vdc_terminate_ldc(vdc_t *vdc); +static int vdc_init_descriptor_ring(vdc_t *vdc); +static void vdc_destroy_descriptor_ring(vdc_t *vdc); + +/* handshake with vds */ +static void vdc_init_handshake_negotiation(void *arg); +static int vdc_init_ver_negotiation(vdc_t *vdc); +static int vdc_init_attr_negotiation(vdc_t *vdc); +static int vdc_init_dring_negotiate(vdc_t *vdc); +static int vdc_handle_ver_negotiate(); +static int vdc_handle_attr_negotiate(); +static void vdc_reset_connection(vdc_t *vdc, boolean_t resetldc); +static boolean_t vdc_is_able_to_tx_data(vdc_t *vdc, int flag); + +/* processing */ +static void vdc_process_msg_thread(vdc_t *vdc); +static uint_t vdc_handle_cb(uint64_t event, caddr_t arg); +static void vdc_process_msg(void *arg); +static int vdc_process_ctrl_msg(vdc_t *vdc, vio_msg_t msg); +static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg); +static int vdc_process_err_msg(vdc_t *vdc, vio_msg_t msg); +static void vdc_do_process_msg(vdc_t *vdc); +static int vdc_get_next_dring_entry_id(vdc_t *vdc, uint_t needed); +static int vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, + size_t nbytes, int op, uint64_t arg, uint64_t slice); +static int vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, + vio_dring_msg_t dmsg); +static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); +static int vdc_get_response(vdc_t *vdc, int start, int end); +static int vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, + caddr_t addr, size_t nbytes, int operation); +static 
boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int + num_msgs); + +/* dkio */ +static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); +static int vdc_create_fake_geometry(vdc_t *vdc); + +/* + * Module variables + */ +uint64_t vdc_hz_timeout; +uint64_t vdc_usec_timeout = VDC_USEC_TIMEOUT_MIN; +uint64_t vdc_dump_usec_timeout = VDC_USEC_TIMEOUT_MIN / 300; +static int vdc_retries = VDC_RETRIES; +static int vdc_dump_retries = VDC_RETRIES * 10; + +/* Soft state pointer */ +static void *vdc_state; + +/* variable level controlling the verbosity of the error/debug messages */ +int vdc_msglevel = 0; + + +static void +vdc_msg(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vcmn_err(CE_CONT, format, args); + va_end(args); +} + +static struct cb_ops vdc_cb_ops = { + vdc_open, /* cb_open */ + vdc_close, /* cb_close */ + vdc_strategy, /* cb_strategy */ + vdc_print, /* cb_print */ + vdc_dump, /* cb_dump */ + vdc_read, /* cb_read */ + vdc_write, /* cb_write */ + vdc_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_str */ + D_MP | D_64BIT, /* cb_flag */ + CB_REV, /* cb_rev */ + vdc_aread, /* cb_aread */ + vdc_awrite /* cb_awrite */ +}; + +static struct dev_ops vdc_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + vdc_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + vdc_attach, /* devo_attach */ + vdc_detach, /* devo_detach */ + nodev, /* devo_reset */ + &vdc_cb_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + nulldev /* devo_power */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + "virtual disk client %I%", + &vdc_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +/* -------------------------------------------------------------------------- */ + +/* + * Device Driver housekeeping and setup + */ + 
+int +_init(void) +{ + int status; + + if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0) + return (status); + if ((status = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(&vdc_state); + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int status; + + if ((status = mod_remove(&modlinkage)) != 0) + return (status); + ddi_soft_state_fini(&vdc_state); + return (0); +} + +static int +vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + _NOTE(ARGUNUSED(dip)) + + int instance = SDUNIT(getminor((dev_t)arg)); + vdc_t *vdc = NULL; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + *resultp = NULL; + return (DDI_FAILURE); + } + *resultp = vdc->dip; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + default: + *resultp = NULL; + return (DDI_FAILURE); + } +} + +static int +vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance; + int rv; + uint_t retries = 0; + vdc_t *vdc = NULL; + + switch (cmd) { + case DDI_DETACH: + /* the real work happens below */ + break; + case DDI_SUSPEND: + /* nothing to do for this non-device */ + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + ASSERT(cmd == DDI_DETACH); + instance = ddi_get_instance(dip); + PR1("%s[%d] Entered\n", __func__, instance); + + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s[%d]: Could not get state structure.", + __func__, instance); + return (DDI_FAILURE); + } + + if (vdc->open) { + PR0("%s[%d]: Cannot detach: device is open", + __func__, instance); + return (DDI_FAILURE); + } + + PR0("%s[%d] proceeding...\n", __func__, instance); + + /* + * try and disable callbacks to prevent another handshake + */ + rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE); + PR0("%s[%d] callback disabled 
(rv=%d)\n", __func__, instance, rv); + + /* + * Prevent any more attempts to start a handshake with the vdisk + * server and tear down the existing connection. + */ + mutex_enter(&vdc->lock); + vdc->initialized |= VDC_HANDSHAKE_STOP; + vdc_reset_connection(vdc, B_TRUE); + mutex_exit(&vdc->lock); + + if (vdc->initialized & VDC_THREAD) { + mutex_enter(&vdc->msg_proc_lock); + vdc->msg_proc_thr_state = VDC_THR_STOP; + vdc->msg_pending = B_TRUE; + cv_signal(&vdc->msg_proc_cv); + + while (vdc->msg_proc_thr_state != VDC_THR_DONE) { + PR0("%s[%d]: Waiting for thread to exit\n", + __func__, instance); + rv = cv_timedwait(&vdc->msg_proc_cv, + &vdc->msg_proc_lock, VD_GET_TIMEOUT_HZ(1)); + if ((rv == -1) && (retries++ > vdc_retries)) + break; + } + mutex_exit(&vdc->msg_proc_lock); + } + + mutex_enter(&vdc->lock); + + if (vdc->initialized & VDC_DRING) + vdc_destroy_descriptor_ring(vdc); + + if (vdc->initialized & VDC_LDC) + vdc_terminate_ldc(vdc); + + mutex_exit(&vdc->lock); + + if (vdc->initialized & VDC_MINOR) { + ddi_prop_remove_all(dip); + ddi_remove_minor_node(dip, NULL); + } + + if (vdc->initialized & VDC_LOCKS) { + mutex_destroy(&vdc->lock); + mutex_destroy(&vdc->attach_lock); + mutex_destroy(&vdc->msg_proc_lock); + mutex_destroy(&vdc->dring_lock); + cv_destroy(&vdc->cv); + cv_destroy(&vdc->attach_cv); + cv_destroy(&vdc->msg_proc_cv); + } + + if (vdc->minfo) + kmem_free(vdc->minfo, sizeof (struct dk_minfo)); + + if (vdc->cinfo) + kmem_free(vdc->cinfo, sizeof (struct dk_cinfo)); + + if (vdc->vtoc) + kmem_free(vdc->vtoc, sizeof (struct vtoc)); + + if (vdc->initialized & VDC_SOFT_STATE) + ddi_soft_state_free(vdc_state, instance); + + PR0("%s[%d] End %p\n", __func__, instance, vdc); + + return (DDI_SUCCESS); +} + + +static int +vdc_do_attach(dev_info_t *dip) +{ + int instance; + vdc_t *vdc = NULL; + int status; + uint_t retries = 0; + + ASSERT(dip != NULL); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) { + 
vdc_msg("%s:(%d): Couldn't alloc state structure", + __func__, instance); + return (DDI_FAILURE); + } + + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s:(%d): Could not get state structure.", + __func__, instance); + return (DDI_FAILURE); + } + + /* + * We assign the value to initialized in this case to zero out the + * variable and then set bits in it to indicate what has been done + */ + vdc->initialized = VDC_SOFT_STATE; + + vdc_hz_timeout = drv_usectohz(vdc_usec_timeout); + + vdc->dip = dip; + vdc->instance = instance; + vdc->open = 0; + vdc->vdisk_type = VD_DISK_TYPE_UNK; + vdc->state = VD_STATE_INIT; + vdc->ldc_state = 0; + vdc->session_id = 0; + vdc->block_size = DEV_BSIZE; + vdc->max_xfer_sz = VD_MAX_BLOCK_SIZE / DEV_BSIZE; + + vdc->vtoc = NULL; + vdc->cinfo = NULL; + vdc->minfo = NULL; + + mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vdc->attach_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vdc->msg_proc_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vdc->dring_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vdc->cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->attach_cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->msg_proc_cv, NULL, CV_DRIVER, NULL); + vdc->initialized |= VDC_LOCKS; + + vdc->msg_pending = B_FALSE; + vdc->msg_proc_thr_id = thread_create(NULL, 0, vdc_process_msg_thread, + vdc, 0, &p0, TS_RUN, minclsyspri); + if (vdc->msg_proc_thr_id == NULL) { + cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread", + instance); + return (DDI_FAILURE); + } + vdc->initialized |= VDC_THREAD; + + /* initialise LDC channel which will be used to communicate with vds */ + if (vdc_do_ldc_init(vdc) != 0) { + cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance); + return (DDI_FAILURE); + } + + /* Bring up connection with vds via LDC */ + status = vdc_start_ldc_connection(vdc); + if (status != 0) { + vdc_msg("%s[%d] Could not start LDC", __func__, instance); + return (DDI_FAILURE); + } + + /* + * We need to wait 
until the handshake has completed before leaving + * the attach(). This is to allow the device node(s) to be created + * and the first usage of the filesystem to succeed. + */ + mutex_enter(&vdc->attach_lock); + while ((vdc->ldc_state != LDC_UP) || + (vdc->state != VD_STATE_DATA)) { + + PR0("%s[%d] handshake in progress [VD %d (LDC %d)]\n", + __func__, instance, vdc->state, vdc->ldc_state); + + status = cv_timedwait(&vdc->attach_cv, &vdc->attach_lock, + VD_GET_TIMEOUT_HZ(1)); + if (status == -1) { + if (retries >= vdc_retries) { + PR0("%s[%d] Give up handshake wait.\n", + __func__, instance); + mutex_exit(&vdc->attach_lock); + return (DDI_FAILURE); + } else { + PR0("%s[%d] Retry #%d for handshake.\n", + __func__, instance, retries); + retries++; + } + } + } + mutex_exit(&vdc->attach_lock); + + if (vdc->vtoc == NULL) + vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP); + + status = vdc_populate_descriptor(vdc, (caddr_t)vdc->vtoc, + P2ROUNDUP(sizeof (struct vtoc), sizeof (uint64_t)), + VD_OP_GET_VTOC, FKIOCTL, 0); + if (status) { + cmn_err(CE_NOTE, "[%d] Failed to get VTOC", instance); + return (status); + } + + /* + * Now that we have the device info we can create the + * device nodes and properties + */ + status = vdc_create_device_nodes(vdc); + if (status) { + cmn_err(CE_NOTE, "[%d] Failed to create device nodes", + instance); + return (status); + } + status = vdc_create_device_nodes_props(vdc); + if (status) { + cmn_err(CE_NOTE, "[%d] Failed to create device nodes" + " properties", instance); + return (status); + } + + ddi_report_dev(dip); + + PR0("%s[%d] Attach completed\n", __func__, instance); + return (status); +} + +static int +vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int status; + + PR0("%s[%d] Entered. 
Built %s %s\n", __func__, ddi_get_instance(dip), + __DATE__, __TIME__); + + switch (cmd) { + case DDI_ATTACH: + if ((status = vdc_do_attach(dip)) != 0) + (void) vdc_detach(dip, DDI_DETACH); + return (status); + case DDI_RESUME: + /* nothing to do for this non-device */ + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } +} + +static int +vdc_do_ldc_init(vdc_t *vdc) +{ + int status = 0; + ldc_status_t ldc_state; + ldc_attr_t ldc_attr; + uint64_t ldc_id = 0; + dev_info_t *dip = NULL; + + ASSERT(vdc != NULL); + + dip = vdc->dip; + vdc->initialized |= VDC_LDC; + + if ((status = vdc_get_ldc_id(dip, &ldc_id)) != 0) { + vdc_msg("%s: Failed to get <ldc_id> property\n", __func__); + return (EIO); + } + vdc->ldc_id = ldc_id; + + ldc_attr.devclass = LDC_DEV_BLK; + ldc_attr.instance = vdc->instance; + ldc_attr.mode = LDC_MODE_UNRELIABLE; /* unreliable transport */ + ldc_attr.qlen = VD_LDC_QLEN; + + if ((vdc->initialized & VDC_LDC_INIT) == 0) { + status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle); + if (status != 0) { + cmn_err(CE_NOTE, "[%d] ldc_init(chan %ld) returned %d", + vdc->instance, ldc_id, status); + return (status); + } + vdc->initialized |= VDC_LDC_INIT; + } + status = ldc_status(vdc->ldc_handle, &ldc_state); + if (status != 0) { + vdc_msg("Cannot discover LDC status [err=%d].", status); + return (status); + } + vdc->ldc_state = ldc_state; + + if ((vdc->initialized & VDC_LDC_CB) == 0) { + status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb, + (caddr_t)vdc); + if (status != 0) { + vdc_msg("%s: ldc_reg_callback()=%d", __func__, status); + return (status); + } + vdc->initialized |= VDC_LDC_CB; + } + + vdc->initialized |= VDC_LDC; + + /* + * At this stage we have initialised LDC, we will now try and open + * the connection. 
+ */ + if (vdc->ldc_state == LDC_INIT) { + status = ldc_open(vdc->ldc_handle); + if (status != 0) { + cmn_err(CE_NOTE, "[%d] ldc_open(chan %ld) returned %d", + vdc->instance, vdc->ldc_id, status); + return (status); + } + vdc->initialized |= VDC_LDC_OPEN; + } + + return (status); +} + +static int +vdc_start_ldc_connection(vdc_t *vdc) +{ + int status = 0; + + ASSERT(vdc != NULL); + + mutex_enter(&vdc->lock); + + if (vdc->ldc_state == LDC_UP) { + PR0("%s: LDC is already UP ..\n", __func__); + mutex_exit(&vdc->lock); + return (0); + } + + if ((status = ldc_up(vdc->ldc_handle)) != 0) { + switch (status) { + case ECONNREFUSED: /* listener not ready at other end */ + PR0("%s: ldc_up(%d,...) return %d\n", + __func__, vdc->ldc_id, status); + status = 0; + break; + default: + cmn_err(CE_NOTE, "[%d] Failed to bring up LDC: " + "channel=%ld, err=%d", + vdc->instance, vdc->ldc_id, status); + } + } + + PR0("%s[%d] Finished bringing up LDC\n", __func__, vdc->instance); + + mutex_exit(&vdc->lock); + + return (status); +} + + +/* + * Function: + * vdc_create_device_nodes + * + * Description: + * This function creates the block and character device nodes under + * /devices along with the node properties. It is called as part of + * the attach(9E) of the instance during the handshake with vds after + * vds has sent the attributes to vdc. + * + * If the device is of type VD_DISK_TYPE_SLICE then the minor node + * of 2 is used in keeping with the Solaris convention that slice 2 + * refers to a whole disk. 
Slices start at 'a' + * + * Parameters: + * vdc - soft state pointer + * + * Return Values + * 0 - Success + * EIO - Failed to create node + * EINVAL - Unknown type of disk exported + */ +static int +vdc_create_device_nodes(vdc_t *vdc) +{ + /* uses NNNN which is OK as long as # of disks <= 10000 */ + char name[sizeof ("disk@NNNN:s,raw")]; + dev_info_t *dip = NULL; + int instance; + int num_slices = 1; + int i; + + ASSERT(vdc != NULL); + + instance = vdc->instance; + dip = vdc->dip; + + switch (vdc->vdisk_type) { + case VD_DISK_TYPE_DISK: + num_slices = V_NUMPAR; + break; + case VD_DISK_TYPE_SLICE: + num_slices = 1; + break; + case VD_DISK_TYPE_UNK: + default: + return (EINVAL); + } + + for (i = 0; i < num_slices; i++) { + (void) snprintf(name, sizeof (name), "%c", 'a' + i); + if (ddi_create_minor_node(dip, name, S_IFBLK, + VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { + vdc_msg("%s[%d]: Couldn't add block node %s.", + __func__, instance, name); + return (EIO); + } + + /* if any device node is created we set this flag */ + vdc->initialized |= VDC_MINOR; + + (void) snprintf(name, sizeof (name), "%c%s", + 'a' + i, ",raw"); + if (ddi_create_minor_node(dip, name, S_IFCHR, + VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) { + vdc_msg("%s[%d]: Could not add raw node %s.", + __func__, instance, name); + return (EIO); + } + } + + return (0); +} + +/* + * Function: + * vdc_create_device_nodes_props + * + * Description: + * This function creates the block and character device nodes under + * /devices along with the node properties. It is called as part of + * the attach(9E) of the instance during the handshake with vds after + * vds has sent the attributes to vdc. 
+ * + * Parameters: + * vdc - soft state pointer + * + * Return Values + * 0 - Success + * EIO - Failed to create device node property + * EINVAL - Unknown type of disk exported + */ +static int +vdc_create_device_nodes_props(vdc_t *vdc) +{ + dev_info_t *dip = NULL; + int instance; + int num_slices = 1; + int64_t size = 0; + dev_t dev; + int rv; + int i; + + ASSERT(vdc != NULL); + + instance = vdc->instance; + dip = vdc->dip; + + if ((vdc->vtoc == NULL) || (vdc->vtoc->v_sanity != VTOC_SANE)) { + cmn_err(CE_NOTE, "![%d] Could not create device node property." + " No VTOC available", instance); + return (ENXIO); + } + + switch (vdc->vdisk_type) { + case VD_DISK_TYPE_DISK: + num_slices = V_NUMPAR; + break; + case VD_DISK_TYPE_SLICE: + num_slices = 1; + break; + case VD_DISK_TYPE_UNK: + default: + return (EINVAL); + } + + for (i = 0; i < num_slices; i++) { + dev = makedevice(ddi_driver_major(dip), + VD_MAKE_DEV(instance, i)); + + size = vdc->vtoc->v_part[i].p_size * vdc->vtoc->v_sectorsz; + PR0("%s[%d] sz %ld (%ld Mb) p_size %lx\n", + __func__, instance, size, size / (1024 * 1024), + vdc->vtoc->v_part[i].p_size); + + rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size); + if (rv != DDI_PROP_SUCCESS) { + vdc_msg("%s:(%d): Couldn't add \"%s\" [%d]\n", + __func__, instance, VDC_SIZE_PROP_NAME, size); + return (EIO); + } + + rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME, + lbtodb(size)); + if (rv != DDI_PROP_SUCCESS) { + vdc_msg("%s:(%d): Couldn't add \"%s\" [%d]\n", __func__, + instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size)); + return (EIO); + } + } + + return (0); +} + +static int +vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + int instance; + int status = 0; + vdc_t *vdc; + + ASSERT(dev != NULL); + instance = SDUNIT(getminor(*dev)); + + PR0("%s[%d] minor = %d flag = %x, otyp = %x\n", __func__, instance, + getminor(*dev), flag, otyp); + + if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK)) + return (EINVAL); + + if 
((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s[%d] Could not get state.", __func__, instance); + return (ENXIO); + } + + /* + * Check to see if we can communicate with vds + */ + status = vdc_is_able_to_tx_data(vdc, flag); + if (status == B_FALSE) { + PR0("%s[%d] Not ready to transmit data\n", __func__, instance); + return (ENOLINK); + } + + mutex_enter(&vdc->lock); + vdc->open++; + mutex_exit(&vdc->lock); + + return (0); +} + +static int +vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + int instance; + vdc_t *vdc; + + instance = SDUNIT(getminor(dev)); + + PR0("%s[%d] flag = %x, otyp = %x\n", __func__, instance, flag, otyp); + + if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK)) + return (EINVAL); + + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s[%d] Could not get state.", __func__, instance); + return (ENXIO); + } + + /* + * Check to see if we can communicate with vds + */ + if (vdc_is_able_to_tx_data(vdc, 0) == B_FALSE) { + PR0("%s[%d] Not ready to transmit data\n", __func__, instance); + return (ETIMEDOUT); + } + + if (vdc->dkio_flush_pending) { + PR0("%s[%d]: Cannot detach: %d outstanding DKIO flushes", + __func__, instance, vdc->dkio_flush_pending); + return (EBUSY); + } + + /* + * Should not need the mutex here, since the framework should protect + * against more opens on this device, but just in case. 
+ */ + mutex_enter(&vdc->lock); + vdc->open--; + mutex_exit(&vdc->lock); + + return (0); +} + +static int +vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) +{ + _NOTE(ARGUNUSED(credp)) + _NOTE(ARGUNUSED(rvalp)) + + return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode)); +} + +static int +vdc_print(dev_t dev, char *str) +{ + cmn_err(CE_NOTE, "vdc%d: %s", SDUNIT(getminor(dev)), str); + return (0); +} + +static int +vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) +{ + int rv = 0; + size_t nbytes = (nblk * DEV_BSIZE); + int instance = SDUNIT(getminor(dev)); + vdc_t *vdc; + + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s (%d): Could not get state.", __func__, instance); + return (ENXIO); + } + + rv = vdc_populate_descriptor(vdc, addr, nbytes, VD_OP_BWRITE, + blkno, SDPART(getminor(dev))); + + PR1("%s: status=%d\n", __func__, rv); + + return (rv); +} + +/* -------------------------------------------------------------------------- */ + +/* + * Disk access routines + * + */ + +/* + * vdc_strategy() + * + * Return Value: + * 0: As per strategy(9E), the strategy() function must return 0 + * [ bioerror(9f) sets b_flags to the proper error code ] + */ +static int +vdc_strategy(struct buf *buf) +{ + int rv = -1; + vdc_t *vdc = NULL; + int instance = SDUNIT(getminor(buf->b_edev)); + int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE; + + PR1("%s: %s %ld bytes at block %ld : b_addr=0x%p", + __func__, (buf->b_flags & B_READ) ? 
"Read" : "Write", + buf->b_bcount, buf->b_lblkno, buf->b_un.b_addr); + + if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { + vdc_msg("%s[%d]: Could not get state.", __func__, instance); + bioerror(buf, ENXIO); + biodone(buf); + return (0); + } + + ASSERT(buf->b_bcount <= (vdc->max_xfer_sz * vdc->block_size)); + + if (vdc_is_able_to_tx_data(vdc, O_NONBLOCK) == B_FALSE) { + vdc_msg("%s: Not ready to transmit data", __func__); + bioerror(buf, ENXIO); + biodone(buf); + return (0); + } + bp_mapin(buf); + + rv = vdc_populate_descriptor(vdc, buf->b_un.b_addr, buf->b_bcount, op, + buf->b_lblkno, SDPART(getminor(buf->b_edev))); + + PR1("%s: status=%d", __func__, rv); + bioerror(buf, rv); + biodone(buf); + return (0); +} + + +static int +vdc_read(dev_t dev, struct uio *uio, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + PR1("vdc_read(): Entered"); + return (physio(vdc_strategy, NULL, dev, B_READ, minphys, uio)); +} + +static int +vdc_write(dev_t dev, struct uio *uio, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + PR1("vdc_write(): Entered"); + return (physio(vdc_strategy, NULL, dev, B_WRITE, minphys, uio)); +} + +static int +vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + PR1("vdc_aread(): Entered"); + return (aphysio(vdc_strategy, anocancel, dev, B_READ, minphys, aio)); +} + +static int +vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred) +{ + _NOTE(ARGUNUSED(cred)) + + PR1("vdc_awrite(): Entered"); + return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, minphys, aio)); +} + + +/* -------------------------------------------------------------------------- */ + +/* + * Handshake support + */ + +/* + * vdc_init_handshake_negotiation + * + * Description: + * This function is called to trigger the handshake negotiations between + * the client (vdc) and the server (vds). It may be called multiple times. 
+ * + * Parameters: + * vdc - soft state pointer + */ +static void +vdc_init_handshake_negotiation(void *arg) +{ + vdc_t *vdc = (vdc_t *)(void *)arg; + vd_state_t state; + + ASSERT(vdc != NULL); + ASSERT(vdc->ldc_state == LDC_UP); + + mutex_enter(&vdc->lock); + + /* + * Do not continue if another thread has triggered a handshake which + * is in progress or detach() has stopped further handshakes. + */ + if (vdc->initialized & (VDC_HANDSHAKE | VDC_HANDSHAKE_STOP)) { + PR0("%s[%d] Negotiation not triggered. [init=%x]\n", + __func__, vdc->instance, vdc->initialized); + mutex_exit(&vdc->lock); + return; + } + + PR0("Initializing vdc<->vds handshake\n"); + + vdc->initialized |= VDC_HANDSHAKE; + + state = vdc->state; + + if (state == VD_STATE_INIT) { + (void) vdc_init_ver_negotiation(vdc); + } else if (state == VD_STATE_VER) { + (void) vdc_init_attr_negotiation(vdc); + } else if (state == VD_STATE_ATTR) { + (void) vdc_init_dring_negotiate(vdc); + } else if (state == VD_STATE_DATA) { + /* + * nothing to do - we have already completed the negotiation + * and we can transmit data when ready. 
+	 */
+		PR0("%s[%d] Negotiation triggered after handshake completed",
+		    __func__, vdc->instance);
+	}
+
+	mutex_exit(&vdc->lock);
+}
+
+static int
+vdc_init_ver_negotiation(vdc_t *vdc)
+{
+	vio_ver_msg_t	pkt;
+	size_t		msglen = sizeof (pkt);
+	int		status = -1;
+
+	PR0("%s: Entered.\n", __func__);
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	/*
+	 * set the Session ID to a unique value
+	 * (the lower 32 bits of the clock tick)
+	 */
+	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
+
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_VER_INFO;
+	pkt.tag.vio_sid = vdc->session_id;
+	pkt.dev_class = VDEV_DISK;
+	pkt.ver_major = VD_VER_MAJOR;
+	pkt.ver_minor = VD_VER_MINOR;
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	PR0("%s: vdc_send(status = %d)\n", __func__, status);
+
+	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
+		PR0("%s[%d] vdc_send failed: id(%lx) rv(%d) size(%d)\n",
+		    __func__, vdc->instance, vdc->ldc_handle,
+		    status, msglen);
+		if (msglen != sizeof (vio_ver_msg_t))
+			status = ENOMSG;
+	}
+
+	return (status);
+}
+
+static int
+vdc_init_attr_negotiation(vdc_t *vdc)
+{
+	vd_attr_msg_t	pkt;
+	size_t		msglen = sizeof (pkt);
+	int		status;
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	PR0("%s[%d] entered\n", __func__, vdc->instance);
+
+	/* fill in tag */
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
+	pkt.tag.vio_sid = vdc->session_id;
+	/* fill in payload */
+	pkt.max_xfer_sz = vdc->max_xfer_sz;
+	pkt.vdisk_block_size = vdc->block_size;
+	pkt.xfer_mode = VIO_DRING_MODE;
+	pkt.operations = 0;	/* server will set bits of valid operations */
+	pkt.vdisk_type = 0;	/* server will set to valid device type */
+	pkt.vdisk_size = 0;	/* server will set to valid size */
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	PR0("%s: vdc_send(status = %d)\n", __func__, status);
+
+	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
+		PR0("%s[%d] ldc_write failed: id(%lx) rv(%d) size (%d)\n",
+		    __func__, vdc->instance, vdc->ldc_handle,
+		    status, msglen);
+		if (msglen != sizeof (vd_attr_msg_t))
+			status = ENOMSG;
+	}
+
+	return (status);
+}
+
+static int
+vdc_init_dring_negotiate(vdc_t *vdc)
+{
+	vio_dring_reg_msg_t	pkt;
+	size_t			msglen = sizeof (pkt);
+	int			status = -1;
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	status = vdc_init_descriptor_ring(vdc);
+	PR0("%s[%d] Init of descriptor ring completed (status = %d)\n",
+	    __func__, vdc->instance, status);
+	if (status != 0) {
+		cmn_err(CE_CONT, "[%d] Failed to init DRing (status = %d)\n",
+		    vdc->instance, status);
+		vdc_reset_connection(vdc, B_FALSE);
+		return (status);
+	}
+
+	/* fill in tag */
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_DRING_REG;
+	pkt.tag.vio_sid = vdc->session_id;
+	/* fill in payload */
+	pkt.dring_ident = 0;
+	pkt.num_descriptors = VD_DRING_LEN;
+	pkt.descriptor_size = VD_DRING_ENTRY_SZ;
+	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
+	pkt.ncookies = vdc->dring_cookie_count;
+	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	if (status != 0) {
+		PR0("%s[%d] Failed to register DRing (status = %d)\n",
+		    __func__, vdc->instance, status);
+		vdc_reset_connection(vdc, B_FALSE);
+	}
+
+	return (status);
+}
+
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * LDC helper routines
+ */
+
+/*
+ * Function:
+ *	vdc_send()
+ *
+ * Description:
+ *	The function encapsulates the call to write a message using LDC.
+ *	If LDC indicates that the call failed due to the queue being full,
+ *	we retry the ldc_write() [ up to 'vdc_retries' time ], otherwise
+ *	we return the error returned by LDC.
+ * + * Arguments: + * ldc_handle - LDC handle for the channel this instance of vdc uses + * pkt - address of LDC message to be sent + * msglen - the size of the message being sent. When the function + * returns, this contains the number of bytes written. + * + * Return Code: + * 0 - Success. + * EINVAL - pkt or msglen were NULL + * ECONNRESET - The connection was not up. + * EWOULDBLOCK - LDC queue is full + * xxx - other error codes returned by ldc_write + */ +static int +vdc_send(ldc_handle_t ldc_handle, caddr_t pkt, size_t *msglen) +{ + size_t size = 0; + int retries = 0; + int status = 0; + + ASSERT(msglen != NULL); + ASSERT(*msglen != 0); + + do { + size = *msglen; + status = ldc_write(ldc_handle, pkt, &size); + } while (status == EWOULDBLOCK && retries++ < vdc_retries); + + /* return the last size written */ + *msglen = size; + + return (status); +} + +/* + * Function: + * vdc_get_ldc_id() + * + * Description: + * This function gets the 'ldc-id' for this particular instance of vdc. + * The id returned is the guest domain channel endpoint LDC uses for + * communication with vds. + * + * Arguments: + * dip - dev info pointer for this instance of the device driver. + * ldc_id - pointer to variable used to return the 'ldc-id' found. + * + * Return Code: + * 0 - Success. + * ENOENT - Expected node or property did not exist. 
+ * ENXIO - Unexpected error communicating with MD framework + */ +static int +vdc_get_ldc_id(dev_info_t *dip, uint64_t *ldc_id) +{ + int status = ENOENT; + char *node_name = NULL; + md_t *mdp = NULL; + int num_nodes; + int num_vdevs; + int num_chans; + mde_cookie_t rootnode; + mde_cookie_t *listp = NULL; + mde_cookie_t *chanp = NULL; + boolean_t found_inst = B_FALSE; + int listsz; + int idx; + uint64_t md_inst; + int obp_inst; + int instance = ddi_get_instance(dip); + + ASSERT(ldc_id != NULL); + *ldc_id = 0; + + /* + * Get the OBP instance number for comparison with the MD instance + * + * The "cfg-handle" property of a vdc node in an MD contains the MD's + * notion of "instance", or unique identifier, for that node; OBP + * stores the value of the "cfg-handle" MD property as the value of + * the "reg" property on the node in the device tree it builds from + * the MD and passes to Solaris. Thus, we look up the devinfo node's + * "reg" property value to uniquely identify this device instance. + * If the "reg" property cannot be found, the device tree state is + * presumably so broken that there is no point in continuing. + */ + if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) { + cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG); + return (ENOENT); + } + obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + OBP_REG, -1); + PR1("%s[%d]: OBP inst=%d\n", __func__, instance, obp_inst); + + /* + * We now walk the MD nodes and if an instance of a vdc node matches + * the instance got from OBP we get the ldc-id property. 
+ */ + if ((mdp = md_get_handle()) == NULL) { + cmn_err(CE_WARN, "unable to init machine description"); + return (ENXIO); + } + + num_nodes = md_node_count(mdp); + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + + /* allocate memory for nodes */ + listp = kmem_zalloc(listsz, KM_SLEEP); + chanp = kmem_zalloc(listsz, KM_SLEEP); + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + /* + * Search for all the virtual devices, we will then check to see which + * ones are disk nodes. + */ + num_vdevs = md_scan_dag(mdp, rootnode, + md_find_name(mdp, VDC_MD_VDEV_NAME), + md_find_name(mdp, "fwd"), listp); + + if (num_vdevs <= 0) { + cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME); + status = ENOENT; + goto done; + } + + PR1("%s[%d] num_vdevs=%d\n", __func__, instance, num_vdevs); + for (idx = 0; idx < num_vdevs; idx++) { + status = md_get_prop_str(mdp, listp[idx], "name", &node_name); + if ((status != 0) || (node_name == NULL)) { + cmn_err(CE_NOTE, "Unable to get name of node type '%s'" + ": err %d", VDC_MD_VDEV_NAME, status); + continue; + } + + PR1("%s[%d] Found node %s\n", __func__, instance, node_name); + if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) { + status = md_get_prop_val(mdp, listp[idx], + VDC_MD_CFG_HDL, &md_inst); + PR1("%s[%d] vdc inst# in MD=%d\n", + __func__, instance, md_inst); + if ((status == 0) && (md_inst == obp_inst)) { + found_inst = B_TRUE; + break; + } + } + } + + if (found_inst == B_FALSE) { + cmn_err(CE_NOTE, "Unable to find correct '%s' node", + VDC_MD_DISK_NAME); + status = ENOENT; + goto done; + } + PR0("%s[%d] MD inst=%d\n", __func__, instance, md_inst); + + /* get the channels for this node */ + num_chans = md_scan_dag(mdp, listp[idx], + md_find_name(mdp, VDC_MD_CHAN_NAME), + md_find_name(mdp, "fwd"), chanp); + + /* expecting at least one channel */ + if (num_chans <= 0) { + cmn_err(CE_NOTE, "No '%s' node for '%s' port", + VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME); + status = ENOENT; + 
goto done; + + } else if (num_chans != 1) { + PR0("%s[%d] Expected 1 '%s' node for '%s' port, found %d\n", + __func__, instance, VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME, + num_chans); + } + + /* + * We use the first channel found (index 0), irrespective of how + * many are there in total. + */ + if (md_get_prop_val(mdp, chanp[0], VDC_ID_PROP, ldc_id) != 0) { + cmn_err(CE_NOTE, "Channel '%s' property not found", + VDC_ID_PROP); + status = ENOENT; + } + + PR0("%s[%d] LDC id is 0x%lx\n", __func__, instance, *ldc_id); + +done: + if (chanp) + kmem_free(chanp, listsz); + if (listp) + kmem_free(listp, listsz); + + (void) md_fini_handle(mdp); + + return (status); +} + + +/* + * vdc_is_able_to_tx_data() + * + * Description: + * This function checks if we are able to send data to the + * vDisk server (vds). The LDC connection needs to be up and + * vdc & vds need to have completed the handshake negotiation. + * + * Parameters: + * vdc - soft state pointer + * flag - flag to indicate if we can block or not + * [ If O_NONBLOCK or O_NDELAY (which are defined in + * open(2)) are set then do not block) + * + * Return Values + * B_TRUE - can talk to vds + * B_FALSE - unable to talk to vds + */ +static boolean_t +vdc_is_able_to_tx_data(vdc_t *vdc, int flag) +{ + vd_state_t state; + uint32_t ldc_state; + uint_t retries = 0; + int rv = -1; + + ASSERT(vdc != NULL); + + mutex_enter(&vdc->lock); + state = vdc->state; + ldc_state = vdc->ldc_state; + mutex_exit(&vdc->lock); + + if ((state == VD_STATE_DATA) && (ldc_state == LDC_UP)) + return (B_TRUE); + + if ((flag & O_NONBLOCK) || (flag & O_NDELAY)) { + PR0("%s[%d] Not ready to tx - state %d LDC state %d\n", + __func__, vdc->instance, state, ldc_state); + return (B_FALSE); + } + + /* + * We want to check and see if any negotiations triggered earlier + * have succeeded. We are prepared to wait a little while in case + * they are still in progress. 
+ */ + mutex_enter(&vdc->lock); + while ((vdc->ldc_state != LDC_UP) || (vdc->state != VD_STATE_DATA)) { + PR0("%s: Waiting for connection at state %d (LDC state %d)\n", + __func__, vdc->state, vdc->ldc_state); + + rv = cv_timedwait(&vdc->cv, &vdc->lock, + VD_GET_TIMEOUT_HZ(retries)); + + /* + * An rv of -1 indicates that we timed out without the LDC + * state changing so it looks like the other side (vdc) is + * not yet ready/responding. + * + * Any other value of rv indicates that the LDC triggered an + * interrupt so we just loop again, check the handshake state + * and keep waiting if necessary. + */ + if (rv == -1) { + if (retries >= vdc_retries) { + PR0("%s[%d] handshake wait timed out.\n", + __func__, vdc->instance); + mutex_exit(&vdc->lock); + return (B_FALSE); + } else { + PR1("%s[%d] Retry #%d for handshake timedout\n", + __func__, vdc->instance, retries); + retries++; + } + } + } + + ASSERT(vdc->ldc_state == LDC_UP); + ASSERT(vdc->state == VD_STATE_DATA); + + mutex_exit(&vdc->lock); + + return (B_TRUE); +} + + +static void +vdc_terminate_ldc(vdc_t *vdc) +{ + int instance = ddi_get_instance(vdc->dip); + + ASSERT(vdc != NULL); + ASSERT(mutex_owned(&vdc->lock)); + + PR0("%s[%d] initialized=%x\n", __func__, instance, vdc->initialized); + + if (vdc->initialized & VDC_LDC_OPEN) { + PR0("%s[%d]: ldc_close()\n", __func__, instance); + (void) ldc_close(vdc->ldc_handle); + } + if (vdc->initialized & VDC_LDC_CB) { + PR0("%s[%d]: ldc_unreg_callback()\n", __func__, instance); + (void) ldc_unreg_callback(vdc->ldc_handle); + } + if (vdc->initialized & VDC_LDC) { + PR0("%s[%d]: ldc_fini()\n", __func__, instance); + (void) ldc_fini(vdc->ldc_handle); + vdc->ldc_handle = NULL; + } + + vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN); +} + +static void +vdc_reset_connection(vdc_t *vdc, boolean_t reset_ldc) +{ + int status; + + ASSERT(vdc != NULL); + ASSERT(mutex_owned(&vdc->lock)); + + PR0("%s[%d] Entered\n", __func__, vdc->instance); + + vdc->state = 
VD_STATE_INIT; + + if (reset_ldc == B_TRUE) { + status = ldc_reset(vdc->ldc_handle); + PR0("%s[%d] ldc_reset() = %d\n", + __func__, vdc->instance, status); + } + + vdc->initialized &= ~VDC_HANDSHAKE; + PR0("%s[%d] init=%x\n", __func__, vdc->instance, vdc->initialized); +} + +/* -------------------------------------------------------------------------- */ + +/* + * Descriptor Ring helper routines + */ + +static int +vdc_init_descriptor_ring(vdc_t *vdc) +{ + vd_dring_entry_t *dep = NULL; /* DRing Entry pointer */ + int status = -1; + int i; + + PR0("%s\n", __func__); + + ASSERT(vdc != NULL); + ASSERT(mutex_owned(&vdc->lock)); + ASSERT(vdc->ldc_handle != NULL); + + status = ldc_mem_dring_create(VD_DRING_LEN, VD_DRING_ENTRY_SZ, + &vdc->ldc_dring_hdl); + if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) { + PR0("%s: Failed to create a descriptor ring", __func__); + return (status); + } + vdc->initialized |= VDC_DRING; + vdc->dring_entry_size = VD_DRING_ENTRY_SZ; + vdc->dring_len = VD_DRING_LEN; + + vdc->dring_cookie = kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP); + + status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl, + LDC_SHADOW_MAP, LDC_MEM_RW, &vdc->dring_cookie[0], + &vdc->dring_cookie_count); + if (status != 0) { + PR0("%s: Failed to bind descriptor ring (%p) to channel (%p)\n", + __func__, vdc->ldc_dring_hdl, vdc->ldc_handle); + return (status); + } + ASSERT(vdc->dring_cookie_count == 1); + vdc->initialized |= VDC_DRING_BOUND; + + status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info); + if (status != 0) { + PR0("%s: Failed to get info for descriptor ring (%p)\n", + __func__, vdc->ldc_dring_hdl); + return (status); + } + + /* Allocate the local copy of this dring */ + vdc->local_dring = kmem_zalloc(VD_DRING_LEN * sizeof (vdc_local_desc_t), + KM_SLEEP); + vdc->initialized |= VDC_DRING_LOCAL; + + /* + * Mark all DRing entries as free and init priv desc memory handles + * If any entry is initialized, we need to free it later so we 
set
+	 * the bit in 'initialized' at the start.
+	 */
+	vdc->initialized |= VDC_DRING_ENTRY;
+	for (i = 0; i < VD_DRING_LEN; i++) {
+		dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
+		dep->hdr.dstate = VIO_DESC_FREE;
+
+		status = ldc_mem_alloc_handle(vdc->ldc_handle,
+		    &vdc->local_dring[i].desc_mhdl);
+		if (status != 0) {
+			cmn_err(CE_NOTE, "![%d] Failed to alloc mem handle for"
+			    " descriptor %d", vdc->instance, i);
+			return (status);
+		}
+		vdc->local_dring[i].flags = VIO_DESC_FREE;
+		vdc->local_dring[i].flags |= VDC_ALLOC_HANDLE;
+		vdc->local_dring[i].dep = dep;
+
+		mutex_init(&vdc->local_dring[i].lock, NULL, MUTEX_DRIVER, NULL);
+		cv_init(&vdc->local_dring[i].cv, NULL, CV_DRIVER, NULL);
+	}
+
+	/*
+	 * We init the index of the last DRing entry used. Since the code to
+	 * get the next available entry increments it before selecting one,
+	 * we set it to the last DRing entry so that it wraps around to zero
+	 * for the 1st entry to be used.
+	 */
+	vdc->dring_curr_idx = VD_DRING_LEN - 1;
+
+	return (status);
+}
+
+static void
+vdc_destroy_descriptor_ring(vdc_t *vdc)
+{
+	ldc_mem_handle_t	mhdl = NULL;
+	int			status = -1;
+	int			i;	/* loop */
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+	ASSERT(vdc->state == VD_STATE_INIT);
+
+	PR0("%s: Entered\n", __func__);
+
+	if (vdc->initialized & VDC_DRING_ENTRY) {
+		for (i = 0; i < VD_DRING_LEN; i++) {
+			mhdl = vdc->local_dring[i].desc_mhdl;
+
+			if (vdc->local_dring[i].flags & VDC_ALLOC_HANDLE)
+				(void) ldc_mem_free_handle(mhdl);
+
+			mutex_destroy(&vdc->local_dring[i].lock);
+			cv_destroy(&vdc->local_dring[i].cv);
+
+			bzero(&vdc->local_dring[i].desc_mhdl,
+			    sizeof (ldc_mem_handle_t));
+		}
+		vdc->initialized &= ~VDC_DRING_ENTRY;
+	}
+
+	if (vdc->initialized & VDC_DRING_LOCAL) {
+		kmem_free(vdc->local_dring,
+		    VD_DRING_LEN * sizeof (vdc_local_desc_t));
+		vdc->initialized &= ~VDC_DRING_LOCAL;
+	}
+
+	if (vdc->initialized & VDC_DRING_BOUND) {
+		status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl);
+		if (status == 0) {
+			vdc->initialized &= ~VDC_DRING_BOUND;
+		} else {
+			vdc_msg("%s: Failed to unbind Descriptor Ring (%lx)\n",
+			    __func__, vdc->ldc_dring_hdl);
+		}
+	}
+
+	if (vdc->initialized & VDC_DRING_INIT) { /* NOTE(review): init sets VDC_DRING; confirm flag */
+		status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl);
+		if (status == 0) {
+			vdc->ldc_dring_hdl = NULL;
+			bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
+			vdc->initialized &= ~VDC_DRING_INIT;
+		} else {
+			vdc_msg("%s: Failed to destroy Descriptor Ring (%lx)\n",
+			    __func__, vdc->ldc_dring_hdl);
+		}
+	}
+}
+
+/*
+ * vdc_get_next_dring_entry_idx()
+ *
+ * Description:
+ *	This function gets the index of the next Descriptor Ring entry available
+ *
+ * Return Value:
+ *	0 <= rv < VD_DRING_LEN		Next available slot
+ *	-1				DRing is full
+ */
+static int
+vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed)
+{
+	_NOTE(ARGUNUSED(num_slots_needed))
+
+	vd_dring_entry_t	*dep = NULL;	/* Dring Entry Pointer */
+	int			idx = -1;
+	int			start_idx = 0;
+
+	ASSERT(vdc != NULL);
+	ASSERT(vdc->dring_len == VD_DRING_LEN);
+	ASSERT(vdc->dring_curr_idx >= 0);
+	ASSERT(vdc->dring_curr_idx < VD_DRING_LEN);
+	ASSERT(mutex_owned(&vdc->dring_lock));
+
+	/* Start at the last entry used */
+	idx = start_idx = vdc->dring_curr_idx;
+
+	/*
+	 * Loop through Descriptor Ring checking for a free entry until we reach
+	 * the entry we started at. We should never come close to filling the
+	 * Ring at any stage, instead this is just to prevent an entry which
+	 * gets into an inconsistent state (e.g. due to a request timing out)
+	 * from blocking progress.
+ */ + do { + /* Get the next entry after the last known index tried */ + idx = (idx + 1) % VD_DRING_LEN; + + dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx); + ASSERT(dep != NULL); + + if (dep->hdr.dstate == VIO_DESC_FREE) { + ASSERT(idx >= 0); + ASSERT(idx < VD_DRING_LEN); + vdc->dring_curr_idx = idx; + return (idx); + + } else if (dep->hdr.dstate == VIO_DESC_READY) { + PR0("%s: Entry %d waiting to be accepted\n", + __func__, idx); + continue; + + } else if (dep->hdr.dstate == VIO_DESC_ACCEPTED) { + PR0("%s: Entry %d waiting to be processed\n", + __func__, idx); + continue; + + } else if (dep->hdr.dstate == VIO_DESC_DONE) { + PR0("%s: Entry %d done but not marked free\n", + __func__, idx); + + /* + * If we are currently panicking, interrupts are + * disabled and we will not be getting ACKs from the + * vDisk server so we mark the descriptor ring entries + * as FREE here instead of in the ACK handler. + */ + if (panicstr) { + (void) vdc_depopulate_descriptor(vdc, idx); + dep->hdr.dstate = VIO_DESC_FREE; + vdc->local_dring[idx].flags = VIO_DESC_FREE; + } + continue; + + } else { + vdc_msg("Public Descriptor Ring entry corrupted"); + mutex_enter(&vdc->lock); + vdc_reset_connection(vdc, B_TRUE); + mutex_exit(&vdc->lock); + return (-1); + } + + } while (idx != start_idx); + + return (-1); +} + +/* + * Function: + * vdc_populate_descriptor + * + * Description: + * This routine writes the data to be transmitted to vds into the + * descriptor, notifies vds that the ring has been updated and + * then waits for the request to be processed. + * + * Arguments: + * vdc - the soft state pointer + * addr - start address of memory region. + * nbytes - number of bytes to read/write + * operation - operation we want vds to perform (VD_OP_XXX) + * arg - parameter to be sent to server (depends on VD_OP_XXX type) + * . mode for ioctl(9e) + * . 
LP64 diskaddr_t (block I/O) + * slice - the disk slice this request is for + * + * Return Codes: + * 0 + * EAGAIN + * EFAULT + * ENXIO + * EIO + */ +static int +vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, + uint64_t arg, uint64_t slice) +{ + vdc_local_desc_t *local_dep = NULL; /* Local Dring Entry Pointer */ + vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ + int idx = 0; /* Index of DRing entry used */ + vio_dring_msg_t dmsg; + size_t msglen = sizeof (dmsg); + int status = 0; + int rv; + int retries = 0; + + ASSERT(vdc != NULL); + ASSERT(slice < V_NUMPAR); + + /* + * Get next available DRing entry. + */ + mutex_enter(&vdc->dring_lock); + idx = vdc_get_next_dring_entry_idx(vdc, 1); + if (idx == -1) { + mutex_exit(&vdc->dring_lock); + vdc_msg("%s[%d]: no descriptor ring entry avail, seq=%d\n", + __func__, vdc->instance, vdc->seq_num); + + /* + * Since strategy should not block we don't wait for the DRing + * to empty and instead return + */ + return (EAGAIN); + } + + ASSERT(idx < VD_DRING_LEN); + local_dep = &vdc->local_dring[idx]; + dep = local_dep->dep; + ASSERT(dep != NULL); + + /* + * Wait for anybody still using the DRing entry to finish. + * (e.g. 
still waiting for vds to respond to a request) + */ + mutex_enter(&local_dep->lock); + + switch (operation) { + case VD_OP_BREAD: + case VD_OP_BWRITE: + PR1("buf=%p, block=%lx, nbytes=%lx\n", addr, arg, nbytes); + dep->payload.addr = (diskaddr_t)arg; + rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation); + break; + + case VD_OP_FLUSH: + case VD_OP_GET_VTOC: + case VD_OP_SET_VTOC: + case VD_OP_GET_DISKGEOM: + case VD_OP_SET_DISKGEOM: + case VD_OP_SCSICMD: + if (nbytes > 0) { + rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, + operation); + } + break; + default: + cmn_err(CE_NOTE, "[%d] Unsupported vDisk operation [%d]\n", + vdc->instance, operation); + rv = EINVAL; + } + + if (rv != 0) { + mutex_exit(&local_dep->lock); + mutex_exit(&vdc->dring_lock); + return (rv); + } + + /* + * fill in the data details into the DRing + */ + dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdc); + dep->payload.operation = operation; + dep->payload.nbytes = nbytes; + dep->payload.status = EINPROGRESS; /* vds will set valid value */ + dep->payload.slice = slice; + dep->hdr.dstate = VIO_DESC_READY; + dep->hdr.ack = 1; /* request an ACK for every message */ + + local_dep->flags = VIO_DESC_READY; + local_dep->addr = addr; + + /* + * Send a msg with the DRing details to vds + */ + VIO_INIT_DRING_DATA_TAG(dmsg); + VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc); + dmsg.dring_ident = vdc->dring_ident; + dmsg.start_idx = idx; + dmsg.end_idx = idx; + + PR1("ident=0x%llx, st=%d, end=%d, seq=%d req=%d dep=%p\n", + vdc->dring_ident, dmsg.start_idx, dmsg.end_idx, + dmsg.seq_num, dep->payload.req_id, dep); + + status = vdc_send(vdc->ldc_handle, (caddr_t)&dmsg, &msglen); + PR1("%s[%d]: ldc_write() status=%d\n", __func__, vdc->instance, status); + if (status != 0) { + mutex_exit(&local_dep->lock); + mutex_exit(&vdc->dring_lock); + vdc_msg("%s: ldc_write(%d)\n", __func__, status); + return (EAGAIN); + } + + /* + * XXX - potential performance enhancement (Investigate at a later date) + * + * for calls 
from strategy(9E), instead of waiting for a response from + * vds, we could return at this stage and let the ACK handling code + * trigger the biodone(9F) + */ + + /* + * When a guest is panicking, the completion of requests needs to be + * handled differently because interrupts are disabled and vdc + * will not get messages. We have to poll for the messages instead. + */ + if (ddi_in_panic()) { + int start = 0; + retries = 0; + for (;;) { + msglen = sizeof (dmsg); + status = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg, + &msglen); + if (status) { + status = EINVAL; + break; + } + + /* + * if there are no packets wait and check again + */ + if ((status == 0) && (msglen == 0)) { + if (retries++ > vdc_dump_retries) { + PR0("[%d] Giving up waiting, idx %d\n", + vdc->instance, idx); + status = EAGAIN; + break; + } + + PR1("Waiting for next packet @ %d\n", idx); + delay(drv_usectohz(vdc_dump_usec_timeout)); + continue; + } + + /* + * Ignore all messages that are not ACKs/NACKs to + * DRing requests. + */ + if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) || + (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) { + PR0("discarding pkt: type=%d sub=%d env=%d\n", + dmsg.tag.vio_msgtype, + dmsg.tag.vio_subtype, + dmsg.tag.vio_subtype_env); + continue; + } + + /* + * set the appropriate return value for the + * current request. 
+ */ + switch (dmsg.tag.vio_subtype) { + case VIO_SUBTYPE_ACK: + status = 0; + break; + case VIO_SUBTYPE_NACK: + status = EAGAIN; + break; + default: + continue; + } + + start = dmsg.start_idx; + if (start >= VD_DRING_LEN) { + PR0("[%d] Bogus ack data : start %d\n", + vdc->instance, start); + continue; + } + + dep = VDC_GET_DRING_ENTRY_PTR(vdc, start); + + PR1("[%d] Dumping start=%d idx=%d state=%d\n", + vdc->instance, start, idx, dep->hdr.dstate); + + if (dep->hdr.dstate != VIO_DESC_DONE) { + PR0("[%d] Entry @ %d - state !DONE %d\n", + vdc->instance, start, dep->hdr.dstate); + continue; + } + + (void) vdc_depopulate_descriptor(vdc, start); + + /* + * We want to process all Dring entries up to + * the current one so that we can return an + * error with the correct request. + */ + if (idx > start) { + PR0("[%d] Looping: start %d, idx %d\n", + vdc->instance, idx, start); + continue; + } + + /* exit - all outstanding requests are completed */ + break; + } + + mutex_exit(&local_dep->lock); + mutex_exit(&vdc->dring_lock); + + return (status); + } + + /* + * Now watch the DRing entries we modified to get the response + * from vds. 
+ */ + status = vdc_wait_for_descriptor_update(vdc, idx, dmsg); + if (status == ETIMEDOUT) { + /* debug info when dumping state on vds side */ + dep->payload.status = ECANCELED; + } + + status = vdc_depopulate_descriptor(vdc, idx); + PR1("%s[%d] Status=%d\n", __func__, vdc->instance, status); + + mutex_exit(&local_dep->lock); + mutex_exit(&vdc->dring_lock); + + return (status); +} + +static int +vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) +{ + vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ + vdc_local_desc_t *local_dep = NULL; /* Local Dring Entry Pointer */ + size_t msglen = sizeof (dmsg); + int retries = 0; + int status = ENXIO; + int rv = 0; + + ASSERT(vdc != NULL); + ASSERT(idx < VD_DRING_LEN); + local_dep = &vdc->local_dring[idx]; + ASSERT(local_dep != NULL); + dep = local_dep->dep; + ASSERT(dep != NULL); + + while (dep->hdr.dstate != VIO_DESC_DONE) { + rv = cv_timedwait(&local_dep->cv, &local_dep->lock, + VD_GET_TIMEOUT_HZ(retries)); + if (rv == -1) { + /* + * If they persist in ignoring us we'll storm off in a + * huff and return ETIMEDOUT to the upper layers. + */ + if (retries >= vdc_retries) { + PR0("%s: Finished waiting on entry %d\n", + __func__, idx); + status = ETIMEDOUT; + break; + } else { + retries++; + PR0("%s[%d]: Timeout #%d on entry %d " + "[seq %d][req %d]\n", __func__, + vdc->instance, + retries, idx, dmsg.seq_num, + dep->payload.req_id); + } + + if (dep->hdr.dstate & VIO_DESC_ACCEPTED) { + PR0("%s[%d]: vds has accessed entry %d [seq %d]" + "[req %d] but not ack'ed it yet\n", + __func__, vdc->instance, idx, dmsg.seq_num, + dep->payload.req_id); + continue; + } + + /* + * we resend the message as it may have been dropped + * and have never made it to the other side (vds). 
+			 * (We reuse the original message but update seq ID)
+			 */
+			VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc);
+			retries = 0;
+			status = vdc_send(vdc->ldc_handle, (caddr_t)&dmsg,
+			    &msglen);
+			if (status != 0) {
+				vdc_msg("%s: Error (%d) while resending after "
+				    "timeout\n", __func__, status);
+				status = ETIMEDOUT;
+				break;
+			}
+		}
+	}
+
+	return (status);
+}
+
+static int
+vdc_get_response(vdc_t *vdc, int start, int end)
+{
+	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
+	vd_dring_entry_t	*dep = NULL;	/* Dring Entry Pointer */
+	int			status = ENXIO;
+	int			idx = -1;
+
+	ASSERT(vdc != NULL);
+	ASSERT(start >= 0);
+	ASSERT(start <= VD_DRING_LEN);
+	ASSERT(end >= -1);
+	ASSERT(end <= VD_DRING_LEN);
+
+	idx = start;
+	ldep = &vdc->local_dring[idx];
+	ASSERT(ldep != NULL);
+	dep = ldep->dep;
+	ASSERT(dep != NULL);
+
+	PR0("%s[%d] DRING entry=%d status=%d\n", __func__, vdc->instance,
+	    idx, VIO_GET_DESC_STATE(dep->hdr.dstate));
+	while (VIO_GET_DESC_STATE(dep->hdr.dstate) == VIO_DESC_DONE) {
+		if ((end != -1) && (idx > end))
+			return (0);
+
+		switch (ldep->operation) {
+		case VD_OP_BREAD:
+		case VD_OP_BWRITE:
+			/* call bioxxx */
+			break;
+		default:
+			/* signal waiter */
+			break;
+		}
+
+		/* Clear the DRing entry */
+		status = vdc_depopulate_descriptor(vdc, idx);
+		PR0("%s[%d] Status=%d\n", __func__, vdc->instance, status);
+
+		/* loop accounting to get next DRing entry */
+		idx++; /* NOTE(review): no bound check; idx may pass VD_DRING_LEN */
+		ldep = &vdc->local_dring[idx];
+		dep = ldep->dep;
+	}
+
+	return (status);
+}
+
+static int
+vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
+{
+	vd_dring_entry_t	*dep = NULL;	/* Dring Entry Pointer */
+	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
+	int		status = ENXIO;
+
+	ASSERT(vdc != NULL);
+	ASSERT(idx < VD_DRING_LEN);
+	ldep = &vdc->local_dring[idx];
+	ASSERT(ldep != NULL);
+	dep = ldep->dep;
+	ASSERT(dep != NULL);
+
+	status = dep->payload.status;
+	VDC_MARK_DRING_ENTRY_FREE(vdc, idx);
+	ldep = &vdc->local_dring[idx];
+	VIO_SET_DESC_STATE(ldep->flags, VIO_DESC_FREE);
+
+	/*
+	 * If the upper layer passed in a misaligned address we copied the
+	 * data into an aligned buffer before sending it to LDC - we now
+	 * copy it back to the original buffer.
+	 */
+	if (ldep->align_addr) {
+		ASSERT(ldep->addr != NULL);
+		ASSERT(dep->payload.nbytes > 0);
+
+		bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes);
+		kmem_free(ldep->align_addr,
+		    sizeof (caddr_t) * dep->payload.nbytes);
+		ldep->align_addr = NULL;
+	}
+
+	status = ldc_mem_unbind_handle(ldep->desc_mhdl);
+	if (status != 0) {
+		cmn_err(CE_NOTE, "[%d] unbind mem hdl 0x%lx @ idx %d failed:%d",
+		    vdc->instance, ldep->desc_mhdl, idx, status);
+	}
+
+	return (status);
+}
+
+static int
+vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes,
+    int operation)
+{
+	vd_dring_entry_t	*dep = NULL;
+	vdc_local_desc_t	*ldep = NULL;
+	ldc_mem_handle_t	mhdl;
+	caddr_t			vaddr;
+	int			perm = LDC_MEM_RW;
+	int			rv = 0;
+	int			i;
+
+	ASSERT(vdc != NULL);
+	ASSERT(idx < VD_DRING_LEN);
+
+	dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx);
+	ldep = &vdc->local_dring[idx];
+	mhdl = ldep->desc_mhdl;
+
+	switch (operation) {
+	case VD_OP_BREAD:
+		perm = LDC_MEM_W;
+		break;
+
+	case VD_OP_BWRITE:
+		perm = LDC_MEM_R;
+		break;
+
+	case VD_OP_FLUSH:
+	case VD_OP_GET_VTOC:
+	case VD_OP_SET_VTOC:
+	case VD_OP_GET_DISKGEOM:
+	case VD_OP_SET_DISKGEOM:
+	case VD_OP_SCSICMD:
+		perm = LDC_MEM_RW;
+		break;
+
+	default:
+		ASSERT(0);	/* catch bad programming in vdc */
+	}
+
+	/*
+	 * LDC expects any addresses passed in to be 8-byte aligned. We need
+	 * to copy the contents of any misaligned buffers to a newly allocated
+	 * buffer and bind it instead (and copy the contents back to the
+	 * original buffer passed in when depopulating the descriptor)
+	 */
+	vaddr = addr;
+	if (((uint64_t)addr & 0x7) != 0) {
+		ldep->align_addr =
+			kmem_zalloc(sizeof (caddr_t) * nbytes, KM_SLEEP);
+		PR0("%s[%d] Misaligned address %lx reallocating "
+		    "(buf=%lx entry=%d)\n",
+		    __func__, vdc->instance, addr, ldep->align_addr, idx);
+		bcopy(addr, ldep->align_addr, nbytes);
+		vaddr = ldep->align_addr;
+	}
+
+	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
+	    vdc->dring_mem_info.mtype, perm, &dep->payload.cookie[0],
+	    &dep->payload.ncookies);
+	PR1("%s[%d] bound mem handle; ncookies=%d\n",
+	    __func__, vdc->instance, dep->payload.ncookies);
+	if (rv != 0) {
+		vdc_msg("%s[%d] failed to ldc_mem_bind_handle "
+		    "(mhdl=%lx, buf=%lx entry=%d err=%d)\n",
+		    __func__, vdc->instance, mhdl, addr, idx, rv);
+		if (ldep->align_addr) {
+			kmem_free(ldep->align_addr,
+			    sizeof (caddr_t) * nbytes);
+			ldep->align_addr = NULL;
+		}
+		return (EAGAIN);
+	}
+
+	/*
+	 * Get the other cookies (if any).
+	 */
+	for (i = 1; i < dep->payload.ncookies; i++) {
+		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
+		if (rv != 0) {
+			(void) ldc_mem_unbind_handle(mhdl);
+			vdc_msg("%s: failed to get next cookie(mhdl=%lx "
+			    "cnum=%d), err=%d", __func__, mhdl, i, rv);
+			if (ldep->align_addr) {
+				kmem_free(ldep->align_addr,
+				    sizeof (caddr_t) * nbytes);
+				ldep->align_addr = NULL;
+			}
+			return (EAGAIN);
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * Interrupt handlers for messages from LDC
+ */
+
+static uint_t
+vdc_handle_cb(uint64_t event, caddr_t arg)
+{
+	ldc_status_t ldc_state;
+	int rv = 0;
+
+	vdc_t	*vdc = (vdc_t *)(void *)arg;
+
+	ASSERT(vdc != NULL);
+
+	PR1("%s[%d] event=%lx seqID=%d\n",
+	    __func__, vdc->instance, event, vdc->seq_num);
+
+	/*
+	 * Depending on the type of event that triggered this callback,
+	 * we modify the handshake state or read the data.
+	 *
+	 * NOTE: not done as a switch() as event could be triggered by
+	 * a state change and a read request. Also the ordering of the
+	 * check for the event types is deliberate.
+	 */
+	if (event & LDC_EVT_UP) {
+		PR0("%s[%d] Received LDC_EVT_UP\n", __func__, vdc->instance);
+
+		/* get LDC state */
+		rv = ldc_status(vdc->ldc_handle, &ldc_state);
+		if (rv != 0) {
+			cmn_err(CE_NOTE, "[%d] Couldn't get LDC status %d",
+			    vdc->instance, rv);
+			vdc_reset_connection(vdc, B_TRUE);
+			return (LDC_SUCCESS);
+		}
+
+		/*
+		 * Reset the transaction sequence numbers when LDC comes up.
+		 * We then kick off the handshake negotiation with the vDisk
+		 * server.
+ */ + mutex_enter(&vdc->lock); + vdc->seq_num = 0; + vdc->seq_num_reply = 0; + vdc->ldc_state = ldc_state; + ASSERT(ldc_state == LDC_UP); + mutex_exit(&vdc->lock); + + vdc_init_handshake_negotiation(vdc); + + ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); + } + + if (event & LDC_EVT_READ) { + /* + * Wake up the worker thread to process the message + */ + mutex_enter(&vdc->msg_proc_lock); + vdc->msg_pending = B_TRUE; + cv_signal(&vdc->msg_proc_cv); + mutex_exit(&vdc->msg_proc_lock); + + ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); + + /* that's all we have to do - no need to handle DOWN/RESET */ + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_RESET) { + PR0("%s[%d] Recvd LDC RESET event\n", __func__, vdc->instance); + } + + if (event & LDC_EVT_DOWN) { + PR0("%s[%d] Recvd LDC DOWN event\n", __func__, vdc->instance); + + /* get LDC state */ + rv = ldc_status(vdc->ldc_handle, &ldc_state); + if (rv != 0) { + cmn_err(CE_NOTE, "[%d] Couldn't get LDC status %d", + vdc->instance, rv); + ldc_state = LDC_OPEN; + } + mutex_enter(&vdc->lock); + vdc->ldc_state = ldc_state; + mutex_exit(&vdc->lock); + + vdc_reset_connection(vdc, B_TRUE); + } + + if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) + cmn_err(CE_NOTE, "![%d] Unexpected LDC event (%lx) received", + vdc->instance, event); + + return (LDC_SUCCESS); +} + +/* -------------------------------------------------------------------------- */ + +/* + * The following functions process the incoming messages from vds + */ + + +static void +vdc_process_msg_thread(vdc_t *vdc) +{ + int status = 0; + boolean_t q_is_empty = B_TRUE; + + ASSERT(vdc != NULL); + + mutex_enter(&vdc->msg_proc_lock); + PR0("%s[%d]: Starting\n", __func__, vdc->instance); + + vdc->msg_proc_thr_state = VDC_THR_RUNNING; + + while (vdc->msg_proc_thr_state == VDC_THR_RUNNING) { + + PR1("%s[%d] Waiting\n", __func__, vdc->instance); + while (vdc->msg_pending == B_FALSE) + cv_wait(&vdc->msg_proc_cv, &vdc->msg_proc_lock); + + 
PR1("%s[%d] Message Received\n", __func__, vdc->instance); + + /* check if there is data */ + status = ldc_chkq(vdc->ldc_handle, &q_is_empty); + if ((status != 0) && + (vdc->msg_proc_thr_state == VDC_THR_RUNNING)) { + cmn_err(CE_NOTE, "[%d] Unable to communicate with vDisk" + " server. Cannot check LDC queue: %d", + vdc->instance, status); + mutex_enter(&vdc->lock); + vdc_reset_connection(vdc, B_TRUE); + mutex_exit(&vdc->lock); + vdc->msg_proc_thr_state = VDC_THR_STOP; + continue; + } + + if (q_is_empty == B_FALSE) { + PR1("%s: new pkt(s) available\n", __func__); + vdc_process_msg(vdc); + } + + vdc->msg_pending = B_FALSE; + } + + PR0("Message processing thread stopped\n"); + vdc->msg_pending = B_FALSE; + vdc->msg_proc_thr_state = VDC_THR_DONE; + cv_signal(&vdc->msg_proc_cv); + mutex_exit(&vdc->msg_proc_lock); + thread_exit(); +} + + +/* + * Function: + * vdc_process_msg() + * + * Description: + * This function is called by the message processing thread each time it + * is triggered when LDC sends an interrupt to indicate that there are + * more packets on the queue. When it is called it will continue to loop + * and read the messages until there are no more left of the queue. If it + * encounters an invalid sized message it will drop it and check the next + * message. + * + * Arguments: + * arg - soft state pointer for this instance of the device driver. + * + * Return Code: + * None. 
+ */ +static void +vdc_process_msg(void *arg) +{ + vdc_t *vdc = (vdc_t *)(void *)arg; + vio_msg_t vio_msg; + size_t nbytes = sizeof (vio_msg); + int status; + + ASSERT(vdc != NULL); + + mutex_enter(&vdc->lock); + + PR1("%s\n", __func__); + + for (;;) { + + /* read all messages - until no more left */ + status = ldc_read(vdc->ldc_handle, (caddr_t)&vio_msg, &nbytes); + + if (status) { + vdc_msg("%s: ldc_read() failed = %d", __func__, status); + + /* if status is ECONNRESET --- reset vdc state */ + if (status == EIO || status == ECONNRESET) { + vdc_reset_connection(vdc, B_FALSE); + } + + mutex_exit(&vdc->lock); + return; + } + + if ((nbytes > 0) && (nbytes < sizeof (vio_msg_tag_t))) { + cmn_err(CE_CONT, "![%d] Expect %lu bytes; recv'd %lu\n", + vdc->instance, sizeof (vio_msg_tag_t), nbytes); + mutex_exit(&vdc->lock); + return; + } + + if (nbytes == 0) { + PR2("%s[%d]: ldc_read() done..\n", + __func__, vdc->instance); + mutex_exit(&vdc->lock); + return; + } + + PR1("%s[%d] (%x/%x/%x)\n", __func__, vdc->instance, + vio_msg.tag.vio_msgtype, + vio_msg.tag.vio_subtype, + vio_msg.tag.vio_subtype_env); + + /* + * Verify the Session ID of the message + * + * Every message after the Version has been negotiated should + * have the correct session ID set. 
+ */ + if ((vio_msg.tag.vio_sid != vdc->session_id) && + (vio_msg.tag.vio_subtype_env != VIO_VER_INFO)) { + PR0("%s: Incorrect SID 0x%x msg 0x%lx, expected 0x%x\n", + __func__, vio_msg.tag.vio_sid, &vio_msg, + vdc->session_id); + vdc_reset_connection(vdc, B_FALSE); + mutex_exit(&vdc->lock); + return; + } + + switch (vio_msg.tag.vio_msgtype) { + case VIO_TYPE_CTRL: + status = vdc_process_ctrl_msg(vdc, vio_msg); + break; + case VIO_TYPE_DATA: + status = vdc_process_data_msg(vdc, vio_msg); + break; + case VIO_TYPE_ERR: + status = vdc_process_err_msg(vdc, vio_msg); + break; + default: + PR1("%s", __func__); + status = EINVAL; + break; + } + + if (status != 0) { + PR0("%s[%d] Error (%d) occcurred processing msg\n", + __func__, vdc->instance, status); + vdc_reset_connection(vdc, B_FALSE); + } + } + _NOTE(NOTREACHED) +} + +/* + * Function: + * vdc_process_ctrl_msg() + * + * Description: + * This function is called by the message processing thread each time + * an LDC message with a msgtype of VIO_TYPE_CTRL is received. + * + * Arguments: + * vdc - soft state pointer for this instance of the device driver. + * msg - the LDC message sent by vds + * + * Return Codes: + * 0 - Success. + * EPROTO - A message was received which shouldn't have happened according + * to the protocol + * ENOTSUP - An action which is allowed according to the protocol but which + * isn't (or doesn't need to be) implemented yet. + * EINVAL - An invalid value was returned as part of a message. 
+ */ +static int +vdc_process_ctrl_msg(vdc_t *vdc, vio_msg_t msg) +{ + size_t msglen = sizeof (msg); + vd_attr_msg_t *attr_msg = NULL; + vio_dring_reg_msg_t *dring_msg = NULL; + int status = -1; + + ASSERT(msg.tag.vio_msgtype == VIO_TYPE_CTRL); + ASSERT(vdc != NULL); + ASSERT(mutex_owned(&vdc->lock)); + + /* Depending on which state we are in; process the message */ + switch (vdc->state) { + case VD_STATE_INIT: + if (msg.tag.vio_subtype_env != VIO_VER_INFO) { + status = EPROTO; + break; + } + + switch (msg.tag.vio_subtype) { + case VIO_SUBTYPE_ACK: + vdc->state = VD_STATE_VER; + status = vdc_init_attr_negotiation(vdc); + break; + case VIO_SUBTYPE_NACK: + /* + * For now there is only one version number so we + * cannot step back to an earlier version but in the + * future we may need to add further logic here + * to try negotiating an earlier version as the VIO + * design allow for it. + */ + + /* + * vds could not handle the version we sent so we just + * stop negotiating. + */ + status = EPROTO; + break; + + case VIO_SUBTYPE_INFO: + /* + * Handle the case where vds starts handshake + * (for now only vdc is the instigatior) + */ + status = ENOTSUP; + break; + + default: + status = ENOTSUP; + break; + } + break; + + case VD_STATE_VER: + if (msg.tag.vio_subtype_env != VIO_ATTR_INFO) { + status = EPROTO; + break; + } + + switch (msg.tag.vio_subtype) { + case VIO_SUBTYPE_ACK: + /* + * We now verify the attributes sent by vds. + */ + attr_msg = (vd_attr_msg_t *)&msg; + vdc->vdisk_size = attr_msg->vdisk_size; + vdc->vdisk_type = attr_msg->vdisk_type; + + if ((attr_msg->max_xfer_sz != vdc->max_xfer_sz) || + (attr_msg->vdisk_block_size != vdc->block_size)) { + /* + * Future support: step down to the block size + * and max transfer size suggested by the + * server. 
(If this value is less than 128K + * then multiple Dring entries per request + * would need to be implemented) + */ + cmn_err(CE_NOTE, "[%d] Couldn't process block " + "attrs from vds", vdc->instance); + status = EINVAL; + break; + } + + if ((attr_msg->xfer_mode != VIO_DRING_MODE) || + (attr_msg->vdisk_size > INT64_MAX) || + (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) { + vdc_msg("%s[%d] Couldn't process attrs " + "from vds", __func__, vdc->instance); + status = EINVAL; + break; + } + + vdc->state = VD_STATE_ATTR; + status = vdc_init_dring_negotiate(vdc); + break; + + case VIO_SUBTYPE_NACK: + /* + * vds could not handle the attributes we sent so we + * stop negotiating. + */ + status = EPROTO; + break; + + case VIO_SUBTYPE_INFO: + /* + * Handle the case where vds starts the handshake + * (for now; vdc is the only supported instigatior) + */ + status = ENOTSUP; + break; + + default: + status = ENOTSUP; + break; + } + break; + + + case VD_STATE_ATTR: + if (msg.tag.vio_subtype_env != VIO_DRING_REG) { + status = EPROTO; + break; + } + + switch (msg.tag.vio_subtype) { + case VIO_SUBTYPE_ACK: + /* Verify that we have sent all the descr. 
ring info */ + /* nop for now as we have just 1 dring */ + dring_msg = (vio_dring_reg_msg_t *)&msg; + + /* save the received dring_ident */ + vdc->dring_ident = dring_msg->dring_ident; + PR0("%s[%d] Received dring ident=0x%lx\n", + __func__, vdc->instance, vdc->dring_ident); + + /* + * Send an RDX message to vds to indicate we are ready + * to send data + */ + msg.tag.vio_msgtype = VIO_TYPE_CTRL; + msg.tag.vio_subtype = VIO_SUBTYPE_INFO; + msg.tag.vio_subtype_env = VIO_RDX; + msg.tag.vio_sid = vdc->session_id; + status = vdc_send(vdc->ldc_handle, (caddr_t)&msg, + &msglen); + if (status != 0) { + cmn_err(CE_NOTE, "[%d] Failed to send RDX" + " message (%d)", vdc->instance, status); + break; + } + + status = vdc_create_fake_geometry(vdc); + if (status != 0) { + cmn_err(CE_NOTE, "[%d] Failed to create disk " + "geometery(%d)", vdc->instance, status); + break; + } + + vdc->state = VD_STATE_RDX; + break; + + case VIO_SUBTYPE_NACK: + /* + * vds could not handle the DRing info we sent so we + * stop negotiating. + */ + cmn_err(CE_CONT, "server could not register DRing\n"); + vdc_reset_connection(vdc, B_FALSE); + vdc_destroy_descriptor_ring(vdc); + status = EPROTO; + break; + + case VIO_SUBTYPE_INFO: + /* + * Handle the case where vds starts handshake + * (for now only vdc is the instigatior) + */ + status = ENOTSUP; + break; + default: + status = ENOTSUP; + } + break; + + case VD_STATE_RDX: + if (msg.tag.vio_subtype_env != VIO_RDX) { + status = EPROTO; + break; + } + + PR0("%s: Received RDX - handshake successful\n", __func__); + + status = 0; + vdc->state = VD_STATE_DATA; + + cv_broadcast(&vdc->attach_cv); + break; + + default: + cmn_err(CE_NOTE, "[%d] unknown handshake negotiation state %d", + vdc->instance, vdc->state); + break; + } + + return (status); +} + + +/* + * Function: + * vdc_process_data_msg() + * + * Description: + * This function is called by the message processing thread each time it + * a message with a msgtype of VIO_TYPE_DATA is received. 
It will either + * be an ACK or NACK from vds[1] which vdc handles as follows. + * ACK - wake up the waiting thread + * NACK - resend any messages necessary + * + * [1] Although the message format allows it, vds should not send a + * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for + * some bizarre reason it does, vdc will reset the connection. + * + * Arguments: + * vdc - soft state pointer for this instance of the device driver. + * msg - the LDC message sent by vds + * + * Return Code: + * 0 - Success. + * > 0 - error value returned by LDC + */ +static int +vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) +{ + int status = 0; + vdc_local_desc_t *local_dep = NULL; + vio_dring_msg_t *dring_msg = NULL; + size_t msglen = sizeof (*dring_msg); + uint_t num_msgs; + uint_t start; + uint_t end; + uint_t i; + + ASSERT(msg.tag.vio_msgtype == VIO_TYPE_DATA); + ASSERT(vdc != NULL); + ASSERT(mutex_owned(&vdc->lock)); + + dring_msg = (vio_dring_msg_t *)&msg; + + /* + * Check to see if the message has bogus data + */ + start = dring_msg->start_idx; + end = dring_msg->end_idx; + if ((start >= VD_DRING_LEN) || (end >= VD_DRING_LEN)) { + vdc_msg("%s: Bogus ACK data : start %d, end %d\n", + __func__, start, end); + return (EPROTO); + } + + /* + * calculate the number of messages that vds ACK'ed + * + * Assumes, (like the rest of vdc) that there is a 1:1 mapping + * between requests and Dring entries. + */ + num_msgs = (end >= start) ? + (end - start + 1) : + (VD_DRING_LEN - start + end + 1); + + /* + * Verify that the sequence number is what vdc expects. 
+ */ + if (vdc_verify_seq_num(vdc, dring_msg, num_msgs) == B_FALSE) { + return (ENXIO); + } + + switch (msg.tag.vio_subtype) { + case VIO_SUBTYPE_ACK: + PR2("%s: DATA ACK\n", __func__); + + /* + * Wake the thread waiting for each DRing entry ACK'ed + */ + for (i = 0; i < num_msgs; i++) { + int idx = (start + i) % VD_DRING_LEN; + + local_dep = &vdc->local_dring[idx]; + mutex_enter(&local_dep->lock); + cv_signal(&local_dep->cv); + mutex_exit(&local_dep->lock); + } + break; + + case VIO_SUBTYPE_NACK: + PR0("%s: DATA NACK\n", __func__); + dring_msg = (vio_dring_msg_t *)&msg; + VDC_DUMP_DRING_MSG(dring_msg); + + /* Resend necessary messages */ + for (i = 0; i < num_msgs; i++) { + int idx = (start + i) % VD_DRING_LEN; + + local_dep = &vdc->local_dring[idx]; + ASSERT(local_dep != NULL); + mutex_enter(&local_dep->lock); + + if (local_dep->dep->hdr.dstate != VIO_DESC_READY) { + PR0("%s[%d]: Won't resend entry %d [flag=%d]\n", + __func__, vdc->instance, idx, + local_dep->dep->hdr.dstate); + mutex_exit(&local_dep->lock); + break; + } + + /* we'll reuse the message passed in */ + VIO_INIT_DRING_DATA_TAG(msg); + dring_msg->tag.vio_sid = vdc->session_id; + dring_msg->seq_num = ++(vdc->seq_num); + VDC_DUMP_DRING_MSG(dring_msg); + + status = vdc_send(vdc->ldc_handle, (caddr_t)&dring_msg, + &msglen); + PR1("%s: ldc_write() status=%d\n", __func__, status); + if (status != 0) { + vdc_msg("%s ldc_write(%d)\n", __func__, status); + mutex_exit(&local_dep->lock); + break; + } + + mutex_exit(&local_dep->lock); + } + break; + + case VIO_SUBTYPE_INFO: + default: + cmn_err(CE_NOTE, "[%d] Got an unexpected DATA msg [subtype %d]", + vdc->instance, msg.tag.vio_subtype); + break; + } + + return (status); +} + +/* + * Function: + * vdc_process_err_msg() + * + * NOTE: No error messages are used as part of the vDisk protocol + */ +static int +vdc_process_err_msg(vdc_t *vdc, vio_msg_t msg) +{ + _NOTE(ARGUNUSED(vdc)) + _NOTE(ARGUNUSED(msg)) + + int status = ENOTSUP; + + ASSERT(msg.tag.vio_msgtype == 
VIO_TYPE_ERR);
+	cmn_err(CE_NOTE, "[%d] Got an ERR msg", vdc->instance);
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_verify_seq_num()
+ *
+ * Description:
+ *	This function verifies that the sequence number sent back by vds with
+ *	the latest message correctly follows the last request processed.
+ *
+ * Arguments:
+ *	vdc		- soft state pointer for this instance of the driver.
+ *	dring_msg	- pointer to the LDC message sent by vds
+ *	num_msgs	- the number of requests being acknowledged
+ *
+ * Return Code:
+ *	B_TRUE	- Success.
+ *	B_FALSE	- The seq numbers are so out of sync, vdc cannot deal with them
+ */
+static boolean_t
+vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs)
+{
+	ASSERT(vdc != NULL);
+	ASSERT(dring_msg != NULL);
+
+	/*
+	 * Check to see if the messages were responded to in the correct
+	 * order by vds. There are 3 possible scenarios:
+	 *	- the seq_num we expected is returned (everything is OK)
+	 *	- a seq_num earlier than the last one acknowledged is returned,
+	 *	  if so something is seriously wrong so we reset the connection
+	 *	- a seq_num greater than what we expected is returned.
+	 */
+	if (dring_msg->seq_num != (vdc->seq_num_reply + num_msgs)) {
+		vdc_msg("%s[%d]: Bogus seq_num %d, expected %d\n",
+		    __func__, vdc->instance, dring_msg->seq_num,
+		    vdc->seq_num_reply + num_msgs);
+		if (dring_msg->seq_num < (vdc->seq_num_reply + num_msgs)) {
+			return (B_FALSE);
+		} else {
+			/*
+			 * vds has responded with a seq_num greater than what we
+			 * expected
+			 */
+			return (B_FALSE);
+		}
+	}
+	vdc->seq_num_reply += num_msgs;
+
+	return (B_TRUE);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * DKIO(7) support
+ *
+ * XXX FIXME - needs to be converted to use the structures defined in the
+ * latest VIO spec to communicate with the vDisk server. 
+ */
+
+typedef struct vdc_dk_arg {
+	struct dk_callback	dkc;
+	int			mode;
+	dev_t			dev;
+	vdc_t			*vdc;
+} vdc_dk_arg_t;
+
+/*
+ * Function:
+ * 	vdc_dkio_flush_cb()
+ *
+ * Description:
+ *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
+ *	by kernel code.
+ *
+ * Arguments:
+ *	arg	- a pointer to a vdc_dk_arg_t structure.
+ */
+void
+vdc_dkio_flush_cb(void *arg)
+{
+	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
+	struct dk_callback	*dkc = NULL;
+	vdc_t			*vdc = NULL;
+	int			rv;
+
+	if (dk_arg == NULL) {
+		vdc_msg("%s[?] DKIOCFLUSHWRITECACHE arg is NULL\n", __func__);
+		return;
+	}
+	dkc = &dk_arg->dkc;
+	vdc = dk_arg->vdc;
+	ASSERT(vdc != NULL);
+
+	rv = vdc_populate_descriptor(vdc, NULL, 0, VD_OP_FLUSH,
+	    dk_arg->mode, SDPART(getminor(dk_arg->dev)));
+	if (rv != 0) {
+		PR0("%s[%d] DKIOCFLUSHWRITECACHE failed : model %x\n",
+		    __func__, vdc->instance,
+		    ddi_model_convert_from(dk_arg->mode & FMODELS));
+		return;
+	}
+
+	/*
+	 * Trigger the call back to notify the caller that the ioctl call has
+	 * been completed.
+	 */
+	if ((dk_arg->mode & FKIOCTL) &&
+	    (dkc != NULL) &&
+	    (dkc->dkc_callback != NULL)) {
+		ASSERT(dkc->dkc_cookie != NULL);
+		(*dkc->dkc_callback)(dkc->dkc_cookie, ENOTSUP);
+	}
+
+	/* Indicate that one less DKIO write flush is outstanding */
+	mutex_enter(&vdc->lock);
+	vdc->dkio_flush_pending--;
+	ASSERT(vdc->dkio_flush_pending >= 0);
+	mutex_exit(&vdc->lock);
+}
+
+
+/*
+ * This structure is used in the DKIO(7I) array below.
+ */
+typedef struct vdc_dk_ioctl {
+	uint8_t		op;		/* VD_OP_XXX value */
+	int		cmd;		/* Solaris ioctl operation number */
+	uint8_t		copy;		/* copyin and/or copyout needed ? 
*/ + size_t nbytes; /* size of structure to be copied */ + size_t nbytes32; /* size of 32bit struct if different */ + /* to 64bit struct (zero otherwise) */ +} vdc_dk_ioctl_t; + +/* + * Subset of DKIO(7I) operations currently supported + */ +static vdc_dk_ioctl_t dk_ioctl[] = { + {VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0, + 0, 0}, + {VD_OP_GET_WCE, DKIOCGETWCE, 0, + 0, 0}, + {VD_OP_SET_WCE, DKIOCSETWCE, 0, + 0, 0}, + {VD_OP_GET_VTOC, DKIOCGVTOC, VD_COPYOUT, + sizeof (struct vtoc), sizeof (struct vtoc32)}, + {VD_OP_SET_VTOC, DKIOCSVTOC, VD_COPYIN, + sizeof (struct vtoc), sizeof (struct vtoc32)}, + {VD_OP_SET_DISKGEOM, DKIOCSGEOM, VD_COPYIN, + sizeof (struct dk_geom), 0}, + {VD_OP_GET_DISKGEOM, DKIOCGGEOM, VD_COPYOUT, + sizeof (struct dk_geom), 0}, + {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, VD_COPYOUT, + sizeof (struct dk_geom), 0}, + {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, VD_COPYOUT, + sizeof (struct dk_geom), 0}, + {VD_OP_SET_DISKGEOM, DKIOCSGEOM, VD_COPYOUT, + sizeof (struct dk_geom), 0}, + {VD_OP_SCSICMD, USCSICMD, VD_COPYIN|VD_COPYOUT, + sizeof (struct uscsi_cmd), sizeof (struct uscsi_cmd32)}, + {0, DKIOCINFO, VD_COPYOUT, + sizeof (struct dk_cinfo), 0}, + {0, DKIOCGMEDIAINFO, VD_COPYOUT, + sizeof (struct dk_minfo), 0}, + {0, DKIOCREMOVABLE, 0, + 0, 0}, + {0, CDROMREADOFFSET, 0, + 0, 0} +}; + +/* + * Function: + * vd_process_ioctl() + * + * Description: + * This routine is the driver entry point for handling user + * requests to get the device geometry. 
+ *
+ * Arguments:
+ *	dev	- the device number
+ *	cmd	- the operation [dkio(7I)] to be processed
+ *	arg	- pointer to user provided structure
+ *		  (contains data to be set or reference parameter for get)
+ *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
+ *	rvalp	- calling process return value, used in some ioctl calls
+ *		  (passed through to vds who fills in the value)
+ *
+ * Assumptions:
+ *	vds will make the ioctl calls in the 64 bit address space so vdc
+ *	will convert the data to/from 32 bit as necessary before doing
+ *	the copyin or copyout.
+ *
+ * Return Code:
+ *	0
+ *	EFAULT
+ *	ENXIO
+ *	EIO
+ *	ENOTSUP
+ */
+static int
+vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
+{
+	int		instance = SDUNIT(getminor(dev));
+	vdc_t		*vdc = NULL;
+	int		op = -1;		/* VD_OP_XXX value */
+	int		rv = -1;
+	int		idx = 0;		/* index into dk_ioctl[] */
+	size_t		len = 0;		/* #bytes to send to vds */
+	size_t		alloc_len = 0;		/* #bytes to allocate mem for */
+	size_t		copy_len = 0;		/* #bytes to copy in/out */
+	caddr_t		mem_p = NULL;
+	boolean_t	do_convert_32to64 = B_FALSE;
+	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
+
+	PR0("%s: Processing ioctl(%x) for dev %x : model %x\n",
+	    __func__, cmd, dev, ddi_model_convert_from(mode & FMODELS));
+
+	vdc = ddi_get_soft_state(vdc_state, instance);
+	if (vdc == NULL) {
+		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
+		    instance);
+		return (ENXIO);
+	}
+
+	/*
+	 * Check to see if we can communicate with the vDisk server
+	 */
+	rv = vdc_is_able_to_tx_data(vdc, O_NONBLOCK);
+	if (rv == B_FALSE) {
+		PR0("%s[%d] Not ready to transmit data\n", __func__, instance);
+		return (ENOLINK);
+	}
+
+	/*
+	 * Validate the ioctl operation to be performed.
+	 *
+	 * If we have looped through the array without finding a match then we
+	 * don't support this ioctl. 
+ */ + for (idx = 0; idx < nioctls; idx++) { + if (cmd == dk_ioctl[idx].cmd) + break; + } + + if (idx >= nioctls) { + PR0("%s[%d] Unsupported ioctl(%x)\n", + __func__, vdc->instance, cmd); + return (ENOTSUP); + } + + copy_len = len = dk_ioctl[idx].nbytes; + op = dk_ioctl[idx].op; + + /* + * Some ioctl operations have different sized structures for 32 bit + * and 64 bit. If the userland caller is 32 bit, we need to check + * to see if the operation is one of those special cases and + * flag that we need to convert to and/or from 32 bit since vds + * will make the call as 64 bit. + */ + if ((ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) && + (dk_ioctl[idx].nbytes != 0) && + (dk_ioctl[idx].nbytes32 != 0)) { + do_convert_32to64 = B_TRUE; + copy_len = dk_ioctl[idx].nbytes32; + } + + /* + * Deal with the ioctls which the server does not provide. + */ + switch (cmd) { + case CDROMREADOFFSET: + case DKIOCREMOVABLE: + return (ENOTTY); + + case DKIOCINFO: + { + struct dk_cinfo cinfo; + if (vdc->cinfo == NULL) + return (ENXIO); + + bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); + cinfo.dki_partition = SDPART(getminor(dev)); + + rv = ddi_copyout(&cinfo, (void *)arg, + sizeof (struct dk_cinfo), mode); + if (rv != 0) + return (EFAULT); + + return (0); + } + + case DKIOCGMEDIAINFO: + if (vdc->minfo == NULL) + return (ENXIO); + + rv = ddi_copyout(vdc->minfo, (void *)arg, + sizeof (struct dk_minfo), mode); + if (rv != 0) + return (EFAULT); + + return (0); + } + + /* catch programming error in vdc - should be a VD_OP_XXX ioctl */ + ASSERT(op != 0); + + /* LDC requires that the memory being mapped is 8-byte aligned */ + alloc_len = P2ROUNDUP(len, sizeof (uint64_t)); + PR1("%s[%d]: struct size %d alloc %d\n", + __func__, instance, len, alloc_len); + + if (alloc_len != 0) + mem_p = kmem_zalloc(alloc_len, KM_SLEEP); + + if (dk_ioctl[idx].copy & VD_COPYIN) { + if (arg == NULL) { + if (mem_p != NULL) + kmem_free(mem_p, alloc_len); + return (EINVAL); + } + + 
ASSERT(copy_len != 0); + + rv = ddi_copyin((void *)arg, mem_p, copy_len, mode); + if (rv != 0) { + if (mem_p != NULL) + kmem_free(mem_p, alloc_len); + return (EFAULT); + } + + /* + * some operations need the data to be converted from 32 bit + * to 64 bit structures so that vds can process them on the + * other side. + */ + if (do_convert_32to64) { + switch (cmd) { + case DKIOCSVTOC: + { + struct vtoc vt; + struct vtoc32 vt32; + + ASSERT(mem_p != NULL); + vt32 = *((struct vtoc32 *)(mem_p)); + + vtoc32tovtoc(vt32, vt); + bcopy(&vt, mem_p, len); + break; + } + + case USCSICMD: + { + struct uscsi_cmd scmd; + struct uscsi_cmd *uscmd = &scmd; + struct uscsi_cmd32 *uscmd32; + + ASSERT(mem_p != NULL); + uscmd32 = (struct uscsi_cmd32 *)mem_p; + + /* + * Convert the ILP32 uscsi data from the + * application to LP64 for internal use. + */ + uscsi_cmd32touscsi_cmd(uscmd32, uscmd); + bcopy(uscmd, mem_p, len); + break; + } + default: + break; + } + } + } + + /* + * handle the special case of DKIOCFLUSHWRITECACHE + */ + if (cmd == DKIOCFLUSHWRITECACHE) { + struct dk_callback *dkc = (struct dk_callback *)arg; + + PR0("%s[%d]: DKIOCFLUSHWRITECACHE\n", __func__, instance); + + /* no mem should have been allocated hence no need to free it */ + ASSERT(mem_p == NULL); + + /* + * If arg is NULL, we break here and the call operates + * synchronously; waiting for vds to return. + * + * i.e. after the request to vds returns successfully, + * all writes completed prior to the ioctl will have been + * flushed from the disk write cache to persistent media. + */ + if (dkc != NULL) { + vdc_dk_arg_t arg; + arg.mode = mode; + arg.dev = dev; + bcopy(dkc, &arg.dkc, sizeof (*dkc)); + + mutex_enter(&vdc->lock); + vdc->dkio_flush_pending++; + arg.vdc = vdc; + mutex_exit(&vdc->lock); + + /* put the request on a task queue */ + rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb, + (void *)&arg, DDI_SLEEP); + + return (rv == NULL ? 
ENOMEM : 0); + } + } + + /* + * send request to vds to service the ioctl. + */ + rv = vdc_populate_descriptor(vdc, mem_p, alloc_len, op, mode, + SDPART((getminor(dev)))); + if (rv != 0) { + /* + * This is not necessarily an error. The ioctl could + * be returning a value such as ENOTTY to indicate + * that the ioctl is not applicable. + */ + PR0("%s[%d]: vds returned %d for ioctl 0x%x\n", + __func__, instance, rv, cmd); + if (mem_p != NULL) + kmem_free(mem_p, alloc_len); + return (rv); + } + + /* + * If the VTOC has been changed, then vdc needs to update the copy + * it saved in the soft state structure and try and update the device + * node properties. Failing to set the properties should not cause + * an error to be return the caller though. + */ + if (cmd == DKIOCSVTOC) { + bcopy(mem_p, vdc->vtoc, sizeof (struct vtoc)); + if (vdc_create_device_nodes_props(vdc)) { + cmn_err(CE_NOTE, "![%d] Failed to update device nodes" + " properties", instance); + } + } + + /* + * if we don't have to do a copyout, we have nothing left to do + * so we just return. + */ + if ((dk_ioctl[idx].copy & VD_COPYOUT) == 0) { + if (mem_p != NULL) + kmem_free(mem_p, alloc_len); + return (0); + } + + /* sanity check */ + if (mem_p == NULL) + return (EFAULT); + + + /* + * some operations need the data to be converted from 64 bit + * back to 32 bit structures after vds has processed them. 
+ */ + if (do_convert_32to64) { + switch (cmd) { + case DKIOCGVTOC: + { + struct vtoc vt; + struct vtoc32 vt32; + + ASSERT(mem_p != NULL); + vt = *((struct vtoc *)(mem_p)); + + vtoctovtoc32(vt, vt32); + bcopy(&vt32, mem_p, copy_len); + break; + } + + case USCSICMD: + { + struct uscsi_cmd32 *uc32; + struct uscsi_cmd *uc; + + len = sizeof (struct uscsi_cmd32); + + ASSERT(mem_p != NULL); + uc = (struct uscsi_cmd *)mem_p; + uc32 = kmem_zalloc(len, KM_SLEEP); + + uscsi_cmdtouscsi_cmd32(uc, uc32); + bcopy(uc32, mem_p, copy_len); + PR0("%s[%d]: uscsi_cmd32:%x\n", __func__, instance, + ((struct uscsi_cmd32 *)mem_p)->uscsi_cdblen); + kmem_free(uc32, len); + break; + } + default: + PR1("%s[%d]: This mode (%x) should just work for(%x)\n", + __func__, instance, mode, cmd); + break; + } + } + + ASSERT(len != 0); + ASSERT(mem_p != NULL); + + rv = ddi_copyout(mem_p, (void *)arg, copy_len, mode); + if (rv != 0) { + vdc_msg("%s[%d]: Could not do copy out for ioctl (%x)\n", + __func__, instance, cmd); + rv = EFAULT; + } + + if (mem_p != NULL) + kmem_free(mem_p, alloc_len); + + return (rv); +} + +/* + * Function: + * vdc_create_fake_geometry() + * + * Description: + * This routine fakes up the disk info needed for some DKIO ioctls. + * - DKIOCINFO + * - DKIOCGMEDIAINFO + * + * [ just like lofi(7D) and ramdisk(7D) ] + * + * Arguments: + * vdc - soft state pointer for this instance of the device driver. 
+ * + * Return Code: + * 0 - Success + */ +static int +vdc_create_fake_geometry(vdc_t *vdc) +{ + ASSERT(vdc != NULL); + + /* + * DKIOCINFO support + */ + vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP); + + (void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME); + (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); + vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz / vdc->block_size; + vdc->cinfo->dki_ctype = DKC_SCSI_CCS; + vdc->cinfo->dki_flags = DKI_FMTVOL; + vdc->cinfo->dki_cnum = 0; + vdc->cinfo->dki_addr = 0; + vdc->cinfo->dki_space = 0; + vdc->cinfo->dki_prio = 0; + vdc->cinfo->dki_vec = 0; + vdc->cinfo->dki_unit = vdc->instance; + vdc->cinfo->dki_slave = 0; + /* + * The partition number will be created on the fly depending on the + * actual slice (i.e. minor node) that is used to request the data. + */ + vdc->cinfo->dki_partition = 0; + + /* + * DKIOCGMEDIAINFO support + */ + vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP); + vdc->minfo->dki_media_type = DK_FIXED_DISK; + vdc->minfo->dki_capacity = 1; + vdc->minfo->dki_lbsize = DEV_BSIZE; + + return (0); +} diff --git a/usr/src/uts/sun4v/io/vds.c b/usr/src/uts/sun4v/io/vds.c new file mode 100644 index 0000000000..0495ef2de3 --- /dev/null +++ b/usr/src/uts/sun4v/io/vds.c @@ -0,0 +1,2013 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Virtual disk server + */ + + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/dkio.h> +#include <sys/file.h> +#include <sys/mdeg.h> +#include <sys/modhash.h> +#include <sys/note.h> +#include <sys/pathname.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/vio_common.h> +#include <sys/vdsk_mailbox.h> +#include <sys/vdsk_common.h> +#include <sys/vtoc.h> +#include <sys/scsi/impl/uscsi.h> + + +/* Virtual disk server initialization flags */ +#define VDS_LOCKING 0x01 +#define VDS_LDI 0x02 +#define VDS_MDEG 0x04 + +/* Virtual disk server tunable parameters */ +#define VDS_LDC_RETRIES 3 +#define VDS_NCHAINS 32 + +/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */ +#define VDS_NAME "virtual-disk-server" + +#define VD_NAME "vd" +#define VD_VOLUME_NAME "vdisk" +#define VD_ASCIILABEL "Virtual Disk" + +#define VD_CHANNEL_ENDPOINT "channel-endpoint" +#define VD_ID_PROP "id" +#define VD_BLOCK_DEVICE_PROP "vds-block-device" + +/* Virtual disk initialization flags */ +#define VD_LOCKING 0x01 +#define VD_TASKQ 0x02 +#define VD_LDC 0x04 +#define VD_DRING 0x08 +#define VD_SID 0x10 +#define VD_SEQ_NUM 0x20 + +/* Flags for opening/closing backing devices via LDI */ +#define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) + +/* + * By Solaris convention, slice/partition 2 represents the entire disk; + * unfortunately, this convention does not appear to be codified. 
+ */ +#define VD_ENTIRE_DISK_SLICE 2 + +/* Return a cpp token as a string */ +#define STRINGIZE(token) #token + +/* + * Print a message prefixed with the current function name to the message log + * (and optionally to the console for verbose boots); these macros use cpp's + * concatenation of string literals and C99 variable-length-argument-list + * macros + */ +#define PRN(...) _PRN("?%s(): "__VA_ARGS__, "") +#define _PRN(format, ...) \ + cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__) + +/* Return a pointer to the "i"th vdisk dring element */ +#define VD_DRING_ELEM(i) ((vd_dring_entry_t *)(void *) \ + (vd->dring + (i)*vd->descriptor_size)) + +/* Return the virtual disk client's type as a string (for use in messages) */ +#define VD_CLIENT(vd) \ + (((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" : \ + (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" : \ + (((vd)->xfer_mode == 0) ? "null client" : \ + "unsupported client"))) + +/* Debugging macros */ +#ifdef DEBUG +#define PR0 if (vd_msglevel > 0) PRN +#define PR1 if (vd_msglevel > 1) PRN +#define PR2 if (vd_msglevel > 2) PRN + +#define VD_DUMP_DRING_ELEM(elem) \ + PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n", \ + elem->hdr.dstate, \ + elem->payload.operation, \ + elem->payload.status, \ + elem->payload.nbytes, \ + elem->payload.addr, \ + elem->payload.ncookies); + +#else /* !DEBUG */ +#define PR0(...) +#define PR1(...) +#define PR2(...) 
+ +#define VD_DUMP_DRING_ELEM(elem) + +#endif /* DEBUG */ + + +typedef struct vds { + uint_t initialized; /* driver inst initialization flags */ + dev_info_t *dip; /* driver inst devinfo pointer */ + kmutex_t lock; /* lock for this structure */ + ldi_ident_t ldi_ident; /* driver's identifier for LDI */ + mod_hash_t *vd_table; /* table of virtual disks served */ + mdeg_handle_t mdeg; /* handle for MDEG operations */ +} vds_t; + +typedef struct vd { + uint_t initialized; /* vdisk initialization flags */ + kmutex_t lock; /* lock for this structure */ + vds_t *vds; /* server for this vdisk */ + ddi_taskq_t *taskq; /* taskq for this vdisk */ + ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ + dev_t dev[V_NUMPAR]; /* dev numbers for slices */ + uint_t nslices; /* number for slices */ + size_t vdisk_size; /* number of blocks in vdisk */ + vd_disk_type_t vdisk_type; /* slice or entire disk */ + boolean_t pseudo; /* underlying pseudo dev */ + struct dk_geom dk_geom; /* synthetic for slice type */ + struct vtoc vtoc; /* synthetic for slice type */ + ldc_status_t ldc_state; /* LDC connection state */ + ldc_handle_t ldc_handle; /* handle for LDC comm */ + size_t max_msglen; /* largest LDC message len */ + boolean_t enabled; /* whether vdisk is enabled */ + vd_state_t state; /* client handshake state */ + uint8_t xfer_mode; /* transfer mode with client */ + uint32_t sid; /* client's session ID */ + uint64_t seq_num; /* message sequence number */ + uint64_t dring_ident; /* identifier of dring */ + ldc_dring_handle_t dring_handle; /* handle for dring ops */ + uint32_t descriptor_size; /* num bytes in desc */ + uint32_t dring_len; /* number of dring elements */ + caddr_t dring; /* address of dring */ +} vd_t; + +typedef struct vds_operation { + uint8_t operation; + int (*function)(vd_t *vd, vd_dring_payload_t *request); +} vds_operation_t; + +typedef struct ioctl { + uint8_t operation; + const char *operation_name; + int cmd; + const char *cmd_name; + uint_t copy; + 
size_t nbytes; +} ioctl_t; + + +static int vds_ldc_retries = VDS_LDC_RETRIES; +static void *vds_state; +static uint64_t vds_operations; /* see vds_operation[] definition below */ + +static int vd_open_flags = VD_OPEN_FLAGS; + +#ifdef DEBUG +static int vd_msglevel; +#endif /* DEBUG */ + + +static int +vd_bread(vd_t *vd, vd_dring_payload_t *request) +{ + int status; + struct buf buf; + + PR1("Read %lu bytes at block %lu", request->nbytes, request->addr); + if (request->nbytes == 0) + return (EINVAL); /* no service for trivial requests */ + ASSERT(mutex_owned(&vd->lock)); + ASSERT(request->slice < vd->nslices); + + bioinit(&buf); + buf.b_flags = B_BUSY | B_READ; + buf.b_bcount = request->nbytes; + buf.b_un.b_addr = kmem_alloc(buf.b_bcount, KM_SLEEP); + buf.b_lblkno = request->addr; + buf.b_edev = vd->dev[request->slice]; + + if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0) + status = biowait(&buf); + biofini(&buf); + if ((status == 0) && + ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0, + &request->nbytes, request->cookie, request->ncookies, + LDC_COPY_OUT)) != 0)) { + PRN("ldc_mem_copy() returned errno %d copying to client", + status); + } + kmem_free(buf.b_un.b_addr, buf.b_bcount); /* nbytes can change */ + return (status); +} + +static int +vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes, + ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data) +{ + int status; + struct buf buf; + + ASSERT(mutex_owned(&vd->lock)); + ASSERT(slice < vd->nslices); + ASSERT(nbytes != 0); + ASSERT(data != NULL); + + /* Get data from client */ + if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes, + cookie, ncookies, LDC_COPY_IN)) != 0) { + PRN("ldc_mem_copy() returned errno %d copying from client", + status); + return (status); + } + + bioinit(&buf); + buf.b_flags = B_BUSY | B_WRITE; + buf.b_bcount = nbytes; + buf.b_un.b_addr = data; + buf.b_lblkno = block; + buf.b_edev = vd->dev[slice]; + + if ((status = 
ldi_strategy(vd->ldi_handle[slice], &buf)) == 0) + status = biowait(&buf); + biofini(&buf); + return (status); +} + +static int +vd_bwrite(vd_t *vd, vd_dring_payload_t *request) +{ + int status; + caddr_t data; + + + PR1("Write %ld bytes at block %lu", request->nbytes, request->addr); + if (request->nbytes == 0) + return (EINVAL); /* no service for trivial requests */ + data = kmem_alloc(request->nbytes, KM_SLEEP); + status = vd_do_bwrite(vd, request->slice, request->addr, + request->nbytes, request->cookie, request->ncookies, data); + kmem_free(data, request->nbytes); + return (status); +} + +static int +vd_do_slice_ioctl(vd_t *vd, int cmd, void *buf) +{ + switch (cmd) { + case DKIOCGGEOM: + ASSERT(buf != NULL); + bcopy(&vd->dk_geom, buf, sizeof (vd->dk_geom)); + return (0); + case DKIOCGVTOC: + ASSERT(buf != NULL); + bcopy(&vd->vtoc, buf, sizeof (vd->vtoc)); + return (0); + default: + return (ENOTSUP); + } +} + +static int +vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, ioctl_t *ioctl) +{ + int rval = 0, status; + size_t nbytes = request->nbytes; /* modifiable copy */ + + + ASSERT(mutex_owned(&vd->lock)); + ASSERT(request->slice < vd->nslices); + PR0("Performing %s", ioctl->operation_name); + + /* Get data from client, if necessary */ + if (ioctl->copy & VD_COPYIN) { + ASSERT(nbytes != 0 && buf != NULL); + PR1("Getting \"arg\" data from client"); + if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, + request->cookie, request->ncookies, + LDC_COPY_IN)) != 0) { + PRN("ldc_mem_copy() returned errno %d " + "copying from client", status); + return (status); + } + } + + /* + * Handle single-slice block devices internally; otherwise, have the + * real driver perform the ioctl() + */ + if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) { + if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, buf)) != 0) + return (status); + } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], + ioctl->cmd, (intptr_t)buf, FKIOCTL, kcred, &rval)) != 0) { 
+ PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); + return (status); + } +#ifdef DEBUG + if (rval != 0) { + PRN("%s set rval = %d, which is not being returned to client", + ioctl->cmd_name, rval); + } +#endif /* DEBUG */ + + /* Send data to client, if necessary */ + if (ioctl->copy & VD_COPYOUT) { + ASSERT(nbytes != 0 && buf != NULL); + PR1("Sending \"arg\" data to client"); + if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, + request->cookie, request->ncookies, + LDC_COPY_OUT)) != 0) { + PRN("ldc_mem_copy() returned errno %d " + "copying to client", status); + return (status); + } + } + + return (status); +} + +#define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) +static int +vd_ioctl(vd_t *vd, vd_dring_payload_t *request) +{ + static ioctl_t ioctl[] = { + /* Command (no-copy) operations */ + {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), DKIOCFLUSHWRITECACHE, + STRINGIZE(DKIOCFLUSHWRITECACHE), 0, 0}, + + /* "Get" (copy-out) operations */ + {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), DKIOCGETWCE, + STRINGIZE(DKIOCGETWCE), VD_COPYOUT, RNDSIZE(int)}, + {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), DKIOCGGEOM, + STRINGIZE(DKIOCGGEOM), VD_COPYOUT, RNDSIZE(struct dk_geom)}, + {VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), DKIOCGVTOC, + STRINGIZE(DKIOCGVTOC), VD_COPYOUT, RNDSIZE(struct vtoc)}, + + /* "Set" (copy-in) operations */ + {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), DKIOCSETWCE, + STRINGIZE(DKIOCSETWCE), VD_COPYOUT, RNDSIZE(int)}, + {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), DKIOCSGEOM, + STRINGIZE(DKIOCSGEOM), VD_COPYIN, RNDSIZE(struct dk_geom)}, + {VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), DKIOCSVTOC, + STRINGIZE(DKIOCSVTOC), VD_COPYIN, RNDSIZE(struct vtoc)}, + + /* "Get/set" (copy-in/copy-out) operations */ + {VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), USCSICMD, + STRINGIZE(USCSICMD), VD_COPYIN|VD_COPYOUT, + RNDSIZE(struct uscsi_cmd)} + + }; + int i, status; + void *buf = NULL; + size_t nioctls = (sizeof (ioctl))/(sizeof 
(ioctl[0])); + + + ASSERT(mutex_owned(&vd->lock)); + ASSERT(request->slice < vd->nslices); + + /* + * Determine ioctl corresponding to caller's "operation" and + * validate caller's "nbytes" + */ + for (i = 0; i < nioctls; i++) { + if (request->operation == ioctl[i].operation) { + if (request->nbytes > ioctl[i].nbytes) { + PRN("%s: Expected <= %lu \"nbytes\", " + "got %lu", ioctl[i].operation_name, + ioctl[i].nbytes, request->nbytes); + return (EINVAL); + } else if ((request->nbytes % sizeof (uint64_t)) != 0) { + PRN("%s: nbytes = %lu not a multiple of %lu", + ioctl[i].operation_name, request->nbytes, + sizeof (uint64_t)); + return (EINVAL); + } + + break; + } + } + ASSERT(i < nioctls); /* because "operation" already validated */ + + if (request->nbytes) + buf = kmem_zalloc(request->nbytes, KM_SLEEP); + status = vd_do_ioctl(vd, request, buf, &ioctl[i]); + if (request->nbytes) + kmem_free(buf, request->nbytes); + return (status); +} + +/* + * Define the supported operations once the functions for performing them have + * been defined + */ +static const vds_operation_t vds_operation[] = { + {VD_OP_BREAD, vd_bread}, + {VD_OP_BWRITE, vd_bwrite}, + {VD_OP_FLUSH, vd_ioctl}, + {VD_OP_GET_WCE, vd_ioctl}, + {VD_OP_SET_WCE, vd_ioctl}, + {VD_OP_GET_VTOC, vd_ioctl}, + {VD_OP_SET_VTOC, vd_ioctl}, + {VD_OP_GET_DISKGEOM, vd_ioctl}, + {VD_OP_SET_DISKGEOM, vd_ioctl}, + {VD_OP_SCSICMD, vd_ioctl} +}; + +static const size_t vds_noperations = + (sizeof (vds_operation))/(sizeof (vds_operation[0])); + +/* + * Process a request using a defined operation + */ +static int +vd_process_request(vd_t *vd, vd_dring_payload_t *request) +{ + int i; + + + PR1("Entered"); + ASSERT(mutex_owned(&vd->lock)); + + /* Range-check slice */ + if (request->slice >= vd->nslices) { + PRN("Invalid \"slice\" %u (max %u) for virtual disk", + request->slice, (vd->nslices - 1)); + return (EINVAL); + } + + /* Perform the requested operation */ + for (i = 0; i < vds_noperations; i++) + if (request->operation == 
vds_operation[i].operation) + return (vds_operation[i].function(vd, request)); + + /* No matching operation found */ + PRN("Unsupported operation %u", request->operation); + return (ENOTSUP); +} + +static int +send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) +{ + int retry, status; + size_t nbytes; + + + for (retry = 0, status = EWOULDBLOCK; + retry < vds_ldc_retries && status == EWOULDBLOCK; + retry++) { + PR1("ldc_write() attempt %d", (retry + 1)); + nbytes = msglen; + status = ldc_write(ldc_handle, msg, &nbytes); + } + + if (status != 0) { + PRN("ldc_write() returned errno %d", status); + return (status); + } else if (nbytes != msglen) { + PRN("ldc_write() performed only partial write"); + return (EIO); + } + + PR1("SENT %lu bytes", msglen); + return (0); +} + +/* + * Return 1 if the "type", "subtype", and "env" fields of the "tag" first + * argument match the corresponding remaining arguments; otherwise, return 0 + */ +int +vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env) +{ + return ((tag->vio_msgtype == type) && + (tag->vio_subtype == subtype) && + (tag->vio_subtype_env == env)) ? 
1 : 0; +} + +static int +process_ver_msg(vio_msg_t *msg, size_t msglen) +{ + vio_ver_msg_t *ver_msg = (vio_ver_msg_t *)msg; + + + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, + VIO_VER_INFO)) { + return (ENOMSG); /* not a version message */ + } + + if (msglen != sizeof (*ver_msg)) { + PRN("Expected %lu-byte version message; " + "received %lu bytes", sizeof (*ver_msg), msglen); + return (EBADMSG); + } + + if (ver_msg->dev_class != VDEV_DISK) { + PRN("Expected device class %u (disk); received %u", + VDEV_DISK, ver_msg->dev_class); + return (EBADMSG); + } + + if ((ver_msg->ver_major != VD_VER_MAJOR) || + (ver_msg->ver_minor != VD_VER_MINOR)) { + /* Unsupported version; send back supported version */ + ver_msg->ver_major = VD_VER_MAJOR; + ver_msg->ver_minor = VD_VER_MINOR; + return (EBADMSG); + } + + /* Valid message, version accepted */ + ver_msg->dev_class = VDEV_DISK_SERVER; + return (0); +} + +static int +vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; + + + PR0("Entered"); + ASSERT(mutex_owned(&vd->lock)); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, + VIO_ATTR_INFO)) { + return (ENOMSG); /* not an attribute message */ + } + + if (msglen != sizeof (*attr_msg)) { + PRN("Expected %lu-byte attribute message; " + "received %lu bytes", sizeof (*attr_msg), msglen); + return (EBADMSG); + } + + if (attr_msg->max_xfer_sz == 0) { + PRN("Received maximum transfer size of 0 from client"); + return (EBADMSG); + } + + if ((attr_msg->xfer_mode != VIO_DESC_MODE) && + (attr_msg->xfer_mode != VIO_DRING_MODE)) { + PRN("Client requested unsupported transfer mode"); + return (EBADMSG); + } + + + /* Success: valid message and transfer mode */ + vd->xfer_mode = attr_msg->xfer_mode; + if (vd->xfer_mode == VIO_DESC_MODE) { + /* + * The vd_dring_inband_msg_t contains one cookie; need room + * for up to n-1 more 
cookies, where "n" is the number of full + * pages plus possibly one partial page required to cover + * "max_xfer_sz". Add room for one more cookie if + * "max_xfer_sz" isn't an integral multiple of the page size. + * Must first get the maximum transfer size in bytes. + */ +#if 1 /* NEWOBP */ + size_t max_xfer_bytes = attr_msg->vdisk_block_size ? + attr_msg->vdisk_block_size*attr_msg->max_xfer_sz : + attr_msg->max_xfer_sz; + size_t max_inband_msglen = + sizeof (vd_dring_inband_msg_t) + + ((max_xfer_bytes/PAGESIZE + + ((max_xfer_bytes % PAGESIZE) ? 1 : 0))* + (sizeof (ldc_mem_cookie_t))); +#else /* NEWOBP */ + size_t max_inband_msglen = + sizeof (vd_dring_inband_msg_t) + + ((attr_msg->max_xfer_sz/PAGESIZE + + (attr_msg->max_xfer_sz % PAGESIZE ? 1 : 0))* + (sizeof (ldc_mem_cookie_t))); +#endif /* NEWOBP */ + + /* + * Set the maximum expected message length to + * accommodate in-band-descriptor messages with all + * their cookies + */ + vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); + } + + attr_msg->vdisk_size = vd->vdisk_size; + attr_msg->vdisk_type = vd->vdisk_type; + attr_msg->operations = vds_operations; + PR0("%s", VD_CLIENT(vd)); + return (0); +} + +static int +vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + int status; + size_t expected; + ldc_mem_info_t dring_minfo; + vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; + + + PR0("Entered"); + ASSERT(mutex_owned(&vd->lock)); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, + VIO_DRING_REG)) { + return (ENOMSG); /* not a register-dring message */ + } + + if (msglen < sizeof (*reg_msg)) { + PRN("Expected at least %lu-byte register-dring message; " + "received %lu bytes", sizeof (*reg_msg), msglen); + return (EBADMSG); + } + + expected = sizeof (*reg_msg) + + (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0])); + if (msglen != expected) { + PRN("Expected %lu-byte register-dring message; " + "received %lu bytes", 
expected, msglen); + return (EBADMSG); + } + + if (vd->initialized & VD_DRING) { + PRN("A dring was previously registered; only support one"); + return (EBADMSG); + } + + if (reg_msg->ncookies != 1) { + /* + * In addition to fixing the assertion in the success case + * below, supporting drings which require more than one + * "cookie" requires increasing the value of vd->max_msglen + * somewhere in the code path prior to receiving the message + * which results in calling this function. Note that without + * making this change, the larger message size required to + * accommodate multiple cookies cannot be successfully + * received, so this function will not even get called. + * Gracefully accommodating more dring cookies might + * reasonably demand exchanging an additional attribute or + * making a minor protocol adjustment + */ + PRN("reg_msg->ncookies = %u != 1", reg_msg->ncookies); + return (EBADMSG); + } + + status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie, + reg_msg->ncookies, reg_msg->num_descriptors, + reg_msg->descriptor_size, LDC_SHADOW_MAP, &vd->dring_handle); + if (status != 0) { + PRN("ldc_mem_dring_map() returned errno %d", status); + return (status); + } + + /* + * To remove the need for this assertion, must call + * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a + * successful call to ldc_mem_dring_map() + */ + ASSERT(reg_msg->ncookies == 1); + + if ((status = + ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) { + PRN("ldc_mem_dring_info() returned errno %d", status); + if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0) + PRN("ldc_mem_dring_unmap() returned errno %d", status); + return (status); + } + + if (dring_minfo.vaddr == NULL) { + PRN("Descriptor ring virtual address is NULL"); + return (EBADMSG); /* FIXME appropriate status? 
*/ + } + + + /* Valid message and dring mapped */ + PR1("descriptor size = %u, dring length = %u", + vd->descriptor_size, vd->dring_len); + vd->initialized |= VD_DRING; + vd->dring_ident = 1; /* "There Can Be Only One" */ + vd->dring = dring_minfo.vaddr; + vd->descriptor_size = reg_msg->descriptor_size; + vd->dring_len = reg_msg->num_descriptors; + reg_msg->dring_ident = vd->dring_ident; + return (0); +} + +static int +vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; + + + PR0("Entered"); + ASSERT(mutex_owned(&vd->lock)); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, + VIO_DRING_UNREG)) { + return (ENOMSG); /* not an unregister-dring message */ + } + + if (msglen != sizeof (*unreg_msg)) { + PRN("Expected %lu-byte unregister-dring message; " + "received %lu bytes", sizeof (*unreg_msg), msglen); + return (EBADMSG); + } + + if (unreg_msg->dring_ident != vd->dring_ident) { + PRN("Expected dring ident %lu; received %lu", + vd->dring_ident, unreg_msg->dring_ident); + return (EBADMSG); + } + + /* FIXME set ack in unreg_msg? 
*/ + return (0); +} + +static int +process_rdx_msg(vio_msg_t *msg, size_t msglen) +{ + PR0("Entered"); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) + return (ENOMSG); /* not an RDX message */ + + if (msglen != sizeof (vio_rdx_msg_t)) { + PRN("Expected %lu-byte RDX message; received %lu bytes", + sizeof (vio_rdx_msg_t), msglen); + return (EBADMSG); + } + + return (0); +} + +static void +vd_reset_connection(vd_t *vd, boolean_t reset_ldc) +{ + int status = 0; + + + ASSERT(mutex_owned(&vd->lock)); + PR0("Resetting connection with %s", VD_CLIENT(vd)); + if ((vd->initialized & VD_DRING) && + ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) + PRN("ldc_mem_dring_unmap() returned errno %d", status); + if ((reset_ldc == B_TRUE) && + ((status = ldc_reset(vd->ldc_handle)) != 0)) + PRN("ldc_reset() returned errno %d", status); + vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); + vd->state = VD_STATE_INIT; + vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ +} + +static int +vd_check_seq_num(vd_t *vd, uint64_t seq_num) +{ + ASSERT(mutex_owned(&vd->lock)); + if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { + PRN("Received seq_num %lu; expected %lu", + seq_num, (vd->seq_num + 1)); + vd_reset_connection(vd, B_FALSE); + return (1); + } + + vd->seq_num = seq_num; + vd->initialized |= VD_SEQ_NUM; /* superfluous after first time... 
*/ + return (0); +} + +/* + * Return the expected size of an inband-descriptor message with all the + * cookies it claims to include + */ +static size_t +expected_inband_size(vd_dring_inband_msg_t *msg) +{ + return ((sizeof (*msg)) + + (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0]))); +} + +/* + * Process an in-band descriptor message: used with clients like OBP, with + * which vds exchanges descriptors within VIO message payloads, rather than + * operating on them within a descriptor ring + */ +static int +vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + size_t expected; + vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; + + + PR1("Entered"); + ASSERT(mutex_owned(&vd->lock)); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, + VIO_DESC_DATA)) + return (ENOMSG); /* not an in-band-descriptor message */ + + if (msglen < sizeof (*desc_msg)) { + PRN("Expected at least %lu-byte descriptor message; " + "received %lu bytes", sizeof (*desc_msg), msglen); + return (EBADMSG); + } + + if (msglen != (expected = expected_inband_size(desc_msg))) { + PRN("Expected %lu-byte descriptor message; " + "received %lu bytes", expected, msglen); + return (EBADMSG); + } + + if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) { + return (EBADMSG); + } + + /* Valid message; process the request */ + desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload); + return (0); +} + +static boolean_t +vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc) +{ + uint32_t i, n; + + + /* Check descriptor states */ + for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) { + if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) { + PRN("descriptor %u not ready", i); + VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i)); + return (B_FALSE); + } + } + + /* Descriptors are valid; accept them */ + for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) + 
VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED; + + return (B_TRUE); +} + +static int +vd_process_dring(vd_t *vd, uint32_t start, uint32_t end) +{ + int status; + boolean_t accepted; + uint32_t i, io_status, n, ndesc; + + + ASSERT(mutex_owned(&vd->lock)); + PR1("start = %u, end = %u", start, end); + + /* Validate descriptor range */ + if ((start >= vd->dring_len) || (end >= vd->dring_len)) { + PRN("\"start\" = %u, \"end\" = %u; both must be less than %u", + start, end, vd->dring_len); + return (EINVAL); + } + + /* Acquire updated dring elements */ + if ((status = ldc_mem_dring_acquire(vd->dring_handle, + start, end)) != 0) { + PRN("ldc_mem_dring_acquire() returned errno %d", status); + return (status); + } + /* Accept updated dring elements */ + ndesc = ((end < start) ? end + vd->dring_len : end) - start + 1; + PR1("ndesc = %u", ndesc); + accepted = vd_accept_dring_elems(vd, start, ndesc); + /* Release dring elements */ + if ((status = ldc_mem_dring_release(vd->dring_handle, + start, end)) != 0) { + PRN("ldc_mem_dring_release() returned errno %d", status); + return (status); + } + /* If a descriptor was in the wrong state, return an error */ + if (!accepted) + return (EINVAL); + + + /* Process accepted dring elements */ + for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) { + vd_dring_entry_t *elem = VD_DRING_ELEM(i); + + /* Process descriptor outside acquire/release bracket */ + PR1("Processing dring element %u", i); + io_status = vd_process_request(vd, &elem->payload); + + /* Re-acquire client's dring element */ + if ((status = ldc_mem_dring_acquire(vd->dring_handle, + i, i)) != 0) { + PRN("ldc_mem_dring_acquire() returned errno %d", + status); + return (status); + } + /* Update processed element */ + if (elem->hdr.dstate == VIO_DESC_ACCEPTED) { + elem->payload.status = io_status; + elem->hdr.dstate = VIO_DESC_DONE; + } else { + /* Perhaps client timed out waiting for I/O... 
*/ + accepted = B_FALSE; + PRN("element %u no longer \"accepted\"", i); + VD_DUMP_DRING_ELEM(elem); + } + /* Release updated processed element */ + if ((status = ldc_mem_dring_release(vd->dring_handle, + i, i)) != 0) { + PRN("ldc_mem_dring_release() returned errno %d", + status); + return (status); + } + /* If the descriptor was in the wrong state, return an error */ + if (!accepted) + return (EINVAL); + } + + return (0); +} + +static int +vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; + + + PR1("Entered"); + ASSERT(mutex_owned(&vd->lock)); + ASSERT(msglen >= sizeof (msg->tag)); + + if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, + VIO_DRING_DATA)) { + return (ENOMSG); /* not a dring-data message */ + } + + if (msglen != sizeof (*dring_msg)) { + PRN("Expected %lu-byte dring message; received %lu bytes", + sizeof (*dring_msg), msglen); + return (EBADMSG); + } + + if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) { + return (EBADMSG); + } + + if (dring_msg->dring_ident != vd->dring_ident) { + PRN("Expected dring ident %lu; received ident %lu", + vd->dring_ident, dring_msg->dring_ident); + return (EBADMSG); + } + + + /* Valid message; process dring */ + dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK; + return (vd_process_dring(vd, dring_msg->start_idx, dring_msg->end_idx)); +} + +static int +recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) +{ + int retry, status; + size_t size = *nbytes; + boolean_t isempty = B_FALSE; + + + /* FIXME work around interrupt problem */ + if ((ldc_chkq(ldc_handle, &isempty) != 0) || isempty) + return (ENOMSG); + + for (retry = 0, status = ETIMEDOUT; + retry < vds_ldc_retries && status == ETIMEDOUT; + retry++) { + PR1("ldc_read() attempt %d", (retry + 1)); + *nbytes = size; + status = ldc_read(ldc_handle, msg, nbytes); + } + + if (status != 0) { + PRN("ldc_read() returned errno %d", status); + return (status); + } else if (*nbytes == 0) { + 
PR1("ldc_read() returned 0 and no message read"); + return (ENOMSG); + } + + PR1("RCVD %lu-byte message", *nbytes); + return (0); +} + +static int +vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + int status; + + + PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, + msg->tag.vio_subtype, msg->tag.vio_subtype_env); + ASSERT(mutex_owned(&vd->lock)); + + /* + * Validate session ID up front, since it applies to all messages + * once set + */ + if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) { + PRN("Expected SID %u, received %u", vd->sid, + msg->tag.vio_sid); + return (EBADMSG); + } + + + /* + * Process the received message based on connection state + */ + switch (vd->state) { + case VD_STATE_INIT: /* expect version message */ + if ((status = process_ver_msg(msg, msglen)) != 0) + return (status); + + /* The first version message sets the SID */ + ASSERT(!(vd->initialized & VD_SID)); + vd->sid = msg->tag.vio_sid; + vd->initialized |= VD_SID; + + /* Version negotiated, move to that state */ + vd->state = VD_STATE_VER; + return (0); + + case VD_STATE_VER: /* expect attribute message */ + if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0) + return (status); + + /* Attributes exchanged, move to that state */ + vd->state = VD_STATE_ATTR; + return (0); + + case VD_STATE_ATTR: + switch (vd->xfer_mode) { + case VIO_DESC_MODE: /* expect RDX message */ + if ((status = process_rdx_msg(msg, msglen)) != 0) + return (status); + + /* Ready to receive in-band descriptors */ + vd->state = VD_STATE_DATA; + return (0); + + case VIO_DRING_MODE: /* expect register-dring message */ + if ((status = + vd_process_dring_reg_msg(vd, msg, msglen)) != 0) + return (status); + + /* One dring negotiated, move to that state */ + vd->state = VD_STATE_DRING; + return (0); + + default: + ASSERT("Unsupported transfer mode"); + PRN("Unsupported transfer mode"); + return (ENOTSUP); + } + + case VD_STATE_DRING: /* expect RDX, register-dring, or unreg-dring */ + 
if ((status = process_rdx_msg(msg, msglen)) == 0) { + /* Ready to receive data */ + vd->state = VD_STATE_DATA; + return (0); + } else if (status != ENOMSG) { + return (status); + } + + + /* + * If another register-dring message is received, stay in + * dring state in case the client sends RDX; although the + * protocol allows multiple drings, this server does not + * support using more than one + */ + if ((status = + vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG) + return (status); + + /* + * Acknowledge an unregister-dring message, but reset the + * connection anyway: Although the protocol allows + * unregistering drings, this server cannot serve a vdisk + * without its only dring + */ + status = vd_process_dring_unreg_msg(vd, msg, msglen); + return ((status == 0) ? ENOTSUP : status); + + case VD_STATE_DATA: + switch (vd->xfer_mode) { + case VIO_DESC_MODE: /* expect in-band-descriptor message */ + return (vd_process_desc_msg(vd, msg, msglen)); + + case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ + /* + * Typically expect dring-data messages, so handle + * them first + */ + if ((status = vd_process_dring_msg(vd, msg, + msglen)) != ENOMSG) + return (status); + + /* + * Acknowledge an unregister-dring message, but reset + * the connection anyway: Although the protocol + * allows unregistering drings, this server cannot + * serve a vdisk without its only dring + */ + status = vd_process_dring_unreg_msg(vd, msg, msglen); + return ((status == 0) ? 
ENOTSUP : status); + + default: + ASSERT("Unsupported transfer mode"); + PRN("Unsupported transfer mode"); + return (ENOTSUP); + } + + default: + ASSERT("Invalid client connection state"); + PRN("Invalid client connection state"); + return (ENOTSUP); + } +} + +static void +vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +{ + int status; + boolean_t reset_ldc = B_FALSE; + + + ASSERT(mutex_owned(&vd->lock)); + + /* + * Check that the message is at least big enough for a "tag", so that + * message processing can proceed based on tag-specified message type + */ + if (msglen < sizeof (vio_msg_tag_t)) { + PRN("Received short (%lu-byte) message", msglen); + /* Can't "nack" short message, so drop the big hammer */ + vd_reset_connection(vd, B_TRUE); + return; + } + + /* + * Process the message + */ + switch (status = vd_do_process_msg(vd, msg, msglen)) { + case 0: + /* "ack" valid, successfully-processed messages */ + msg->tag.vio_subtype = VIO_SUBTYPE_ACK; + break; + + case ENOMSG: + PRN("Received unexpected message"); + _NOTE(FALLTHROUGH); + case EBADMSG: + case ENOTSUP: + /* "nack" invalid messages */ + msg->tag.vio_subtype = VIO_SUBTYPE_NACK; + break; + + default: + /* "nack" failed messages */ + msg->tag.vio_subtype = VIO_SUBTYPE_NACK; + /* An LDC error probably occurred, so try resetting it */ + reset_ldc = B_TRUE; + break; + } + + /* "ack" or "nack" the message */ + PR1("Sending %s", + (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? 
"ACK" : "NACK"); + if (send_msg(vd->ldc_handle, msg, msglen) != 0) + reset_ldc = B_TRUE; + + /* Reset the connection for nack'ed or failed messages */ + if ((status != 0) || reset_ldc) + vd_reset_connection(vd, reset_ldc); +} + +static void +vd_process_queue(void *arg) +{ + vd_t *vd = (vd_t *)arg; + size_t max_msglen, nbytes; + vio_msg_t *vio_msg; + + + PR2("Entered"); + ASSERT(vd != NULL); + mutex_enter(&vd->lock); + max_msglen = vd->max_msglen; /* vd->max_msglen can change */ + vio_msg = kmem_alloc(max_msglen, KM_SLEEP); + for (nbytes = vd->max_msglen; + vd->enabled && recv_msg(vd->ldc_handle, vio_msg, &nbytes) == 0; + nbytes = vd->max_msglen) + vd_process_msg(vd, vio_msg, nbytes); + kmem_free(vio_msg, max_msglen); + mutex_exit(&vd->lock); + PR2("Returning"); +} + +static uint_t +vd_handle_ldc_events(uint64_t event, caddr_t arg) +{ + uint_t status; + vd_t *vd = (vd_t *)(void *)arg; + + + ASSERT(vd != NULL); + mutex_enter(&vd->lock); + if (event & LDC_EVT_READ) { + PR1("New packet(s) available"); + /* Queue a task to process the new data */ + if (ddi_taskq_dispatch(vd->taskq, vd_process_queue, vd, 0) != + DDI_SUCCESS) + PRN("Unable to dispatch vd_process_queue()"); + } else if (event & LDC_EVT_RESET) { + PR0("Attempting to bring up reset channel"); + if (((status = ldc_up(vd->ldc_handle)) != 0) && + (status != ECONNREFUSED)) { + PRN("ldc_up() returned errno %d", status); + } + } else if (event & LDC_EVT_UP) { + /* Reset the connection state when channel comes (back) up */ + vd_reset_connection(vd, B_FALSE); + } + mutex_exit(&vd->lock); + return (LDC_SUCCESS); +} + +static uint_t +vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + _NOTE(ARGUNUSED(key, val)) + (*((uint_t *)arg))++; + return (MH_WALK_TERMINATE); +} + + +static int +vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + uint_t vd_present = 0; + minor_t instance; + vds_t *vds; + + + PR0("Entered"); + switch (cmd) { + case DDI_DETACH: + /* the real work happens below */ + break; + 
case DDI_SUSPEND: + /* nothing to do for this non-device */ + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + ASSERT(cmd == DDI_DETACH); + instance = ddi_get_instance(dip); + if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { + PRN("Could not get state for instance %u", instance); + ddi_soft_state_free(vds_state, instance); + return (DDI_FAILURE); + } + + /* Do not detach when serving any vdisks */ + mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present); + if (vd_present) { + PR0("Not detaching because serving vdisks"); + return (DDI_FAILURE); + } + + PR0("Detaching"); + if (vds->initialized & VDS_MDEG) + (void) mdeg_unregister(vds->mdeg); + if (vds->initialized & VDS_LDI) + (void) ldi_ident_release(vds->ldi_ident); + mod_hash_destroy_hash(vds->vd_table); + if (vds->initialized & VDS_LOCKING) + mutex_destroy(&vds->lock); + ddi_soft_state_free(vds_state, instance); + return (DDI_SUCCESS); +} + +static boolean_t +is_pseudo_device(dev_info_t *dip) +{ + dev_info_t *parent, *root = ddi_root_node(); + + + for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root); + parent = ddi_get_parent(parent)) { + if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +vd_get_params(ldi_handle_t lh, char *block_device, vd_t *vd) +{ + int otyp, rval, status; + dev_info_t *dip; + struct dk_cinfo dk_cinfo; + + + /* Get block device's device number, otyp, and size */ + if ((status = ldi_get_dev(lh, &vd->dev[0])) != 0) { + PRN("ldi_get_dev() returned errno %d for %s", + status, block_device); + return (status); + } + if ((status = ldi_get_otyp(lh, &otyp)) != 0) { + PRN("ldi_get_otyp() returned errno %d for %s", + status, block_device); + return (status); + } + if (otyp != OTYP_BLK) { + PRN("Cannot serve non-block device %s", block_device); + return (ENOTBLK); + } + if (ldi_get_size(lh, &vd->vdisk_size) != DDI_SUCCESS) { + PRN("ldi_get_size() failed for %s", block_device); + 
return (EIO); + } + + /* Determine if backing block device is a pseudo device */ + if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]), + dev_to_instance(vd->dev[0]), 0)) == NULL) { + PRN("%s is no longer accessible", block_device); + return (EIO); + } + vd->pseudo = is_pseudo_device(dip); + ddi_release_devi(dip); + if (vd->pseudo) { + vd->vdisk_type = VD_DISK_TYPE_SLICE; + vd->nslices = 1; + return (0); /* ...and we're done */ + } + + /* Get dk_cinfo to determine slice of backing block device */ + if ((status = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&dk_cinfo, + FKIOCTL, kcred, &rval)) != 0) { + PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", + status, block_device); + return (status); + } + + if (dk_cinfo.dki_partition >= V_NUMPAR) { + PRN("slice %u >= maximum slice %u for %s", + dk_cinfo.dki_partition, V_NUMPAR, block_device); + return (EIO); + } + + /* If block device slice is entire disk, fill in all slice devices */ + if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) { + uint_t slice; + major_t major = getmajor(vd->dev[0]); + minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; + + vd->vdisk_type = VD_DISK_TYPE_DISK; + vd->nslices = V_NUMPAR; + for (slice = 0; slice < vd->nslices; slice++) + vd->dev[slice] = makedevice(major, (minor + slice)); + return (0); /* ...and we're done */ + } + + /* Otherwise, we have a (partial) slice of a block device */ + vd->vdisk_type = VD_DISK_TYPE_SLICE; + vd->nslices = 1; + + + /* Initialize dk_geom structure for single-slice block device */ + if ((status = ldi_ioctl(lh, DKIOCGGEOM, (intptr_t)&vd->dk_geom, + FKIOCTL, kcred, &rval)) != 0) { + PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", + status, block_device); + return (status); + } + if (vd->dk_geom.dkg_nsect == 0) { + PRN("%s geometry claims 0 sectors per track", block_device); + return (EIO); + } + if (vd->dk_geom.dkg_nhead == 0) { + PRN("%s geometry claims 0 heads", block_device); + return (EIO); + } + vd->dk_geom.dkg_ncyl = + 
lbtodb(vd->vdisk_size)/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead; + vd->dk_geom.dkg_acyl = 0; + vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl; + + + /* Initialize vtoc structure for single-slice block device */ + if ((status = ldi_ioctl(lh, DKIOCGVTOC, (intptr_t)&vd->vtoc, + FKIOCTL, kcred, &rval)) != 0) { + PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d for %s", + status, block_device); + return (status); + } + bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume, + MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume))); + bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part)); + vd->vtoc.v_nparts = 1; + vd->vtoc.v_part[0].p_tag = V_UNASSIGNED; + vd->vtoc.v_part[0].p_flag = 0; + vd->vtoc.v_part[0].p_start = 0; + vd->vtoc.v_part[0].p_size = lbtodb(vd->vdisk_size); + bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel, + MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel))); + + + return (0); +} + +static int +vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id, + vd_t **vdp) +{ + char tq_name[TASKQ_NAMELEN]; + int param_status, status; + uint_t slice; + ddi_iblock_cookie_t iblock = NULL; + ldc_attr_t ldc_attr; + ldi_handle_t lh = NULL; + vd_t *vd; + + + ASSERT(vds != NULL); + ASSERT(block_device != NULL); + ASSERT(vdp != NULL); + PR0("Adding vdisk for %s", block_device); + + if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) { + PRN("No memory for virtual disk"); + return (EAGAIN); + } + *vdp = vd; /* assign here so vds_destroy_vd() can cleanup later */ + vd->vds = vds; + + + /* Get device parameters */ + if ((status = ldi_open_by_name(block_device, FREAD, kcred, &lh, + vds->ldi_ident)) != 0) { + PRN("ldi_open_by_name(%s) = errno %d", block_device, status); + return (status); + } + param_status = vd_get_params(lh, block_device, vd); + if ((status = ldi_close(lh, FREAD, kcred)) != 0) { + PRN("ldi_close(%s) = errno %d", block_device, status); + return (status); + } + if (param_status != 0) + return (param_status); + 
ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR); + PR0("vdisk_type = %s, pseudo = %s, nslices = %u", + ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"), + (vd->pseudo ? "yes" : "no"), vd->nslices); + + + /* Initialize locking */ + if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED, + &iblock) != DDI_SUCCESS) { + PRN("Could not get iblock cookie."); + return (EIO); + } + + mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock); + vd->initialized |= VD_LOCKING; + + + /* Open the backing-device slices */ + for (slice = 0; slice < vd->nslices; slice++) { + ASSERT(vd->ldi_handle[slice] == NULL); + PR0("Opening device %u, minor %u = slice %u", + getmajor(vd->dev[slice]), getminor(vd->dev[slice]), slice); + if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK, + vd_open_flags, kcred, &vd->ldi_handle[slice], + vds->ldi_ident)) != 0) { + PRN("ldi_open_by_dev() returned errno %d for slice %u", + status, slice); + /* vds_destroy_vd() will close any open slices */ +#if 0 /* FIXME */ + return (status); +#endif + } + } + + + /* Create the task queue for the vdisk */ + (void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id); + PR1("tq_name = %s", tq_name); + if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + PRN("Could not create task queue"); + return (EIO); + } + vd->initialized |= VD_TASKQ; + vd->enabled = 1; /* before callback can dispatch to taskq */ + + + /* Bring up LDC */ + ldc_attr.devclass = LDC_DEV_BLK_SVC; + ldc_attr.instance = ddi_get_instance(vds->dip); + ldc_attr.mode = LDC_MODE_UNRELIABLE; + ldc_attr.qlen = VD_LDC_QLEN; + if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) { + PRN("ldc_init(%lu) = errno %d", ldc_id, status); + return (status); + } + vd->initialized |= VD_LDC; + + if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events, + (caddr_t)vd)) != 0) { + PRN("ldc_reg_callback() returned errno %d", status); + return (status); + } + + if ((status = ldc_open(vd->ldc_handle)) 
!= 0) { + PRN("ldc_open() returned errno %d", status); + return (status); + } + + if (((status = ldc_up(vd->ldc_handle)) != 0) && + (status != ECONNREFUSED)) { + PRN("ldc_up() returned errno %d", status); + return (status); + } + + + /* Add the successfully-initialized vdisk to the server's table */ + if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) { + PRN("Error adding vdisk ID %lu to table", id); + return (EIO); + } + + return (0); +} + +/* + * Destroy the state associated with a virtual disk + */ +static void +vds_destroy_vd(void *arg) +{ + vd_t *vd = (vd_t *)arg; + + + PR0("Entered"); + if (vd == NULL) + return; + + /* Disable queuing requests for the vdisk */ + if (vd->initialized & VD_LOCKING) { + mutex_enter(&vd->lock); + vd->enabled = 0; + mutex_exit(&vd->lock); + } + + /* Drain and destroy the task queue (*before* shutting down LDC) */ + if (vd->initialized & VD_TASKQ) + ddi_taskq_destroy(vd->taskq); /* waits for queued tasks */ + + /* Shut down LDC */ + if (vd->initialized & VD_LDC) { + if (vd->initialized & VD_DRING) + (void) ldc_mem_dring_unmap(vd->dring_handle); + (void) ldc_unreg_callback(vd->ldc_handle); + (void) ldc_close(vd->ldc_handle); + (void) ldc_fini(vd->ldc_handle); + } + + /* Close any open backing-device slices */ + for (uint_t slice = 0; slice < vd->nslices; slice++) { + if (vd->ldi_handle[slice] != NULL) { + PR0("Closing slice %u", slice); + (void) ldi_close(vd->ldi_handle[slice], + vd_open_flags, kcred); + } + } + + /* Free lock */ + if (vd->initialized & VD_LOCKING) + mutex_destroy(&vd->lock); + + /* Finally, free the vdisk structure itself */ + kmem_free(vd, sizeof (*vd)); +} + +static int +vds_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id) +{ + int status; + vd_t *vd = NULL; + + +#ifdef lint + (void) vd; +#endif /* lint */ + + if ((status = vds_do_init_vd(vds, id, block_device, ldc_id, &vd)) != 0) + vds_destroy_vd(vd); + + return (status); +} + +static int +vds_do_get_ldc_id(md_t *md, 
mde_cookie_t vd_node, mde_cookie_t *channel, + uint64_t *ldc_id) +{ + int num_channels; + + + /* Look for channel endpoint child(ren) of the vdisk MD node */ + if ((num_channels = md_scan_dag(md, vd_node, + md_find_name(md, VD_CHANNEL_ENDPOINT), + md_find_name(md, "fwd"), channel)) <= 0) { + PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT); + return (-1); + } + + /* Get the "id" value for the first channel endpoint node */ + if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) { + PRN("No \"%s\" property found for \"%s\" of vdisk", + VD_ID_PROP, VD_CHANNEL_ENDPOINT); + return (-1); + } + + if (num_channels > 1) { + PRN("Using ID of first of multiple channels for this vdisk"); + } + + return (0); +} + +static int +vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id) +{ + int num_nodes, status; + size_t size; + mde_cookie_t *channel; + + + if ((num_nodes = md_node_count(md)) <= 0) { + PRN("Invalid node count in Machine Description subtree"); + return (-1); + } + size = num_nodes*(sizeof (*channel)); + channel = kmem_zalloc(size, KM_SLEEP); + status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id); + kmem_free(channel, size); + + return (status); +} + +static void +vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) +{ + char *block_device = NULL; + uint64_t id = 0, ldc_id = 0; + + + if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { + PRN("Error getting vdisk \"%s\"", VD_ID_PROP); + return; + } + PR0("Adding vdisk ID %lu", id); + if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP, + &block_device) != 0) { + PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); + return; + } + + if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) { + PRN("Error getting LDC ID for vdisk %lu", id); + return; + } + + if (vds_init_vd(vds, id, block_device, ldc_id) != 0) { + PRN("Failed to add vdisk ID %lu", id); + return; + } +} + +static void +vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node) +{ + uint64_t id = 0; + + + if 
(md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) { + PRN("Unable to get \"%s\" property from vdisk's MD node", + VD_ID_PROP); + return; + } + PR0("Removing vdisk ID %lu", id); + if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0) + PRN("No vdisk entry found for vdisk ID %lu", id); +} + +static void +vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node, + md_t *curr_md, mde_cookie_t curr_vd_node) +{ + char *curr_dev, *prev_dev; + uint64_t curr_id = 0, curr_ldc_id = 0; + uint64_t prev_id = 0, prev_ldc_id = 0; + size_t len; + + + /* Validate that vdisk ID has not changed */ + if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) { + PRN("Error getting previous vdisk \"%s\" property", + VD_ID_PROP); + return; + } + if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) { + PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP); + return; + } + if (curr_id != prev_id) { + PRN("Not changing vdisk: ID changed from %lu to %lu", + prev_id, curr_id); + return; + } + + /* Validate that LDC ID has not changed */ + if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) { + PRN("Error getting LDC ID for vdisk %lu", prev_id); + return; + } + + if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) { + PRN("Error getting LDC ID for vdisk %lu", curr_id); + return; + } + if (curr_ldc_id != prev_ldc_id) { + _NOTE(NOTREACHED); /* FIXME is there a better way? 
*/ + PRN("Not changing vdisk: " + "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id); + return; + } + + /* Determine whether device path has changed */ + if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP, + &prev_dev) != 0) { + PRN("Error getting previous vdisk \"%s\"", + VD_BLOCK_DEVICE_PROP); + return; + } + if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP, + &curr_dev) != 0) { + PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP); + return; + } + if (((len = strlen(curr_dev)) == strlen(prev_dev)) && + (strncmp(curr_dev, prev_dev, len) == 0)) + return; /* no relevant (supported) change */ + + PR0("Changing vdisk ID %lu", prev_id); + /* Remove old state, which will close vdisk and reset */ + if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0) + PRN("No entry found for vdisk ID %lu", prev_id); + /* Re-initialize vdisk with new state */ + if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) { + PRN("Failed to change vdisk ID %lu", curr_id); + return; + } +} + +static int +vds_process_md(void *arg, mdeg_result_t *md) +{ + int i; + vds_t *vds = arg; + + + if (md == NULL) + return (MDEG_FAILURE); + ASSERT(vds != NULL); + + for (i = 0; i < md->removed.nelem; i++) + vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]); + for (i = 0; i < md->match_curr.nelem; i++) + vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i], + md->match_curr.mdp, md->match_curr.mdep[i]); + for (i = 0; i < md->added.nelem; i++) + vds_add_vd(vds, md->added.mdp, md->added.mdep[i]); + + return (MDEG_SUCCESS); +} + +static int +vds_do_attach(dev_info_t *dip) +{ + static char reg_prop[] = "reg"; /* devinfo ID prop */ + + /* MDEG specification for a (particular) vds node */ + static mdeg_prop_spec_t vds_prop_spec[] = { + {MDET_PROP_STR, "name", {VDS_NAME}}, + {MDET_PROP_VAL, "cfg-handle", {0}}, + {MDET_LIST_END, NULL, {0}}}; + static mdeg_node_spec_t vds_spec = {"virtual-device", vds_prop_spec}; + + /* MDEG 
specification for matching a vd node */ + static md_prop_match_t vd_prop_spec[] = { + {MDET_PROP_VAL, VD_ID_PROP}, + {MDET_LIST_END, NULL}}; + static mdeg_node_match_t vd_spec = {"virtual-device-port", + vd_prop_spec}; + + int status; + uint64_t cfg_handle; + minor_t instance = ddi_get_instance(dip); + vds_t *vds; + + + /* + * The "cfg-handle" property of a vds node in an MD contains the MD's + * notion of "instance", or unique identifier, for that node; OBP + * stores the value of the "cfg-handle" MD property as the value of + * the "reg" property on the node in the device tree it builds from + * the MD and passes to Solaris. Thus, we look up the devinfo node's + * "reg" property value to uniquely identify this device instance when + * registering with the MD event-generation framework. If the "reg" + * property cannot be found, the device tree state is presumably so + * broken that there is no point in continuing. + */ + if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) { + PRN("vds \"%s\" property does not exist", reg_prop); + return (DDI_FAILURE); + } + + /* Get the MD instance for later MDEG registration */ + cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + reg_prop, -1); + + if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) { + PRN("Could not allocate state for instance %u", instance); + return (DDI_FAILURE); + } + + if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) { + PRN("Could not get state for instance %u", instance); + ddi_soft_state_free(vds_state, instance); + return (DDI_FAILURE); + } + + + vds->dip = dip; + vds->vd_table = mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS, + vds_destroy_vd, + sizeof (void *)); + ASSERT(vds->vd_table != NULL); + + mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL); + vds->initialized |= VDS_LOCKING; + + if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { + PRN("ldi_ident_from_dip() returned errno %d", status); + return (DDI_FAILURE); + } + 
vds->initialized |= VDS_LDI; + + /* Register for MD updates */ + vds_prop_spec[1].ps_val = cfg_handle; + if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds, + &vds->mdeg) != MDEG_SUCCESS) { + PRN("Unable to register for MD updates"); + return (DDI_FAILURE); + } + vds->initialized |= VDS_MDEG; + + ddi_report_dev(dip); + return (DDI_SUCCESS); +} + +static int +vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int status; + + PR0("Entered"); + switch (cmd) { + case DDI_ATTACH: + if ((status = vds_do_attach(dip)) != DDI_SUCCESS) + (void) vds_detach(dip, DDI_DETACH); + return (status); + case DDI_RESUME: + /* nothing to do for this non-device */ + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } +} + +static struct dev_ops vds_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + ddi_no_info, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + vds_attach, /* devo_attach */ + vds_detach, /* devo_detach */ + nodev, /* devo_reset */ + NULL, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + nulldev /* devo_power */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + "virtual disk server v%I%", + &vds_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + + +int +_init(void) +{ + int i, status; + + + PR0("Built %s %s", __DATE__, __TIME__); + if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) + return (status); + if ((status = mod_install(&modlinkage)) != 0) { + ddi_soft_state_fini(&vds_state); + return (status); + } + + /* Fill in the bit-mask of server-supported operations */ + for (i = 0; i < vds_noperations; i++) + vds_operations |= 1 << (vds_operation[i].operation - 1); + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int status; + + + PR0("Entered"); + if ((status = mod_remove(&modlinkage)) != 0) + return (status); + ddi_soft_state_fini(&vds_state); + return (0); +} 
diff --git a/usr/src/uts/sun4v/io/vldc.c b/usr/src/uts/sun4v/io/vldc.c new file mode 100644 index 0000000000..6c366c5c59 --- /dev/null +++ b/usr/src/uts/sun4v/io/vldc.c @@ -0,0 +1,1581 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/uio.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/modctl.h> +#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */ +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/cred.h> +#include <sys/promif.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cyclic.h> +#include <sys/note.h> +#include <sys/mach_descrip.h> +#include <sys/mdeg.h> +#include <sys/ldc.h> +#include <sys/vldc_impl.h> + +/* + * Function prototypes. 
+ */ + +/* DDI entrypoints */ +static int vldc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int vldc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred); +static int vldc_close(dev_t dev, int flag, int otyp, cred_t *cred); +static int vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, + cred_t *credp, int *rvalp); +static int vldc_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int vldc_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int vldc_chpoll(dev_t dev, short events, int anyyet, + short *reventsp, struct pollhead **phpp); + +/* Internal functions */ +static uint_t i_vldc_cb(uint64_t event, caddr_t arg); +static int i_vldc_mdeg_cb(void *cb_argp, mdeg_result_t *resp); +static int i_vldc_mdeg_register(vldc_t *vldcp); +static int i_vldc_mdeg_unregister(vldc_t *vldcp); +static int i_vldc_add_port(vldc_t *vldcp, md_t *mdp, mde_cookie_t node); +static int i_vldc_remove_port(vldc_t *vldcp, uint_t portno); +static int i_vldc_close_port(vldc_t *vldcp, uint_t portno); + +/* soft state structure */ +static void *vldc_ssp; + +/* + * Matching criteria passed to the MDEG to register interest + * in changes to 'virtual-device-port' nodes identified by their + * 'id' property. + */ +static md_prop_match_t vport_prop_match[] = { + { MDET_PROP_VAL, "id" }, + { MDET_LIST_END, NULL } +}; + +static mdeg_node_match_t vport_match = { "virtual-device-port", + vport_prop_match }; + +/* + * Specification of an MD node passed to the MDEG to filter any + * 'virtual-device-port' nodes that do not belong to the specified + * node. This template is copied for each vldc instance and filled + * in with the appropriate 'name' and 'cfg-handle' values before + * being passed to the MDEG. 
+ */ +static mdeg_prop_spec_t vldc_prop_template[] = { + { MDET_PROP_STR, "name", NULL }, + { MDET_PROP_VAL, "cfg-handle", NULL }, + { MDET_LIST_END, NULL, NULL } +}; + +#define VLDC_MDEG_PROP_NAME(specp) ((specp)[0].ps_str) +#define VLDC_SET_MDEG_PROP_NAME(specp, name) ((specp)[0].ps_str = (name)) +#define VLDC_SET_MDEG_PROP_INST(specp, inst) ((specp)[1].ps_val = (inst)) + + +static struct cb_ops vldc_cb_ops = { + vldc_open, /* open */ + vldc_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + vldc_read, /* read */ + vldc_write, /* write */ + vldc_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + ddi_segmap, /* segmap */ + vldc_chpoll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* stream */ + D_NEW | D_MP /* flag */ +}; + +static struct dev_ops vldc_ops = { + DEVO_REV, /* rev */ + 0, /* ref count */ + ddi_getinfo_1to1, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + vldc_attach, /* attach */ + vldc_detach, /* detach */ + nodev, /* reset */ + &vldc_cb_ops, /* cb_ops */ + (struct bus_ops *)NULL /* bus_ops */ +}; + +extern struct mod_ops mod_driverops; + +static struct modldrv md = { + &mod_driverops, /* Type - it is a driver */ + "sun4v Virtual LDC Driver %I%", /* Name of the module */ + &vldc_ops, /* driver specific ops */ +}; + +static struct modlinkage ml = { + MODREV_1, + &md, + NULL +}; + +/* maximum MTU and cookie size tunables */ +uint32_t vldc_max_mtu = VLDC_MAX_MTU; +uint64_t vldc_max_cookie = VLDC_MAX_COOKIE; + + +#ifdef DEBUG + +/* + * Print debug messages + * + * set vldcdbg to 0x7 to enable all messages + * + * 0x4 - Warnings + * 0x2 - All debug messages (most verbose) + * 0x1 - Minimal debug messages + */ + +int vldcdbg = 0x0; + +static void +vldcdebug(const char *fmt, ...) 
+{ + char buf[512]; + va_list ap; + + va_start(ap, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, ap); + va_end(ap); + + cmn_err(CE_CONT, "?%s", buf); +} + +#define D1 if (vldcdbg & 0x01) vldcdebug +#define D2 if (vldcdbg & 0x02) vldcdebug +#define DWARN if (vldcdbg & 0x04) vldcdebug + +#else /* not DEBUG */ + +#define D1 if (0) printf +#define D2 if (0) printf +#define DWARN if (0) printf + +#endif /* not DEBUG */ + + +/* _init(9E): initialize the loadable module */ +int +_init(void) +{ + int error; + + /* init the soft state structure */ + error = ddi_soft_state_init(&vldc_ssp, sizeof (vldc_t), 1); + if (error != 0) { + return (error); + } + + /* Link the driver into the system */ + error = mod_install(&ml); + + return (error); +} + +/* _info(9E): return information about the loadable module */ +int +_info(struct modinfo *modinfop) +{ + /* Report status of the dynamically loadable driver module */ + return (mod_info(&ml, modinfop)); +} + +/* _fini(9E): prepare the module for unloading. */ +int +_fini(void) +{ + int error; + + /* Unlink the driver module from the system */ + if ((error = mod_remove(&ml)) == 0) { + /* + * We have successfully "removed" the driver. 
+ * destroy soft state + */ + ddi_soft_state_fini(&vldc_ssp); + } + + return (error); +} + +/* ldc callback */ +static uint_t +i_vldc_cb(uint64_t event, caddr_t arg) +{ + vldc_port_t *vport = (vldc_port_t *)arg; + short pollevents = 0; + int rv; + + D1("i_vldc_cb: callback invoked port=%d, event=0x%lx\n", + vport->number, event); + + if (event & LDC_EVT_UP) { + pollevents |= POLLOUT; + vport->hanged_up = B_FALSE; + + } else if (event & LDC_EVT_DOWN) { + pollevents |= POLLHUP; + vport->hanged_up = B_TRUE; + + } else if (event & LDC_EVT_RESET) { + /* do an ldc_up because we can't be sure the other side will */ + if ((rv = ldc_up(vport->ldc_handle)) != 0) + if (rv != ECONNREFUSED) + DWARN("i_vldc_cb: port@%d failed to" + " bring up LDC channel=%ld, err=%d\n", + vport->number, vport->ldc_id, rv); + } + + if (event & LDC_EVT_READ) + pollevents |= POLLIN; + + if (pollevents != 0) { + D1("i_vldc_cb: port@%d pollwakeup=0x%x\n", + vport->number, pollevents); + pollwakeup(&vport->poll, pollevents); + } + + return (LDC_SUCCESS); +} + +/* mdeg callback */ +static int +i_vldc_mdeg_cb(void *cb_argp, mdeg_result_t *resp) +{ + vldc_t *vldcp; + int idx; + uint64_t portno; + int rv; + md_t *mdp; + mde_cookie_t node; + + if (resp == NULL) { + D1("i_vldc_mdeg_cb: no result returned\n"); + return (MDEG_FAILURE); + } + + vldcp = (vldc_t *)cb_argp; + + mutex_enter(&vldcp->lock); + if (vldcp->detaching == B_TRUE) { + D1("i_vldc_mdeg_cb: detach in progress\n"); + mutex_exit(&vldcp->lock); + return (MDEG_FAILURE); + } + + D1("i_vldc_mdeg_cb: added=%d, removed=%d, matched=%d\n", + resp->added.nelem, resp->removed.nelem, resp->match_prev.nelem); + + /* process added ports */ + for (idx = 0; idx < resp->added.nelem; idx++) { + mdp = resp->added.mdp; + node = resp->added.mdep[idx]; + + D1("i_vldc_mdeg_cb: processing added node 0x%lx\n", node); + + /* attempt to add a port */ + if ((rv = i_vldc_add_port(vldcp, mdp, node)) != MDEG_SUCCESS) { + cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: unable to add 
port, " + "err = %d", rv); + } + } + + /* process removed ports */ + for (idx = 0; idx < resp->removed.nelem; idx++) { + mdp = resp->removed.mdp; + node = resp->removed.mdep[idx]; + + D1("i_vldc_mdeg_cb: processing removed node 0x%lx\n", node); + + /* read in the port's id property */ + if (md_get_prop_val(mdp, node, "id", &portno)) { + cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: node 0x%lx of " + "removed list has no 'id' property", node); + continue; + } + + /* attempt to remove a port */ + if ((rv = i_vldc_remove_port(vldcp, portno)) != 0) { + cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: unable to remove " + "port %lu, err %d", portno, rv); + } + } + + /* + * Currently no support for updating already active ports. So, ignore + * the match_curr and match_prev arrays for now. + */ + + mutex_exit(&vldcp->lock); + + return (MDEG_SUCCESS); +} + +/* register callback to mdeg */ +static int +i_vldc_mdeg_register(vldc_t *vldcp) +{ + mdeg_prop_spec_t *pspecp; + mdeg_node_spec_t *inst_specp; + mdeg_handle_t mdeg_hdl; + size_t templatesz; + int inst; + char *name; + size_t namesz; + char *nameprop; + int rv; + + /* get the unique vldc instance assigned by the LDom manager */ + inst = ddi_prop_get_int(DDI_DEV_T_ANY, vldcp->dip, + DDI_PROP_DONTPASS, "reg", -1); + if (inst == -1) { + cmn_err(CE_NOTE, "?vldc%d has no 'reg' property", + ddi_get_instance(vldcp->dip)); + return (DDI_FAILURE); + } + + /* get the name of the vldc instance */ + rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, vldcp->dip, + DDI_PROP_DONTPASS, "name", &nameprop); + if (rv != DDI_PROP_SUCCESS) { + cmn_err(CE_NOTE, "?vldc%d has no 'name' property", + ddi_get_instance(vldcp->dip)); + return (DDI_FAILURE); + } + + D1("i_vldc_mdeg_register: name=%s, instance=%d\n", nameprop, inst); + + /* + * Allocate and initialize a per-instance copy + * of the global property spec array that will + * uniquely identify this vldc instance. 
+ */ + templatesz = sizeof (vldc_prop_template); + pspecp = kmem_alloc(templatesz, KM_SLEEP); + + bcopy(vldc_prop_template, pspecp, templatesz); + + /* copy in the name property */ + namesz = strlen(nameprop) + 1; + name = kmem_alloc(namesz, KM_SLEEP); + + bcopy(nameprop, name, namesz); + VLDC_SET_MDEG_PROP_NAME(pspecp, name); + + /* copy in the instance property */ + VLDC_SET_MDEG_PROP_INST(pspecp, inst); + + /* initialize the complete prop spec structure */ + inst_specp = kmem_alloc(sizeof (mdeg_node_spec_t), KM_SLEEP); + inst_specp->namep = "virtual-device"; + inst_specp->specp = pspecp; + + /* perform the registration */ + rv = mdeg_register(inst_specp, &vport_match, i_vldc_mdeg_cb, + vldcp, &mdeg_hdl); + + if (rv != MDEG_SUCCESS) { + cmn_err(CE_NOTE, "?i_vldc_mdeg_register: mdeg_register " + "failed, err = %d", rv); + kmem_free(name, namesz); + kmem_free(pspecp, templatesz); + kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); + return (DDI_FAILURE); + } + + /* save off data that will be needed later */ + vldcp->inst_spec = inst_specp; + vldcp->mdeg_hdl = mdeg_hdl; + + return (DDI_SUCCESS); +} + +/* unregister callback from mdeg */ +static int +i_vldc_mdeg_unregister(vldc_t *vldcp) +{ + char *name; + int rv; + + D1("i_vldc_mdeg_unregister: hdl=0x%lx\n", vldcp->mdeg_hdl); + + rv = mdeg_unregister(vldcp->mdeg_hdl); + if (rv != MDEG_SUCCESS) { + return (rv); + } + + /* + * Clean up cached MDEG data + */ + name = VLDC_MDEG_PROP_NAME(vldcp->inst_spec->specp); + if (name != NULL) { + kmem_free(name, strlen(name) + 1); + } + kmem_free(vldcp->inst_spec->specp, sizeof (vldc_prop_template)); + vldcp->inst_spec->specp = NULL; + + kmem_free(vldcp->inst_spec, sizeof (mdeg_node_spec_t)); + vldcp->inst_spec = NULL; + + return (MDEG_SUCCESS); +} + +static int +i_vldc_get_port_channel(md_t *mdp, mde_cookie_t node, uint64_t *ldc_id) +{ + int num_nodes, nchan; + size_t listsz; + mde_cookie_t *listp; + + /* + * Find the channel-endpoint node(s) (which should be under this + * port 
node) which contain the channel id(s). + */ + if ((num_nodes = md_node_count(mdp)) <= 0) { + cmn_err(CE_NOTE, "?i_vldc_get_port_channel: invalid number of " + "channel-endpoint nodes found (%d)", num_nodes); + return (-1); + } + + /* allocate space for node list */ + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_alloc(listsz, KM_SLEEP); + + nchan = md_scan_dag(mdp, node, md_find_name(mdp, "channel-endpoint"), + md_find_name(mdp, "fwd"), listp); + + if (nchan <= 0) { + cmn_err(CE_NOTE, "?i_vldc_get_port_channel: no channel-endpoint" + " nodes found"); + kmem_free(listp, listsz); + return (-1); + } + + D2("i_vldc_get_port_channel: %d channel-endpoint nodes found", nchan); + + /* use property from first node found */ + if (md_get_prop_val(mdp, listp[0], "id", ldc_id)) { + cmn_err(CE_NOTE, "?i_vldc_get_port_channel: channel-endpoint " + "has no 'id' property"); + kmem_free(listp, listsz); + return (-1); + } + + kmem_free(listp, listsz); + + return (0); +} + +/* add a vldc port */ +static int +i_vldc_add_port(vldc_t *vldcp, md_t *mdp, mde_cookie_t node) +{ + vldc_port_t *vport; + char *sname; + uint64_t portno; + int vldc_inst; + minor_t minor; + int minor_idx; + boolean_t new_minor; + int rv; + + /* read in the port's id property */ + if (md_get_prop_val(mdp, node, "id", &portno)) { + cmn_err(CE_NOTE, "?i_vldc_add_port: node 0x%lx of added " + "list has no 'id' property", node); + return (MDEG_FAILURE); + } + + if (portno >= VLDC_MAX_PORTS) { + cmn_err(CE_NOTE, "?i_vldc_add_port: found port number (%lu) " + "larger than maximum supported number of ports", portno); + return (MDEG_FAILURE); + } + + vport = &(vldcp->port[portno]); + + if (vport->minorp != NULL) { + cmn_err(CE_NOTE, "?i_vldc_add_port: trying to add a port (%lu)" + " which is already bound", portno); + return (MDEG_FAILURE); + } + + vport->number = portno; + + /* get all channels for this device (currently only one) */ + if (i_vldc_get_port_channel(mdp, node, &vport->ldc_id) == -1) { + return 
(MDEG_FAILURE); + } + + /* set the default MTU */ + vport->mtu = VLDC_DEFAULT_MTU; + + /* get the service being exported by this port */ + if (md_get_prop_str(mdp, node, "vldc-svc-name", &sname)) { + cmn_err(CE_NOTE, "?i_vldc_add_port: vdevice has no " + "'vldc-svc-name' property"); + return (MDEG_FAILURE); + } + + /* minor number look up */ + for (minor_idx = 0; minor_idx < vldcp->minors_assigned; + minor_idx++) { + if (strcmp(vldcp->minor_tbl[minor_idx].sname, sname) == 0) { + /* found previously assigned minor number */ + break; + } + } + + new_minor = B_FALSE; + if (minor_idx == vldcp->minors_assigned) { + /* end of lookup - assign new minor number */ + if (vldcp->minors_assigned == VLDC_MAX_MINORS) { + cmn_err(CE_NOTE, "?i_vldc_add_port: too many minor " + "nodes (%d)", minor_idx); + return (MDEG_FAILURE); + } + + (void) strlcpy(vldcp->minor_tbl[minor_idx].sname, + sname, MAXPATHLEN); + + vldcp->minors_assigned++; + new_minor = B_TRUE; + } + + ASSERT(vldcp->minor_tbl[minor_idx].portno == VLDC_INVALID_PORTNO); + + vport->minorp = &vldcp->minor_tbl[minor_idx]; + vldcp->minor_tbl[minor_idx].portno = portno; + vldcp->minor_tbl[minor_idx].in_use = 0; + + D1("i_vldc_add_port: port@%d mtu=%d, ldc=%ld, service=%s\n", + vport->number, vport->mtu, vport->ldc_id, sname); + + /* + * Create a minor node. The minor number is + * (vldc_inst << VLDC_INST_SHIFT) | minor_idx + */ + vldc_inst = ddi_get_instance(vldcp->dip); + + minor = (vldc_inst << VLDC_INST_SHIFT) | (minor_idx); + + rv = ddi_create_minor_node(vldcp->dip, sname, S_IFCHR, + minor, DDI_NT_SERIAL, 0); + + if (rv != DDI_SUCCESS) { + cmn_err(CE_NOTE, "?i_vldc_add_port: failed to create minor" + "node (%u), err = %d", minor, rv); + vldcp->minor_tbl[minor_idx].portno = VLDC_INVALID_PORTNO; + if (new_minor) { + vldcp->minors_assigned--; + } + return (MDEG_FAILURE); + } + + /* + * The port is now bound to a minor node and is initially in the + * closed state. 
+ */ + vport->status = VLDC_PORT_CLOSED; + + D1("i_vldc_add_port: port %lu initialized\n", portno); + + return (MDEG_SUCCESS); +} + +/* remove a vldc port */ +static int +i_vldc_remove_port(vldc_t *vldcp, uint_t portno) +{ + vldc_port_t *vport; + vldc_minor_t *vminor; + + vport = &(vldcp->port[portno]); + vminor = vport->minorp; + if (vminor == NULL) { + cmn_err(CE_NOTE, "?i_vldc_remove_port: trying to remove a " + "port (%u) which is not bound", portno); + return (MDEG_FAILURE); + } + + /* + * Make sure that all new attempts to open or use the minor node + * associated with the port will fail. + */ + mutex_enter(&vminor->lock); + vminor->portno = VLDC_INVALID_PORTNO; + mutex_exit(&vminor->lock); + + /* send hangup to anyone polling */ + pollwakeup(&vport->poll, POLLHUP); + + /* Now wait for all current users of the minor node to finish. */ + mutex_enter(&vminor->lock); + while (vminor->in_use > 0) { + cv_wait(&vminor->cv, &vminor->lock); + } + + if ((vport->status == VLDC_PORT_READY) || + (vport->status == VLDC_PORT_OPEN)) { + /* close the port before it is torn down */ + (void) i_vldc_close_port(vldcp, portno); + } + + /* remove minor node */ + ddi_remove_minor_node(vldcp->dip, vport->minorp->sname); + vport->minorp = NULL; + + mutex_exit(&vminor->lock); + + D1("i_vldc_remove_port: removed vldc port %u\n", portno); + + return (MDEG_SUCCESS); +} + +/* close a ldc channel */ +static int +i_vldc_ldc_close(vldc_port_t *vport) +{ + int rv = 0; + int err; + + err = ldc_close(vport->ldc_handle); + if (err != 0) + rv = err; + err = ldc_unreg_callback(vport->ldc_handle); + if ((err != 0) && (rv != 0)) + rv = err; + err = ldc_fini(vport->ldc_handle); + if ((err != 0) && (rv != 0)) + rv = err; + + return (rv); +} + +/* close a vldc port */ +static int +i_vldc_close_port(vldc_t *vldcp, uint_t portno) +{ + vldc_port_t *vport; + int rv; + + vport = &(vldcp->port[portno]); + + ASSERT(MUTEX_HELD(&vport->minorp->lock)); + + if (vport->status == VLDC_PORT_CLOSED) { + /* nothing to 
do */ + DWARN("i_vldc_close_port: port %d in an unexpected " + "state (%d)\n", portno, vport->status); + return (DDI_SUCCESS); + } + + rv = DDI_SUCCESS; + if (vport->status == VLDC_PORT_READY) { + rv = i_vldc_ldc_close(vport); + } else { + ASSERT(vport->status == VLDC_PORT_OPEN); + } + + /* free memory */ + kmem_free(vport->send_buf, vport->mtu); + kmem_free(vport->recv_buf, vport->mtu); + + vport->status = VLDC_PORT_CLOSED; + + return (rv); +} + +/* + * attach(9E): attach a device to the system. + * called once for each instance of the device on the system. + */ +static int +vldc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i, instance; + vldc_t *vldcp; + + switch (cmd) { + + case DDI_ATTACH: + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(vldc_ssp, instance) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + ddi_soft_state_free(vldc_ssp, instance); + return (ENXIO); + } + + D1("vldc_attach: DDI_ATTACH instance=%d\n", instance); + + mutex_init(&vldcp->lock, NULL, MUTEX_DRIVER, NULL); + vldcp->dip = dip; + vldcp->detaching = B_FALSE; + + for (i = 0; i < VLDC_MAX_PORTS; i++) { + /* No minor node association to start with */ + vldcp->port[i].minorp = NULL; + } + + for (i = 0; i < VLDC_MAX_MINORS; i++) { + mutex_init(&(vldcp->minor_tbl[i].lock), NULL, + MUTEX_DRIVER, NULL); + cv_init(&(vldcp->minor_tbl[i].cv), NULL, + CV_DRIVER, NULL); + /* No port association to start with */ + vldcp->minor_tbl[i].portno = VLDC_INVALID_PORTNO; + } + + /* Register for MD update notification */ + if (i_vldc_mdeg_register(vldcp) != DDI_SUCCESS) { + ddi_soft_state_free(vldc_ssp, instance); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + + case DDI_RESUME: + + return (DDI_SUCCESS); + + default: + + return (DDI_FAILURE); + } +} + +/* + * detach(9E): detach a device from the system. 
+ */ +static int +vldc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int i, instance; + vldc_t *vldcp; + + switch (cmd) { + + case DDI_DETACH: + + instance = ddi_get_instance(dip); + + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (DDI_FAILURE); + } + + D1("vldc_detach: DDI_DETACH instance=%d\n", instance); + + mutex_enter(&vldcp->lock); + + /* Fail the detach if all ports have not been removed. */ + for (i = 0; i < VLDC_MAX_MINORS; i++) { + if (vldcp->minor_tbl[i].portno != VLDC_INVALID_PORTNO) { + D1("vldc_detach: vldc@%d:%d is bound, " + "detach failed\n", + instance, vldcp->minor_tbl[i].portno); + mutex_exit(&vldcp->lock); + return (DDI_FAILURE); + } + } + + /* + * Prevent MDEG from adding new ports before the callback can + * be unregistered. The lock can't be held accross the + * unregistration call because a callback may be in progress + * and blocked on the lock. + */ + vldcp->detaching = B_TRUE; + + mutex_exit(&vldcp->lock); + + if (i_vldc_mdeg_unregister(vldcp) != MDEG_SUCCESS) { + vldcp->detaching = B_FALSE; + return (DDI_FAILURE); + } + + /* Tear down all bound ports and free resources. 
*/ + for (i = 0; i < VLDC_MAX_MINORS; i++) { + if (vldcp->minor_tbl[i].portno != VLDC_INVALID_PORTNO) { + (void) i_vldc_remove_port(vldcp, i); + } + mutex_destroy(&(vldcp->minor_tbl[i].lock)); + cv_destroy(&(vldcp->minor_tbl[i].cv)); + } + + mutex_destroy(&vldcp->lock); + ddi_soft_state_free(vldc_ssp, instance); + + return (DDI_SUCCESS); + + case DDI_SUSPEND: + + return (DDI_SUCCESS); + + default: + + return (DDI_FAILURE); + } +} + +/* cb_open */ +static int +vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(flag, otyp, cred)) + + int instance; + minor_t minor; + uint64_t portno; + vldc_t *vldcp; + vldc_port_t *vport; + vldc_minor_t *vminor; + + minor = getminor(*devp); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) + return (ENXIO); + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENXIO); + } + + vport = &(vldcp->port[portno]); + + D1("vldc_open: opening vldc@%d:%lu\n", instance, portno); + + if (vport->status != VLDC_PORT_CLOSED) { + mutex_exit(&vminor->lock); + return (EBUSY); + } + + vport->recv_buf = kmem_alloc(vport->mtu, KM_SLEEP); + vport->send_buf = kmem_alloc(vport->mtu, KM_SLEEP); + + vport->is_stream = B_FALSE; /* assume not a stream */ + vport->hanged_up = B_FALSE; + + vport->status = VLDC_PORT_OPEN; + + mutex_exit(&vminor->lock); + + return (DDI_SUCCESS); +} + +/* cb_close */ +static int +vldc_close(dev_t dev, int flag, int otyp, cred_t *cred) +{ + _NOTE(ARGUNUSED(flag, otyp, cred)) + + int instance; + minor_t minor; + uint64_t portno; + vldc_t *vldcp; + vldc_minor_t *vminor; + int rv; + + minor = getminor(dev); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (ENXIO); + } + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == 
VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENOLINK); + } + + D1("vldc_close: closing vldc@%d:%lu\n", instance, portno); + + rv = i_vldc_close_port(vldcp, portno); + + mutex_exit(&vminor->lock); + + return (rv); +} + +static int +vldc_set_ldc_mode(vldc_port_t *vport, vldc_t *vldcp, int channel_mode) +{ + ldc_attr_t attr; + int rv; + + ASSERT(MUTEX_HELD(&vport->minorp->lock)); + + /* validate mode */ + switch (channel_mode) { + case LDC_MODE_STREAM: + vport->is_stream = B_TRUE; + break; + case LDC_MODE_RAW: + case LDC_MODE_UNRELIABLE: + case LDC_MODE_RELIABLE: + vport->is_stream = B_FALSE; + break; + default: + return (EINVAL); + } + + if (vport->status == VLDC_PORT_READY) { + rv = i_vldc_ldc_close(vport); + vport->status = VLDC_PORT_OPEN; + if (rv != 0) { + DWARN("vldc_set_ldc_mode: i_vldc_ldc_close " + "failed, rv=%d\n", rv); + return (rv); + } + } + + D1("vldc_set_ldc_mode: vport status %d, mode %d\n", + vport->status, channel_mode); + + vport->ldc_mode = channel_mode; + + /* initialize the channel */ + attr.devclass = LDC_DEV_SERIAL; + attr.instance = ddi_get_instance(vldcp->dip); + attr.qlen = VLDC_QUEUE_LEN; + attr.mode = vport->ldc_mode; + + if ((rv = ldc_init(vport->ldc_id, &attr, + &vport->ldc_handle)) != 0) { + DWARN("vldc_ioctl_opt_op: ldc_init failed, rv=%d\n", rv); + goto error_init; + } + + /* register it */ + if ((rv = ldc_reg_callback(vport->ldc_handle, + i_vldc_cb, (caddr_t)vport)) != 0) { + DWARN("vldc_ioctl_opt_op: ldc_reg_callback failed, rv=%d\n", + rv); + goto error_reg; + } + + /* open the channel */ + if ((rv = ldc_open(vport->ldc_handle)) != 0) { + DWARN("vldc_ioctl_opt_op: ldc_open failed, rv=%d\n", rv); + goto error_open; + } + + vport->status = VLDC_PORT_READY; + + /* + * Attempt to bring the channel up, but do not + * fail if the other end is not up yet. 
+ */ + rv = ldc_up(vport->ldc_handle); + + if (rv == ECONNREFUSED) { + D1("vldc_ioctl_opt_op: remote endpoint not up yet\n"); + } else if (rv != 0) { + DWARN("vldc_ioctl_opt_op: ldc_up failed, rv=%d\n", rv); + goto error_up; + } + + D1("vldc_ioctl_opt_op: ldc %ld initialized successfully\n", + vport->ldc_id); + + return (0); + +error_up: + vport->status = VLDC_PORT_OPEN; + (void) ldc_close(vport->ldc_handle); +error_open: + (void) ldc_unreg_callback(vport->ldc_handle); +error_reg: + (void) ldc_fini(vport->ldc_handle); +error_init: + return (rv); +} + +/* ioctl to read cookie */ +static int +i_vldc_ioctl_read_cookie(vldc_port_t *vport, int vldc_instance, void *arg, + int mode) +{ + vldc_data_t copy_info; + caddr_t buf; + uint64_t len; + int rv; + + if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) == -1) { + return (EFAULT); + } + + len = copy_info.length; + if (len > vldc_max_cookie) { + return (EINVAL); + } + + /* allocate a temporary buffer */ + buf = kmem_alloc(len, KM_SLEEP); + + mutex_enter(&vport->minorp->lock); + + D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%lx " + "size 0x%lx to 0x%lx\n", vldc_instance, vport->number, + copy_info.dst_addr, copy_info.length, copy_info.src_addr); + + /* read from the HV into the temporary buffer */ + rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len, + (caddr_t)copy_info.dst_addr, LDC_COPY_IN); + if (rv != 0) { + DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot read " + "address 0x%lx, rv=%d\n", vldc_instance, vport->number, + copy_info.dst_addr, rv); + mutex_exit(&vport->minorp->lock); + kmem_free(buf, copy_info.length); + return (EFAULT); + } + + D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n", + vldc_instance, vport->number); + + mutex_exit(&vport->minorp->lock); + + /* copy data from temporary buffer out to the caller and free buffer */ + rv = ddi_copyout(buf, (caddr_t)copy_info.src_addr, len, mode); + kmem_free(buf, copy_info.length); + if (rv != 0) { + return (EFAULT); + } + + /* set the 
structure to reflect outcome */ + copy_info.length = len; + if (ddi_copyout(&copy_info, arg, sizeof (copy_info), mode) != 0) { + return (EFAULT); + } + + return (0); +} + +/* ioctl to write cookie */ +static int +i_vldc_ioctl_write_cookie(vldc_port_t *vport, int vldc_instance, void *arg, + int mode) +{ + vldc_data_t copy_info; + caddr_t buf; + uint64_t len; + int rv; + + if (ddi_copyin((caddr_t)arg, &copy_info, + sizeof (copy_info), mode) != 0) { + return (EFAULT); + } + + len = copy_info.length; + if (len > vldc_max_cookie) { + return (EINVAL); + } + + D2("i_vldc_ioctl_write_cookie: vldc@%d:%d writing 0x%lx size 0x%lx " + "to 0x%lx\n", vldc_instance, vport->number, copy_info.src_addr, + copy_info.length, copy_info.dst_addr); + + /* allocate a temporary buffer */ + buf = kmem_alloc(len, KM_SLEEP); + + /* copy into the temporary buffer the data to be written to the HV */ + if (ddi_copyin((caddr_t)copy_info.src_addr, buf, + copy_info.length, mode) != 0) { + kmem_free(buf, copy_info.length); + return (EFAULT); + } + + mutex_enter(&vport->minorp->lock); + + /* write the data from the temporary buffer to the HV */ + rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len, + (caddr_t)copy_info.dst_addr, LDC_COPY_OUT); + if (rv != 0) { + DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d failed to write at" + " address 0x%lx\n, rv=%d", vldc_instance, vport->number, + copy_info.dst_addr, rv); + mutex_exit(&vport->minorp->lock); + kmem_free(buf, copy_info.length); + return (EFAULT); + } + + D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n", + vldc_instance, vport->number); + + mutex_exit(&vport->minorp->lock); + + kmem_free(buf, copy_info.length); + + /* set the structure to reflect outcome */ + copy_info.length = len; + if (ddi_copyout(&copy_info, (caddr_t)arg, + sizeof (copy_info), mode) != 0) { + return (EFAULT); + } + + return (0); +} + +/* vldc specific ioctl option commands */ +static int +i_vldc_ioctl_opt_op(vldc_port_t *vport, vldc_t *vldcp, void *arg, int mode) +{ + 
vldc_opt_op_t vldc_cmd; + uint32_t new_mtu; + int rv = 0; + + if (ddi_copyin(arg, &vldc_cmd, sizeof (vldc_cmd), mode) != 0) { + return (EFAULT); + } + + D1("vldc_ioctl_opt_op: op %d\n", vldc_cmd.opt_sel); + + switch (vldc_cmd.opt_sel) { + + case VLDC_OPT_MTU_SZ: + + if (vldc_cmd.op_sel == VLDC_OP_GET) { + vldc_cmd.opt_val = vport->mtu; + if (ddi_copyout(&vldc_cmd, arg, + sizeof (vldc_cmd), mode) == -1) { + return (EFAULT); + } + } else { + new_mtu = vldc_cmd.opt_val; + + if ((new_mtu < LDC_PACKET_SIZE) || + (new_mtu > vldc_max_mtu)) { + return (EINVAL); + } + + mutex_enter(&vport->minorp->lock); + + if ((vport->status != VLDC_PORT_CLOSED) && + (new_mtu != vport->mtu)) { + /* + * The port has buffers allocated since it is + * not closed plus the MTU size has changed. + * Reallocate the buffers to the new MTU size. + */ + kmem_free(vport->recv_buf, vport->mtu); + vport->recv_buf = kmem_alloc(new_mtu, KM_SLEEP); + + kmem_free(vport->send_buf, vport->mtu); + vport->send_buf = kmem_alloc(new_mtu, KM_SLEEP); + + vport->mtu = new_mtu; + } + + mutex_exit(&vport->minorp->lock); + } + + break; + + case VLDC_OPT_STATUS: + + if (vldc_cmd.op_sel == VLDC_OP_GET) { + vldc_cmd.opt_val = vport->status; + if (ddi_copyout(&vldc_cmd, arg, + sizeof (vldc_cmd), mode) == -1) { + return (EFAULT); + } + } else { + return (ENOTSUP); + } + + break; + + case VLDC_OPT_MODE: + + if (vldc_cmd.op_sel == VLDC_OP_GET) { + vldc_cmd.opt_val = vport->ldc_mode; + if (ddi_copyout(&vldc_cmd, arg, + sizeof (vldc_cmd), mode) == -1) { + return (EFAULT); + } + } else { + mutex_enter(&vport->minorp->lock); + rv = vldc_set_ldc_mode(vport, vldcp, vldc_cmd.opt_val); + mutex_exit(&vport->minorp->lock); + } + + break; + + default: + + D1("vldc_ioctl_opt_op: unsupported op %d\n", vldc_cmd.opt_sel); + return (ENOTSUP); + } + + return (rv); +} + +/* cb_ioctl */ +static int +vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + _NOTE(ARGUNUSED(credp, rvalp)) + + int rv = EINVAL; + 
int instance; + minor_t minor; + uint64_t portno; + vldc_t *vldcp; + vldc_port_t *vport; + vldc_minor_t *vminor; + + minor = getminor(dev); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (ENXIO); + } + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENOLINK); + } + vminor->in_use += 1; + mutex_exit(&vminor->lock); + + vport = &(vldcp->port[portno]); + + D1("vldc_ioctl: vldc@%d:%lu cmd=0x%x\n", instance, portno, cmd); + + switch (cmd) { + + case VLDC_IOCTL_OPT_OP: + + rv = i_vldc_ioctl_opt_op(vport, vldcp, (void *)arg, mode); + break; + + case VLDC_IOCTL_READ_COOKIE: + + rv = i_vldc_ioctl_read_cookie(vport, instance, + (void *)arg, mode); + break; + + case VLDC_IOCTL_WRITE_COOKIE: + + rv = i_vldc_ioctl_write_cookie(vport, instance, + (void *)arg, mode); + break; + + default: + + DWARN("vldc_ioctl: vldc@%d:%lu unknown cmd=0x%x\n", + instance, portno, cmd); + rv = EINVAL; + break; + } + + mutex_enter(&vminor->lock); + vminor->in_use -= 1; + if (vminor->in_use == 0) { + cv_signal(&vminor->cv); + } + mutex_exit(&vminor->lock); + + D1("vldc_ioctl: rv=%d\n", rv); + + return (rv); +} + +/* cb_read */ +static int +vldc_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + _NOTE(ARGUNUSED(credp)) + + int instance; + minor_t minor; + size_t size = 0; + uint64_t portno; + vldc_t *vldcp; + vldc_port_t *vport; + vldc_minor_t *vminor; + int rv = 0; + + minor = getminor(dev); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (ENXIO); + } + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENOLINK); + } + + D2("vldc_read: vldc@%d:%lu reading data\n", instance, portno); + + vport = &(vldcp->port[portno]); + + /* check 
the port status */ + if (vport->status != VLDC_PORT_READY) { + DWARN("vldc_read: vldc@%d:%lu not in the ready state\n", + instance, portno); + mutex_exit(&vminor->lock); + return (ENOTACTIVE); + } + + /* read data */ + size = MIN(vport->mtu, uiop->uio_resid); + rv = ldc_read(vport->ldc_handle, vport->recv_buf, &size); + + D2("vldc_read: vldc@%d:%lu ldc_read size=%ld, rv=%d\n", + instance, portno, size, rv); + + if (rv == 0) { + if (size != 0) { + rv = uiomove(vport->recv_buf, size, UIO_READ, uiop); + } else { + rv = EWOULDBLOCK; + } + } else { + switch (rv) { + case ENOBUFS: + break; + case ETIMEDOUT: + case EWOULDBLOCK: + rv = EWOULDBLOCK; + break; + default: + rv = ECONNRESET; + break; + } + } + + mutex_exit(&vminor->lock); + + return (rv); +} + +/* cb_write */ +static int +vldc_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + _NOTE(ARGUNUSED(credp)) + + int instance; + minor_t minor; + size_t size; + size_t orig_size; + uint64_t portno; + vldc_t *vldcp; + vldc_port_t *vport; + vldc_minor_t *vminor; + int rv = EINVAL; + + minor = getminor(dev); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (ENXIO); + } + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENOLINK); + } + + vport = &(vldcp->port[portno]); + + /* check the port status */ + if (vport->status != VLDC_PORT_READY) { + DWARN("vldc_write: vldc@%d:%lu not in the ready state\n", + instance, portno); + mutex_exit(&vminor->lock); + return (ENOTACTIVE); + } + + orig_size = uiop->uio_resid; + size = orig_size; + + if (size > vport->mtu) { + if (vport->is_stream) { + /* can only send MTU size at a time */ + size = vport->mtu; + } else { + mutex_exit(&vminor->lock); + return (EMSGSIZE); + } + } + + D2("vldc_write: vldc@%d:%lu writing %lu bytes\n", instance, portno, + size); + + rv = uiomove(vport->send_buf, size, UIO_WRITE, 
uiop); + if (rv == 0) { + rv = ldc_write(vport->ldc_handle, (caddr_t)vport->send_buf, + &size); + if (rv != 0) { + DWARN("vldc_write: vldc@%d:%lu failed writing %lu " + "bytes rv=%d\n", instance, portno, size, rv); + } + } else { + size = 0; + } + + mutex_exit(&vminor->lock); + + /* resid is total number of bytes *not* sent */ + uiop->uio_resid = orig_size - size; + + return (rv); +} + +/* cb_chpoll */ +static int +vldc_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int instance; + minor_t minor; + uint64_t portno; + vldc_t *vldcp; + vldc_port_t *vport; + vldc_minor_t *vminor; + ldc_status_t ldc_state; + boolean_t isempty; + int rv; + + minor = getminor(dev); + instance = VLDCINST(minor); + vldcp = ddi_get_soft_state(vldc_ssp, instance); + if (vldcp == NULL) { + return (ENXIO); + } + + vminor = VLDCMINOR(vldcp, minor); + mutex_enter(&vminor->lock); + portno = vminor->portno; + if (portno == VLDC_INVALID_PORTNO) { + mutex_exit(&vminor->lock); + return (ENOLINK); + } + + vport = &(vldcp->port[portno]); + + /* check the port status */ + if (vport->status != VLDC_PORT_READY) { + mutex_exit(&vminor->lock); + return (ENOTACTIVE); + } + + D2("vldc_chpoll: vldc@%d:%lu polling events 0x%x\n", + instance, portno, events); + + rv = ldc_status(vport->ldc_handle, &ldc_state); + if (rv != 0) { + DWARN("vldc_chpoll: vldc@%d:%lu could not get ldc status, " + "rv=%d\n", instance, portno, rv); + mutex_exit(&vminor->lock); + return (EBADFD); + } + + *reventsp = 0; + + if (ldc_state == LDC_UP) { + /* + * Check if the receive queue is empty and if not, signal that + * there is data ready to read. 
+ */ + if (events & POLLIN) { + if ((ldc_chkq(vport->ldc_handle, &isempty) == 0) && + (isempty == B_FALSE)) { + *reventsp |= POLLIN; + } + } + + if (events & POLLOUT) + *reventsp |= POLLOUT; + + } else if (vport->hanged_up) { + *reventsp |= POLLHUP; + vport->hanged_up = B_FALSE; + } + + mutex_exit(&vminor->lock); + + if (((*reventsp) == 0) && (!anyyet)) { + *phpp = &vport->poll; + } + + D2("vldc_chpoll: vldc@%d:%lu ev=0x%x, rev=0x%x\n", + instance, portno, events, *reventsp); + + return (0); +} diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c new file mode 100644 index 0000000000..ad625953e7 --- /dev/null +++ b/usr/src/uts/sun4v/io/vnet.c @@ -0,0 +1,1049 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/stream.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/ksynch.h> +#include <sys/stat.h> +#include <sys/modctl.h> +#include <sys/debug.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <net/if.h> +#include <sys/mac.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/note.h> +#include <sys/vnet.h> + +/* + * Function prototypes. + */ + +/* DDI entrypoints */ +static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int vnetattach(dev_info_t *, ddi_attach_cmd_t); +static int vnetdetach(dev_info_t *, ddi_detach_cmd_t); + +/* MAC entrypoints */ +static uint64_t vnet_m_stat(void *arg, enum mac_stat stat); +static int vnet_m_start(void *); +static void vnet_m_stop(void *); +static int vnet_m_promisc(void *, boolean_t); +static int vnet_m_multicst(void *, boolean_t, const uint8_t *); +static int vnet_m_unicst(void *, const uint8_t *); +static void vnet_m_resources(void *); +static void vnet_m_ioctl(void *, queue_t *, mblk_t *); +mblk_t *vnet_m_tx(void *, mblk_t *); + +/* vnet internal functions */ +static int vnet_mac_register(vnet_t *); +static int vnet_read_mac_address(vnet_t *vnetp); +static void vnet_add_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp); +static void vnet_del_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp); +static vp_tl_t *vnet_get_vptl(vnet_t *vnetp, const char *devname); +static fdb_t *vnet_lookup_fdb(fdb_fanout_t *fdbhp, uint8_t *macaddr); + +/* exported functions */ +void vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg); +void vnet_del_fdb(void *arg, uint8_t *macaddr); +void vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg); +void vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg); +void vnet_del_def_rte(void *arg); + +/* externs */ +extern int vgen_init(void *vnetp, dev_info_t 
*vnetdip, void *vnetmacp, + const uint8_t *macaddr, mac_t **vgenmacp); +extern void vgen_uninit(void *arg); + +/* + * Linked list of "vnet_t" structures - one per instance. + */ +static vnet_t *vnet_headp = NULL; +static krwlock_t vnet_rw; + +/* Tunables */ +uint32_t vnet_ntxds = VNET_NTXDS; /* power of 2 transmit descriptors */ +uint32_t vnet_reclaim_lowat = VNET_RECLAIM_LOWAT; /* tx recl low watermark */ +uint32_t vnet_reclaim_hiwat = VNET_RECLAIM_HIWAT; /* tx recl high watermark */ +uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */ +uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT; /* tx timeout in msec */ +uint32_t vnet_ldc_qlen = VNET_LDC_QLEN; /* ldc qlen */ +uint32_t vnet_nfdb_hash = VNET_NFDB_HASH; /* size of fdb hash table */ + +/* + * Property names + */ +static char macaddr_propname[] = "local-mac-address"; + +static struct ether_addr etherbroadcastaddr = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +/* + * MIB II broadcast/multicast packets + */ +#define IS_BROADCAST(ehp) \ + (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0) +#define IS_MULTICAST(ehp) \ + ((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1) + +/* + * This is the string displayed by modinfo(1m). 
+ */ +static char vnet_ident[] = "vnet driver v1.0"; +extern struct mod_ops mod_driverops; +static struct cb_ops cb_vnetops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + (int)(D_MP) /* cb_flag */ +}; + +static struct dev_ops vnetops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + NULL, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + vnetattach, /* devo_attach */ + vnetdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_vnetops, /* devo_cb_ops */ + (struct bus_ops *)NULL /* devo_bus_ops */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. This one is a driver */ + vnet_ident, /* ID string */ + &vnetops /* driver specific ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modldrv, NULL +}; + + +/* + * Print debug messages - set to 0xf to enable all msgs + */ +int _vnet_dbglevel = 0x8; + +void +_vnetdebug_printf(void *arg, const char *fmt, ...) 
+{ + char buf[512]; + va_list ap; + vnet_t *vnetp = (vnet_t *)arg; + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + if (vnetp == NULL) + cmn_err(CE_CONT, "%s\n", buf); + else + cmn_err(CE_CONT, "vnet%d: %s\n", vnetp->instance, buf); +} + +#ifdef DEBUG + +/* + * XXX: any changes to the definitions below need corresponding changes in + * vnet_gen.c + */ + +/* + * debug levels: + * DBG_LEVEL1: Function entry/exit tracing + * DBG_LEVEL2: Info messages + * DBG_LEVEL3: Warning messages + * DBG_LEVEL4: Error messages + */ + +enum { DBG_LEVEL1 = 0x01, DBG_LEVEL2 = 0x02, DBG_LEVEL3 = 0x04, + DBG_LEVEL4 = 0x08 }; + +#define DBG1(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL1) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DBG2(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL2) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DWARN(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL3) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DERR(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL4) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#else + +#define DBG1(_s) if (0) _vnetdebug_printf _s +#define DBG2(_s) if (0) _vnetdebug_printf _s +#define DWARN(_s) if (0) _vnetdebug_printf _s +#define DERR(_s) if (0) _vnetdebug_printf _s + +#endif + +/* _init(9E): initialize the loadable module */ +int +_init(void) +{ + int status; + + DBG1((NULL, "_init: enter\n")); + + mac_init_ops(&vnetops, "vnet"); + status = mod_install(&modlinkage); + if (status != 0) { + mac_fini_ops(&vnetops); + } + + DBG1((NULL, "_init: exit\n")); + return (status); +} + +/* _fini(9E): prepare the module for unloading. 
*/ +int +_fini(void) +{ + int status; + + DBG1((NULL, "_fini: enter\n")); + + status = mod_remove(&modlinkage); + if (status != 0) + return (status); + mac_fini_ops(&vnetops); + + DBG1((NULL, "_fini: exit\n")); + return (status); +} + +/* _info(9E): return information about the loadable module */ +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * attach(9E): attach a device to the system. + * called once for each instance of the device on the system. + */ +static int +vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + mac_t *macp; + vnet_t *vnetp; + vp_tl_t *vp_tlp; + int instance; + int status; + enum { AST_init = 0x0, AST_vnet_alloc = 0x1, + AST_mac_alloc = 0x2, AST_read_macaddr = 0x4, + AST_vgen_init = 0x8, AST_vptl_alloc = 0x10, + AST_fdbh_alloc = 0x20 } + attach_state; + mac_t *vgenmacp = NULL; + uint32_t nfdbh = 0; + + attach_state = AST_init; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + case DDI_PM_RESUME: + default: + goto vnet_attach_fail; + } + + instance = ddi_get_instance(dip); + DBG1((NULL, "vnetattach: instance(%d) enter\n", instance)); + + /* allocate vnet_t and mac_t structures */ + vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP); + attach_state |= AST_vnet_alloc; + + macp = kmem_zalloc(sizeof (mac_t), KM_SLEEP); + attach_state |= AST_mac_alloc; + + /* setup links to vnet_t from both devinfo and mac_t */ + ddi_set_driver_private(dip, (caddr_t)vnetp); + macp->m_driver = vnetp; + vnetp->dip = dip; + vnetp->macp = macp; + vnetp->instance = instance; + + /* read the mac address */ + status = vnet_read_mac_address(vnetp); + if (status != DDI_SUCCESS) { + goto vnet_attach_fail; + } + attach_state |= AST_read_macaddr; + + /* + * Initialize the generic vnet proxy transport. This is the first + * and default transport used by vnet. The generic transport + * is provided by using sun4v LDC (logical domain channel). 
On success, + * vgen_init() provides a pointer to mac_t of generic transport. + * Currently, this generic layer provides network connectivity to other + * vnets within ldoms and also to remote hosts oustide ldoms through + * the virtual switch (vsw) device on domain0. In the future, when + * physical adapters that are able to share their resources (such as + * dma channels) with guest domains become available, the vnet device + * will use hardware specific driver to communicate directly over the + * physical device to reach remote hosts without going through vswitch. + */ + status = vgen_init(vnetp, vnetp->dip, vnetp->macp, + (uint8_t *)vnetp->curr_macaddr, &vgenmacp); + if (status != DDI_SUCCESS) { + DERR((vnetp, "vgen_init() failed\n")); + goto vnet_attach_fail; + } + attach_state |= AST_vgen_init; + + vp_tlp = kmem_zalloc(sizeof (vp_tl_t), KM_SLEEP); + vp_tlp->macp = vgenmacp; + (void) snprintf(vp_tlp->name, MAXNAMELEN, "%s%u", "vgen", instance); + (void) strcpy(vnetp->vgen_name, vp_tlp->name); + + /* add generic transport to the list of vnet proxy transports */ + vnet_add_vptl(vnetp, vp_tlp); + attach_state |= AST_vptl_alloc; + + nfdbh = vnet_nfdb_hash; + if ((nfdbh < VNET_NFDB_HASH) || (nfdbh > VNET_NFDB_HASH_MAX)) { + vnetp->nfdb_hash = VNET_NFDB_HASH; + } + else + vnetp->nfdb_hash = nfdbh; + + /* allocate fdb hash table, with an extra slot for default route */ + vnetp->fdbhp = kmem_zalloc(sizeof (fdb_fanout_t) * + (vnetp->nfdb_hash + 1), KM_SLEEP); + attach_state |= AST_fdbh_alloc; + + /* register with MAC layer */ + status = vnet_mac_register(vnetp); + if (status != DDI_SUCCESS) { + goto vnet_attach_fail; + } + + /* add to the list of vnet devices */ + WRITE_ENTER(&vnet_rw); + vnetp->nextp = vnet_headp; + vnet_headp = vnetp; + RW_EXIT(&vnet_rw); + + DBG1((NULL, "vnetattach: instance(%d) exit\n", instance)); + return (DDI_SUCCESS); + +vnet_attach_fail: + if (attach_state & AST_fdbh_alloc) { + kmem_free(vnetp->fdbhp, + sizeof (fdb_fanout_t) * 
(vnetp->nfdb_hash + 1)); + } + if (attach_state & AST_vptl_alloc) { + WRITE_ENTER(&vnetp->trwlock); + vnet_del_vptl(vnetp, vp_tlp); + RW_EXIT(&vnetp->trwlock); + } + if (attach_state & AST_vgen_init) { + vgen_uninit(vgenmacp->m_driver); + } + if (attach_state & AST_mac_alloc) { + KMEM_FREE(macp); + } + if (attach_state & AST_vnet_alloc) { + KMEM_FREE(vnetp); + } + return (DDI_FAILURE); +} + +/* + * detach(9E): detach a device from the system. + */ +static int +vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + vnet_t *vnetp; + vnet_t **vnetpp; + vp_tl_t *vp_tlp; + int instance; + + instance = ddi_get_instance(dip); + DBG1((NULL, "vnetdetach: instance(%d) enter\n", instance)); + + vnetp = ddi_get_driver_private(dip); + if (vnetp == NULL) { + goto vnet_detach_fail; + } + + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + case DDI_PM_SUSPEND: + default: + goto vnet_detach_fail; + } + + /* + * Unregister from the MAC subsystem. This can fail, in + * particular if there are DLPI style-2 streams still open - + * in which case we just return failure. 
+ */
+ if (mac_unregister(vnetp->macp) != 0)
+ goto vnet_detach_fail;
+
+ /* unlink from instance(vnet_t) list */
+ WRITE_ENTER(&vnet_rw);
+ for (vnetpp = &vnet_headp; *vnetpp; vnetpp = &(*vnetpp)->nextp) {
+ if (*vnetpp == vnetp) {
+ *vnetpp = vnetp->nextp;
+ break;
+ }
+ }
+ RW_EXIT(&vnet_rw);
+
+ /* uninit and free vnet proxy transports */
+ WRITE_ENTER(&vnetp->trwlock);
+ while ((vp_tlp = vnetp->tlp) != NULL) {
+ if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) {
+ /* uninitialize generic transport */
+ vgen_uninit(vp_tlp->macp->m_driver);
+ }
+ vnet_del_vptl(vnetp, vp_tlp);
+ }
+ RW_EXIT(&vnetp->trwlock);
+
+ KMEM_FREE(vnetp->macp);
+ KMEM_FREE(vnetp);
+
+ return (DDI_SUCCESS);
+
+vnet_detach_fail:
+ return (DDI_FAILURE);
+}
+
+/* enable the device for transmit/receive */
+static int
+vnet_m_start(void *arg)
+{
+ vnet_t *vnetp = arg;
+ vp_tl_t *vp_tlp;
+ mac_t *vp_macp;
+
+ DBG1((vnetp, "vnet_m_start: enter\n"));
+
+ /*
+ * XXX
+ * Currently, we only have generic transport. m_start() invokes
+ * vgen_start() which enables ports/channels in vgen and
+ * initiates handshake with peer vnets and vsw. In the future when we
+ * have support for hardware specific transports, this information
+ * needs to be propagated back to vnet from vgen and we need to revisit
+ * this code (see comments in vnet_attach()). 
+ * + */ + WRITE_ENTER(&vnetp->trwlock); + for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) { + vp_macp = vp_tlp->macp; + vp_macp->m_start(vp_macp->m_driver); + } + RW_EXIT(&vnetp->trwlock); + + DBG1((vnetp, "vnet_m_start: exit\n")); + return (VNET_SUCCESS); + +} + +/* stop transmit/receive for the device */ +static void +vnet_m_stop(void *arg) +{ + vnet_t *vnetp = arg; + vp_tl_t *vp_tlp; + mac_t *vp_macp; + + DBG1((vnetp, "vnet_m_stop: enter\n")); + + WRITE_ENTER(&vnetp->trwlock); + for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) { + vp_macp = vp_tlp->macp; + vp_macp->m_stop(vp_macp->m_driver); + } + RW_EXIT(&vnetp->trwlock); + + DBG1((vnetp, "vnet_m_stop: exit\n")); +} + +/* set the unicast mac address of the device */ +static int +vnet_m_unicst(void *arg, const uint8_t *macaddr) +{ + _NOTE(ARGUNUSED(macaddr)) + + vnet_t *vnetp = arg; + + DBG1((vnetp, "vnet_m_unicst: enter\n")); + /* + * XXX: setting mac address dynamically is not supported. + */ +#if 0 + bcopy(macaddr, vnetp->curr_macaddr, ETHERADDRL); +#endif + DBG1((vnetp, "vnet_m_unicst: exit\n")); + + return (VNET_SUCCESS); +} + +/* enable/disable a multicast address */ +static int +vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca) +{ + _NOTE(ARGUNUSED(add, mca)) + + vnet_t *vnetp = arg; + vp_tl_t *vp_tlp; + mac_t *vp_macp; + int rv = VNET_SUCCESS; + + DBG1((vnetp, "vnet_m_multicst: enter\n")); + READ_ENTER(&vnetp->trwlock); + for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) { + if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) { + vp_macp = vp_tlp->macp; + rv = vp_macp->m_multicst(vp_macp->m_driver, add, mca); + break; + } + } + RW_EXIT(&vnetp->trwlock); + DBG1((vnetp, "vnet_m_multicst: exit\n")); + return (rv); +} + +/* set or clear promiscuous mode on the device */ +static int +vnet_m_promisc(void *arg, boolean_t on) +{ + _NOTE(ARGUNUSED(on)) + + vnet_t *vnetp = arg; + DBG1((vnetp, "vnet_m_promisc: enter\n")); + /* + * XXX: setting promiscuous 
mode is not supported, just return success. + */ + DBG1((vnetp, "vnet_m_promisc: exit\n")); + return (VNET_SUCCESS); +} + +/* + * Transmit a chain of packets. This function provides switching functionality + * based on the destination mac address to reach other guests (within ldoms) or + * external hosts. + */ +mblk_t * +vnet_m_tx(void *arg, mblk_t *mp) +{ + vnet_t *vnetp; + mblk_t *next; + uint32_t fdbhash; + fdb_t *fdbp; + fdb_fanout_t *fdbhp; + struct ether_header *ehp; + uint8_t *macaddr; + mblk_t *resid_mp; + + vnetp = (vnet_t *)arg; + DBG1((vnetp, "vnet_m_tx: enter\n")); + ASSERT(mp != NULL); + + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + + /* get the destination mac address in the eth header */ + ehp = (struct ether_header *)mp->b_rptr; + macaddr = (uint8_t *)&ehp->ether_dhost; + + /* Calculate hash value and fdb fanout */ + fdbhash = MACHASH(macaddr, vnetp->nfdb_hash); + fdbhp = &(vnetp->fdbhp[fdbhash]); + + READ_ENTER(&fdbhp->rwlock); + fdbp = vnet_lookup_fdb(fdbhp, macaddr); + if (fdbp) { + /* + * If the destination is in FDB, the destination is + * a vnet device within ldoms and directly reachable, + * invoke the tx function in the fdb entry. + */ + resid_mp = fdbp->m_tx(fdbp->txarg, mp); + if (resid_mp != NULL) { + /* m_tx failed */ + mp->b_next = next; + RW_EXIT(&fdbhp->rwlock); + break; + } + RW_EXIT(&fdbhp->rwlock); + } else { + /* destination is not in FDB */ + RW_EXIT(&fdbhp->rwlock); + /* + * If the destination is broadcast/multicast + * or an unknown unicast address, forward the + * packet to vsw, using the last slot in fdb which is + * reserved for default route. 
+ */ + fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]); + READ_ENTER(&fdbhp->rwlock); + fdbp = fdbhp->headp; + if (fdbp) { + resid_mp = fdbp->m_tx(fdbp->txarg, mp); + if (resid_mp != NULL) { + /* m_tx failed */ + mp->b_next = next; + RW_EXIT(&fdbhp->rwlock); + break; + } + } else { + /* drop the packet */ + freemsg(mp); + } + RW_EXIT(&fdbhp->rwlock); + } + + mp = next; + } + + DBG1((vnetp, "vnet_m_tx: exit\n")); + return (mp); +} + +/* register resources with mac layer */ +static void +vnet_m_resources(void *arg) +{ + vnet_t *vnetp = arg; + vp_tl_t *vp_tlp; + mac_t *vp_macp; + + DBG1((vnetp, "vnet_m_resources: enter\n")); + + WRITE_ENTER(&vnetp->trwlock); + for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) { + vp_macp = vp_tlp->macp; + vp_macp->m_resources(vp_macp->m_driver); + } + RW_EXIT(&vnetp->trwlock); + + DBG1((vnetp, "vnet_m_resources: exit\n")); +} + +/* + * vnet specific ioctls + */ +static void +vnet_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) +{ + vnet_t *vnetp = (vnet_t *)arg; + struct iocblk *iocp; + int cmd; + + DBG1((vnetp, "vnet_m_ioctl: enter\n")); + + iocp = (struct iocblk *)mp->b_rptr; + iocp->ioc_error = 0; + cmd = iocp->ioc_cmd; + switch (cmd) { + default: + miocnak(wq, mp, 0, EINVAL); + break; + } + DBG1((vnetp, "vnet_m_ioctl: exit\n")); +} + +/* get statistics from the device */ +uint64_t +vnet_m_stat(void *arg, enum mac_stat stat) +{ + vnet_t *vnetp = arg; + vp_tl_t *vp_tlp; + mac_t *vp_macp; + uint64_t val = 0; + + DBG1((vnetp, "vnet_m_stat: enter\n")); + + /* + * get the specified statistic from each transport + * and return the aggregate val + */ + READ_ENTER(&vnetp->trwlock); + for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) { + vp_macp = vp_tlp->macp; + val += vp_macp->m_stat(vp_macp->m_driver, stat); + } + RW_EXIT(&vnetp->trwlock); + + DBG1((vnetp, "vnet_m_stat: exit\n")); + return (val); +} + +/* wrapper function for mac_register() */ +static int +vnet_mac_register(vnet_t *vnetp) +{ + mac_info_t *mip; + 
mac_t *macp;
+
+ macp = vnetp->macp;
+
+ mip = &(macp->m_info);
+ mip->mi_media = DL_ETHER;
+ mip->mi_sdu_min = 0;
+ mip->mi_sdu_max = ETHERMTU;
+ mip->mi_cksum = 0;
+ mip->mi_poll = 0; /* DL_CAPAB_POLL ? */
+ mip->mi_addr_length = ETHERADDRL;
+ bcopy(&etherbroadcastaddr, mip->mi_brdcst_addr, ETHERADDRL);
+ bcopy(vnetp->curr_macaddr, mip->mi_unicst_addr, ETHERADDRL);
+
+ MAC_STAT_MIB(mip->mi_stat);
+ mip->mi_stat[MAC_STAT_UNKNOWNS] = B_FALSE;
+ MAC_STAT_ETHER(mip->mi_stat);
+ mip->mi_stat[MAC_STAT_SQE_ERRORS] = B_FALSE;
+ mip->mi_stat[MAC_STAT_MACRCV_ERRORS] = B_FALSE;
+
+ macp->m_stat = vnet_m_stat;
+ macp->m_start = vnet_m_start;
+ macp->m_stop = vnet_m_stop;
+ macp->m_promisc = vnet_m_promisc;
+ macp->m_multicst = vnet_m_multicst;
+ macp->m_unicst = vnet_m_unicst;
+ macp->m_resources = vnet_m_resources;
+ macp->m_ioctl = vnet_m_ioctl;
+ macp->m_tx = vnet_m_tx;
+
+ macp->m_dip = vnetp->dip;
+ macp->m_ident = MAC_IDENT;
+
+ /*
+ * Finally, we're ready to register ourselves with the MAC layer
+ * interface; if this succeeds, we're all ready to start()
+ */
+ if (mac_register(macp) != 0) {
+ KMEM_FREE(macp);
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+/* add vp_tl to the list */
+static void
+vnet_add_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp)
+{
+ vp_tl_t *ttlp;
+
+ WRITE_ENTER(&vnetp->trwlock);
+ if (vnetp->tlp == NULL) {
+ vnetp->tlp = vp_tlp;
+ } else {
+ ttlp = vnetp->tlp;
+ while (ttlp->nextp)
+ ttlp = ttlp->nextp;
+ ttlp->nextp = vp_tlp;
+ }
+ RW_EXIT(&vnetp->trwlock);
+}
+
+/* remove vp_tl from the list */
+static void
+vnet_del_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp)
+{
+ vp_tl_t *ttlp, **pretlp;
+ boolean_t found = B_FALSE;
+
+ pretlp = &vnetp->tlp;
+ ttlp = *pretlp;
+ while (ttlp) {
+ if (ttlp == vp_tlp) {
+ found = B_TRUE;
+ (*pretlp) = ttlp->nextp;
+ ttlp->nextp = NULL;
+ break;
+ }
+ pretlp = &(ttlp->nextp);
+ ttlp = *pretlp;
+ }
+
+ if (found) {
+ KMEM_FREE(vp_tlp);
+ }
+}
+
+/* get vp_tl corresponding to the given name */
+static vp_tl_t * 
+vnet_get_vptl(vnet_t *vnetp, const char *name) +{ + vp_tl_t *tlp; + + tlp = vnetp->tlp; + while (tlp) { + if (strcmp(tlp->name, name) == 0) { + return (tlp); + } + tlp = tlp->nextp; + } + DWARN((vnetp, + "vnet_get_vptl: can't find vp_tl with name (%s)\n", name)); + return (NULL); +} + +/* read the mac address of the device */ +static int +vnet_read_mac_address(vnet_t *vnetp) +{ + uchar_t *macaddr; + uint32_t size; + int rv; + + rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip, + DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size); + if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) { + DWARN((vnetp, + "vnet_read_mac_address: prop_lookup failed (%s) err (%d)\n", + macaddr_propname, rv)); + return (DDI_FAILURE); + } + bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL); + bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL); + ddi_prop_free(macaddr); + + return (DDI_SUCCESS); +} + + +/* + * Functions below are called only by generic transport to add/remove/modify + * entries in forwarding database. See comments in vgen_port_init(vnet_gen.c). 
+ */ + +/* add an entry into the forwarding database */ +void +vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg) +{ + vnet_t *vnetp = (vnet_t *)arg; + uint32_t fdbhash; + fdb_t *fdbp; + fdb_fanout_t *fdbhp; + + /* Calculate hash value and fdb fanout */ + fdbhash = MACHASH(macaddr, vnetp->nfdb_hash); + fdbhp = &(vnetp->fdbhp[fdbhash]); + + WRITE_ENTER(&fdbhp->rwlock); + + fdbp = kmem_zalloc(sizeof (fdb_t), KM_NOSLEEP); + if (fdbp == NULL) { + RW_EXIT(&fdbhp->rwlock); + return; + } + bcopy(macaddr, (caddr_t)fdbp->macaddr, ETHERADDRL); + fdbp->m_tx = m_tx; + fdbp->txarg = txarg; + fdbp->nextp = fdbhp->headp; + fdbhp->headp = fdbp; + + RW_EXIT(&fdbhp->rwlock); +} + +/* delete an entry from the forwarding database */ +void +vnet_del_fdb(void *arg, uint8_t *macaddr) +{ + vnet_t *vnetp = (vnet_t *)arg; + uint32_t fdbhash; + fdb_t *fdbp; + fdb_t **pfdbp; + fdb_fanout_t *fdbhp; + + /* Calculate hash value and fdb fanout */ + fdbhash = MACHASH(macaddr, vnetp->nfdb_hash); + fdbhp = &(vnetp->fdbhp[fdbhash]); + + WRITE_ENTER(&fdbhp->rwlock); + + for (pfdbp = &fdbhp->headp; (fdbp = *pfdbp) != NULL; + pfdbp = &fdbp->nextp) { + if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) { + /* Unlink it from the list */ + *pfdbp = fdbp->nextp; + KMEM_FREE(fdbp); + break; + } + } + + RW_EXIT(&fdbhp->rwlock); +} + +/* modify an existing entry in the forwarding database */ +void +vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg) +{ + vnet_t *vnetp = (vnet_t *)arg; + uint32_t fdbhash; + fdb_t *fdbp; + fdb_fanout_t *fdbhp; + + /* Calculate hash value and fdb fanout */ + fdbhash = MACHASH(macaddr, vnetp->nfdb_hash); + fdbhp = &(vnetp->fdbhp[fdbhash]); + + WRITE_ENTER(&fdbhp->rwlock); + + for (fdbp = fdbhp->headp; fdbp != NULL; fdbp = fdbp->nextp) { + if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) { + /* change the entry to have new tx params */ + fdbp->m_tx = m_tx; + fdbp->txarg = txarg; + break; + } + } + + RW_EXIT(&fdbhp->rwlock); +} + +/* look up 
an fdb entry based on the mac address, caller holds lock */ +static fdb_t * +vnet_lookup_fdb(fdb_fanout_t *fdbhp, uint8_t *macaddr) +{ + fdb_t *fdbp = NULL; + + for (fdbp = fdbhp->headp; fdbp != NULL; fdbp = fdbp->nextp) { + if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) { + break; + } + } + + return (fdbp); +} + +/* add default route entry into the forwarding database */ +void +vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg) +{ + vnet_t *vnetp = (vnet_t *)arg; + fdb_t *fdbp; + fdb_fanout_t *fdbhp; + + /* + * The last hash list is reserved for default route entry, + * and for now, we have only one entry in this list. + */ + fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]); + + WRITE_ENTER(&fdbhp->rwlock); + + if (fdbhp->headp) { + DWARN((vnetp, + "vnet_add_def_rte: default rte already exists\n")); + RW_EXIT(&fdbhp->rwlock); + return; + } + fdbp = kmem_zalloc(sizeof (fdb_t), KM_NOSLEEP); + if (fdbp == NULL) { + RW_EXIT(&fdbhp->rwlock); + return; + } + bzero(fdbp->macaddr, ETHERADDRL); + fdbp->m_tx = m_tx; + fdbp->txarg = txarg; + fdbp->nextp = NULL; + fdbhp->headp = fdbp; + + RW_EXIT(&fdbhp->rwlock); +} + +/* delete default route entry from the forwarding database */ +void +vnet_del_def_rte(void *arg) +{ + vnet_t *vnetp = (vnet_t *)arg; + fdb_t *fdbp; + fdb_fanout_t *fdbhp; + + /* + * The last hash list is reserved for default route entry, + * and for now, we have only one entry in this list. 
+ */ + fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]); + + WRITE_ENTER(&fdbhp->rwlock); + + if (fdbhp->headp == NULL) { + RW_EXIT(&fdbhp->rwlock); + return; + } + fdbp = fdbhp->headp; + KMEM_FREE(fdbp); + fdbhp->headp = NULL; + + RW_EXIT(&fdbhp->rwlock); +} diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c new file mode 100644 index 0000000000..56f753e5e7 --- /dev/null +++ b/usr/src/uts/sun4v/io/vnet_gen.c @@ -0,0 +1,4899 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/stream.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/ksynch.h> +#include <sys/stat.h> +#include <sys/modctl.h> +#include <sys/debug.h> +#include <sys/ethernet.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/note.h> +#include <sys/mac.h> +#include <sys/ldc.h> +#include <sys/mach_descrip.h> +#include <sys/mdeg.h> +#include <sys/vio_mailbox.h> +#include <sys/vio_common.h> +#include <sys/vnet_common.h> +#include <sys/vnet_gen.h> +#include <sys/vnet_mailbox.h> + +/* + * Implementation of the mac functionality for vnet using the + * generic(default) transport layer of sun4v Logical Domain Channels(LDC). + */ + +/* + * Function prototypes. + */ +/* vgen proxy entry points */ +int vgen_init(void *vnetp, dev_info_t *vnetdip, void *vnetmacp, + const uint8_t *macaddr, mac_t **vgenmacp); +void vgen_uninit(void *arg); +static int vgen_start(void *arg); +static void vgen_stop(void *arg); +static mblk_t *vgen_tx(void *arg, mblk_t *mp); +static void vgen_resources(void *arg); +static int vgen_multicst(void *arg, boolean_t add, + const uint8_t *mca); +static int vgen_promisc(void *arg, boolean_t on); +static int vgen_unicst(void *arg, const uint8_t *mca); +static uint64_t vgen_stat(void *arg, enum mac_stat stat); +static void vgen_ioctl(void *arg, queue_t *wq, mblk_t *mp); + +/* externs - functions provided by vnet to add/remove/modify entries in fdb */ +void vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg); +void vnet_del_fdb(void *arg, uint8_t *macaddr); +void vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg); +void vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg); +void vnet_del_def_rte(void *arg); + +/* vgen internal functions */ +static void vgen_detach_ports(vgen_t *vgenp); +static void 
vgen_port_detach(vgen_port_t *portp); +static void vgen_port_list_insert(vgen_port_t *portp); +static void vgen_port_list_remove(vgen_port_t *portp); +static vgen_port_t *vgen_port_lookup(vgen_portlist_t *plistp, + int port_num); +static int vgen_mdeg_reg(vgen_t *vgenp); +static void vgen_mdeg_unreg(vgen_t *vgenp); +static int vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp); +static int vgen_add_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex); +static int vgen_remove_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex); +static int vgen_port_attach_mdeg(vgen_t *vgenp, int port_num, uint64_t *ldcids, + int num_ids, struct ether_addr *macaddr, boolean_t vsw_port); +static void vgen_port_detach_mdeg(vgen_port_t *portp); +static int vgen_update_port(vgen_t *vgenp, md_t *curr_mdp, + mde_cookie_t curr_mdex, md_t *prev_mdp, mde_cookie_t prev_mdex); +static uint64_t vgen_port_stat(vgen_port_t *portp, enum mac_stat stat); + +static int vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id); +static void vgen_ldc_detach(vgen_ldc_t *ldcp); +static int vgen_alloc_tx_ring(vgen_ldc_t *ldcp); +static void vgen_free_tx_ring(vgen_ldc_t *ldcp); +static void vgen_init_ports(vgen_t *vgenp); +static void vgen_port_init(vgen_port_t *portp); +static void vgen_uninit_ports(vgen_t *vgenp); +static void vgen_port_uninit(vgen_port_t *portp); +static void vgen_init_ldcs(vgen_port_t *portp); +static void vgen_uninit_ldcs(vgen_port_t *portp); +static int vgen_ldc_init(vgen_ldc_t *ldcp); +static void vgen_ldc_uninit(vgen_ldc_t *ldcp); +static int vgen_init_tbufs(vgen_ldc_t *ldcp); +static void vgen_uninit_tbufs(vgen_ldc_t *ldcp); +static void vgen_clobber_tbufs(vgen_ldc_t *ldcp); +static void vgen_clobber_rxds(vgen_ldc_t *ldcp); +static uint64_t vgen_ldc_stat(vgen_ldc_t *ldcp, enum mac_stat stat); +static void vgen_init_macp(vgen_t *vgenp, mac_t *macp); +static uint_t vgen_ldc_cb(uint64_t event, caddr_t arg); +static int vgen_portsend(vgen_port_t *portp, mblk_t *mp); +static int 
vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp); +static void vgen_reclaim(vgen_ldc_t *ldcp); +static void vgen_reclaim_dring(vgen_ldc_t *ldcp); +static int vgen_num_txpending(vgen_ldc_t *ldcp); +static int vgen_tx_dring_full(vgen_ldc_t *ldcp); +static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp); +static void vgen_ldc_watchdog(void *arg); +static void vgen_copymsg(mblk_t *mp, void *bufp); +static int vgen_setup_kstats(vgen_ldc_t *ldcp); +static void vgen_destroy_kstats(vgen_ldc_t *ldcp); +static int vgen_kstat_update(kstat_t *ksp, int rw); + +/* vgen handshake functions */ +static vgen_ldc_t *vh_nextphase(vgen_ldc_t *ldcp); +static int vgen_supported_version(vgen_ldc_t *ldcp, uint16_t ver_major, + uint16_t ver_minor); +static int vgen_next_version(vgen_ldc_t *ldcp, vgen_ver_t *verp); +static int vgen_sendmsg(vgen_ldc_t *ldcp, caddr_t msg, size_t msglen, + boolean_t caller_holds_lock); +static int vgen_send_version_negotiate(vgen_ldc_t *ldcp); +static int vgen_send_attr_info(vgen_ldc_t *ldcp); +static int vgen_send_dring_reg(vgen_ldc_t *ldcp); +static int vgen_send_rdx_info(vgen_ldc_t *ldcp); +static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, + uint32_t end, uint64_t next_txseq); +static int vgen_send_mcast_info(vgen_ldc_t *ldcp); +static int vgen_handshake_phase2(vgen_ldc_t *ldcp); +static void vgen_handshake_reset(vgen_ldc_t *ldcp); +static void vgen_reset_hphase(vgen_ldc_t *ldcp); +static void vgen_handshake(vgen_ldc_t *ldcp); +static int vgen_handshake_done(vgen_ldc_t *ldcp); +static void vgen_handshake_retry(vgen_ldc_t *ldcp); +static void vgen_handle_version_negotiate(vgen_ldc_t *ldcp, + vio_msg_tag_t *tagp); +static void vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static void vgen_handle_dring_reg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static void vgen_handle_rdx_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static void vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static void vgen_handle_ctrlmsg(vgen_ldc_t 
*ldcp, vio_msg_tag_t *tagp); +static void vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, + mblk_t **headp, mblk_t **tailp); +static void vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, + mblk_t **headp, mblk_t **tailp); +static void vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static int vgen_check_sid(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); +static uint64_t vgen_macaddr_strtoul(const uint8_t *macaddr); +static int vgen_macaddr_ultostr(uint64_t value, uint8_t *macaddr); +static caddr_t vgen_print_ethaddr(uint8_t *a, char *ebuf); +static void vgen_hwatchdog(void *arg); +static void vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint); +static void vgen_print_hparams(vgen_hparams_t *hp); +static void vgen_print_ldcinfo(vgen_ldc_t *ldcp); + +/* + * The handshake process consists of 5 phases defined below, with VH_PHASE0 + * being the pre-handshake phase and VH_DONE is the phase to indicate + * successful completion of all phases. + * Each phase may have one to several handshake states which are required + * to complete successfully to move to the next phase. + * Refer to the functions vgen_handshake() and vgen_handshake_done() for + * more details. 
+ */
+/* handshake phases */
+enum { VH_PHASE0, VH_PHASE1, VH_PHASE2, VH_PHASE3, VH_DONE = 0x80 };
+
+/* handshake states */
+enum {
+
+ VER_INFO_SENT = 0x1,
+ VER_ACK_RCVD = 0x2,
+ VER_INFO_RCVD = 0x4,
+ VER_ACK_SENT = 0x8,
+ VER_NEGOTIATED = (VER_ACK_RCVD | VER_ACK_SENT),
+
+ ATTR_INFO_SENT = 0x10,
+ ATTR_ACK_RCVD = 0x20,
+ ATTR_INFO_RCVD = 0x40,
+ ATTR_ACK_SENT = 0x80,
+ ATTR_INFO_EXCHANGED = (ATTR_ACK_RCVD | ATTR_ACK_SENT),
+
+ DRING_INFO_SENT = 0x100,
+ DRING_ACK_RCVD = 0x200,
+ DRING_INFO_RCVD = 0x400,
+ DRING_ACK_SENT = 0x800,
+ DRING_INFO_EXCHANGED = (DRING_ACK_RCVD | DRING_ACK_SENT),
+
+ RDX_INFO_SENT = 0x1000,
+ RDX_ACK_RCVD = 0x2000,
+ RDX_INFO_RCVD = 0x4000,
+ RDX_ACK_SENT = 0x8000,
+ RDX_EXCHANGED = (RDX_ACK_RCVD | RDX_ACK_SENT)
+
+};
+
+#define LDC_LOCK(ldcp) \
+ mutex_enter(&((ldcp)->cblock));\
+ mutex_enter(&((ldcp)->txlock));\
+ mutex_enter(&((ldcp)->tclock));
+#define LDC_UNLOCK(ldcp) \
+ mutex_exit(&((ldcp)->tclock));\
+ mutex_exit(&((ldcp)->txlock));\
+ mutex_exit(&((ldcp)->cblock));
+
+static struct ether_addr etherbroadcastaddr = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+/*
+ * MIB II broadcast/multicast packets
+ */
+#define IS_BROADCAST(ehp) \
+ (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+#define IS_MULTICAST(ehp) \
+ ((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
+
+/*
+ * Property names
+ */
+static char macaddr_propname[] = "mac-address";
+static char rmacaddr_propname[] = "remote-mac-address";
+static char channel_propname[] = "channel-endpoint";
+static char reg_propname[] = "reg";
+static char port_propname[] = "port";
+static char swport_propname[] = "switch-port";
+static char id_propname[] = "id";
+
+/* versions supported - in decreasing order */
+static vgen_ver_t vgen_versions[VGEN_NUM_VER] = { {1, 0} };
+
+/* Tunables */
+uint32_t vgen_hwd_interval = 1000; /* handshake watchdog freq in msec */
+uint32_t vgen_max_hretries = 1; /* max # of handshake retries */
+
+uint32_t vgen_ldcwr_retries = 10; /* max # of 
ldc_write() retries */ + +#ifdef DEBUG +/* flags to simulate error conditions for debugging */ +int vgen_trigger_txtimeout = 0; +int vgen_trigger_rxlost = 0; +#endif + +/* MD update matching structure */ +static md_prop_match_t vport_prop_match[] = { + { MDET_PROP_VAL, "id" }, + { MDET_LIST_END, NULL } +}; + +static mdeg_node_match_t vport_match = { "virtual-device-port", + vport_prop_match }; + +/* template for matching a particular vnet instance */ +static mdeg_prop_spec_t vgen_prop_template[] = { + { MDET_PROP_STR, "name", "network" }, + { MDET_PROP_VAL, "cfg-handle", NULL }, + { MDET_LIST_END, NULL, NULL } +}; + +#define VGEN_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val) + +static int vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp); + +/* externs */ +extern uint32_t vnet_ntxds; +extern uint32_t vnet_reclaim_lowat; +extern uint32_t vnet_reclaim_hiwat; +extern uint32_t vnet_ldcwd_interval; +extern uint32_t vnet_ldcwd_txtimeout; +extern uint32_t vnet_ldc_qlen; +extern int _vnet_dbglevel; +extern void _vnetdebug_printf(void *vnetp, const char *fmt, ...); + +#ifdef DEBUG + +/* + * XXX: definitions below need to be in sync with those in vnet.c + */ + +/* + * debug levels: + * DBG_LEVEL1: Function entry/exit tracing + * DBG_LEVEL2: Info messages + * DBG_LEVEL3: Warning messages + * DBG_LEVEL4: Error messages + */ + +enum { DBG_LEVEL1 = 0x01, DBG_LEVEL2 = 0x02, DBG_LEVEL3 = 0x04, + DBG_LEVEL4 = 0x08 }; + +#define DBG1(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL1) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DBG2(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL2) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DWARN(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL3) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#define DERR(_s) do { \ + if ((_vnet_dbglevel & DBG_LEVEL4) != 0) { \ + _vnetdebug_printf _s; \ + } \ + _NOTE(CONSTCOND) } while (0) + +#else + +#define 
DBG1(_s) if (0) _vnetdebug_printf _s +#define DBG2(_s) if (0) _vnetdebug_printf _s +#define DWARN(_s) if (0) _vnetdebug_printf _s +#define DERR(_s) if (0) _vnetdebug_printf _s + +#endif + +#ifdef DEBUG + +/* simulate handshake error conditions for debug */ +uint32_t vgen_hdbg; +#define HDBG_VERSION 0x1 +#define HDBG_TIMEOUT 0x2 +#define HDBG_BAD_SID 0x4 +#define HDBG_OUT_STATE 0x8 + +#if 0 +/* debug version negotiation, need to redefine VGEN_NUM_VER */ +vgen_ver_t dbg_vgen_versions[VGEN_NUM_VER] = + { {5, 0}, {3, 0}, {2, 1}, {1, 2}, {1, 1} }; +#endif + +#endif + +/* + * vgen_init() is called by an instance of vnet driver to initialize the + * corresponding generic proxy transport layer. The arguments passed by vnet + * are - an opaque pointer to the vnet instance, pointers to dev_info_t and + * mac_t of the vnet device, mac address of the vnet device, and a pointer to + * the mac_t of the generic transport is returned in the last argument. + */ +int +vgen_init(void *vnetp, dev_info_t *vnetdip, void *vnetmacp, + const uint8_t *macaddr, mac_t **vgenmacp) +{ + vgen_t *vgenp; + mac_t *macp; + int instance; + + if ((vnetp == NULL) || (vnetdip == NULL) ||(vnetmacp == NULL)) + return (DDI_FAILURE); + + instance = ddi_get_instance(vnetdip); + + DBG1((vnetp, "vgen_init: enter vnet_instance(%d)\n", instance)); + + vgenp = kmem_zalloc(sizeof (vgen_t), KM_SLEEP); + + vgenp->vnetp = vnetp; + vgenp->vnetdip = vnetdip; + vgenp->vnetmacp = vnetmacp; + bcopy(macaddr, &(vgenp->macaddr), ETHERADDRL); + + /* allocate multicast table */ + vgenp->mctab = kmem_zalloc(VGEN_INIT_MCTAB_SIZE * + sizeof (struct ether_addr), KM_SLEEP); + vgenp->mccount = 0; + vgenp->mcsize = VGEN_INIT_MCTAB_SIZE; + + mutex_init(&vgenp->lock, NULL, MUTEX_DRIVER, NULL); + + /* register with MD event generator */ + if (vgen_mdeg_reg(vgenp) != DDI_SUCCESS) { + mutex_destroy(&vgenp->lock); + kmem_free(vgenp->mctab, VGEN_INIT_MCTAB_SIZE * + sizeof (struct ether_addr)); + KMEM_FREE(vgenp); + return (DDI_FAILURE); + } 
+ + macp = &vgenp->vgenmac; + vgen_init_macp(vgenp, macp); + + /* register mac_t of this vgen_t with vnet */ + *vgenmacp = macp; + + DBG1((vnetp, "vgen_init: exit vnet_instance(%d)\n", instance)); + return (DDI_SUCCESS); +} + +/* + * Called by vnet to undo the initializations done by vgen_init(). + * The handle provided by generic transport during vgen_init() is the argument. + */ +void +vgen_uninit(void *arg) +{ + vgen_t *vgenp = (vgen_t *)arg; + void *vnetp; + int instance; + + if (vgenp == NULL) + return; + + instance = ddi_get_instance(vgenp->vnetdip); + vnetp = vgenp->vnetp; + + DBG1((vnetp, "vgen_uninit: enter vnet_instance(%d)\n", instance)); + + /* unregister with MD event generator */ + vgen_mdeg_unreg(vgenp); + + mutex_enter(&vgenp->lock); + + /* detach all ports from the device */ + vgen_detach_ports(vgenp); + + /* free multicast table */ + kmem_free(vgenp->mctab, vgenp->mcsize * sizeof (struct ether_addr)); + + mutex_exit(&vgenp->lock); + + mutex_destroy(&vgenp->lock); + + KMEM_FREE(vgenp); + + DBG1((vnetp, "vgen_uninit: exit vnet_instance(%d)\n", instance)); +} + +/* enable transmit/receive for the device */ +static int +vgen_start(void *arg) +{ + vgen_t *vgenp = (vgen_t *)arg; + + DBG1((vgenp->vnetp, "vgen_start: enter\n")); + + mutex_enter(&vgenp->lock); + vgen_init_ports(vgenp); + vgenp->flags |= VGEN_STARTED; + mutex_exit(&vgenp->lock); + + DBG1((vgenp->vnetp, "vgen_start: exit\n")); + return (DDI_SUCCESS); +} + +/* stop transmit/receive */ +static void +vgen_stop(void *arg) +{ + vgen_t *vgenp = (vgen_t *)arg; + + DBG1((vgenp->vnetp, "vgen_stop: enter\n")); + + mutex_enter(&vgenp->lock); + vgen_uninit_ports(vgenp); + vgenp->flags &= ~(VGEN_STARTED); + mutex_exit(&vgenp->lock); + + DBG1((vgenp->vnetp, "vgen_stop: exit\n")); +} + +/* vgen transmit function */ +static mblk_t * +vgen_tx(void *arg, mblk_t *mp) +{ + vgen_port_t *portp; + int status; + + portp = (vgen_port_t *)arg; + status = vgen_portsend(portp, mp); + if (status != VGEN_SUCCESS) { + /* 
failure */ + return (mp); + } + /* success */ + return (NULL); +} + +/* transmit packets over the given port */ +static int +vgen_portsend(vgen_port_t *portp, mblk_t *mp) +{ + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp; + vgen_t *vgenp; + int status; + + vgenp = portp->vgenp; + ldclp = &portp->ldclist; + READ_ENTER(&ldclp->rwlock); + /* + * XXX - for now, we have a single channel. + */ + if (ldclp->headp == NULL) { + DWARN((vgenp->vnetp, "vgen_portsend: dropping packet\n")); + RW_EXIT(&ldclp->rwlock); + return (VGEN_FAILURE); + } + ldcp = ldclp->headp; + + if (ldcp->need_resched) { + /* out of tx resources, see vgen_ldcsend() for details. */ + DWARN((vgenp->vnetp, "vgen_portsend: dropping packet...\n")); + + mutex_enter(&ldcp->txlock); + ldcp->statsp->tx_no_desc++; + mutex_exit(&ldcp->txlock); + + RW_EXIT(&ldclp->rwlock); + freemsg(mp); + return (VGEN_SUCCESS); + } + + status = vgen_ldcsend(ldcp, mp); + RW_EXIT(&ldclp->rwlock); + + if (status != VGEN_TX_SUCCESS) + return (VGEN_FAILURE); + + return (VGEN_SUCCESS); +} + +/* channel transmit function */ +static int +vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) +{ + void *vnetp; + size_t size; + uint64_t datalen; + uchar_t *rptr; + mblk_t *bp = NULL; + int rv; + uint32_t i; + uint32_t start; + uint32_t end; + int txpending = 0; + int ci; + uint32_t ncookies; + uint64_t nc; + vgen_private_desc_t *tbufp; + vgen_private_desc_t *ntbufp; + vnet_public_desc_t *txdp; + vio_dring_entry_hdr_t *hdrp; + vgen_stats_t *statsp; + struct ether_header *ehp; + boolean_t is_bcast = B_FALSE; + boolean_t is_mcast = B_FALSE; + boolean_t reclaim = B_FALSE; + boolean_t need_intr = B_FALSE; + boolean_t err = B_FALSE; + + vnetp = LDC_TO_VNET(ldcp); + statsp = ldcp->statsp; + DBG1((vnetp, "vgen_ldcsend: enter ldcid(%lx)\n", ldcp->ldc_id)); + + /* drop the packet if handshake is not done or ldc is not up */ + if ((ldcp->hphase != VH_DONE) || (ldcp->ldc_status != LDC_UP)) { + DWARN((vnetp, + "vgen_ldcsend: id(%lx) status(%d), dropping packet\n", + 
ldcp->ldc_id, ldcp->ldc_status)); + freemsg(mp); + return (VGEN_TX_SUCCESS); + } + + size = msgsize(mp); + if (size > (size_t)ETHERMAX) { + DWARN((vnetp, "vgen_ldcsend: id(%lx) invalid size(%d)\n", + ldcp->ldc_id, size)); + freemsg(mp); + return (VGEN_TX_SUCCESS); + } + if ((size < (size_t)ETHERMIN) || /* needs padding to ETHERMIN */ + (mp->b_cont) || /* more than 1 mblk */ + ((uintptr_t)mp->b_rptr & 0x7) || /* data not 8 byte aligned */ + ((mp->b_wptr - mp->b_rptr) & 0x7)) { /* datalen not multiple of 8 */ + if (size < ETHERMIN) + size = ETHERMIN; + /* + * The data buffer returned by allocb(9F) is 8byte aligned. + * We allocate extra 8 bytes to ensure size is multiple of + * 8 bytes for ldc_mem_bind_handle(). + */ + bp = allocb(size + 8, BPRI_MED); + if (bp == NULL) { + /* drop the packet */ + freemsg(mp); + mutex_enter(&ldcp->txlock); + statsp->tx_allocb_fail++; + mutex_exit(&ldcp->txlock); + return (VGEN_TX_SUCCESS); + } + vgen_copymsg(mp, bp->b_rptr); + bp->b_wptr += size; + datalen = size; /* actual data length without pad */ + size = (datalen + 7) & ~7; + bp->b_wptr += (size - datalen); + } else { /* size/alignment are ok */ + datalen = size; + } + + mutex_enter(&ldcp->txlock); + + /* check if the channel is still up & running */ + if ((ldcp->hphase != VH_DONE) || (ldcp->ldc_status != LDC_UP)) { + DWARN((vnetp, + "vgen_ldcsend: id(%lx) status(%d), dropping packet\n", + ldcp->ldc_id, ldcp->ldc_status)); + err = B_TRUE; + goto vgen_tx_exit; + } + + /* + * allocate a descriptor + */ + tbufp = ldcp->next_tbufp; + ntbufp = NEXTTBUF(ldcp, tbufp); + if (tbufp->flags != VGEN_PRIV_DESC_FREE || + ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */ + + mutex_enter(&ldcp->tclock); + if (ntbufp == ldcp->cur_tbufp) + ldcp->need_resched = B_TRUE; + mutex_exit(&ldcp->tclock); + + statsp->tx_no_desc++; + mutex_exit(&ldcp->txlock); + if (bp) + freemsg(bp); +#ifdef VGEN_USE_MAC_TX_UPDATE + /* + * This cflag is disabled by default. 
This can be enabled if we + * want to return failure to the mac layer when we run out of + * descriptors and use mac_tx_update() to restart tx when + * descriptors become available. However, stopping tx would + * affect traffic going over other ports, as upper mac layer + * has no concept of multiple ports within a device. + * So currently, to avoid this, drop packets when we run out + * of descrs and just return success. See the corresponding + * code in vgen_portsend() and vgen_reclaim_dring(). + */ + return (VGEN_TX_NORESOURCES); +#else + freemsg(mp); /* drop the packet */ + return (VGEN_TX_SUCCESS); +#endif + } + + txpending = vgen_num_txpending(ldcp); + if (txpending >= ldcp->reclaim_hiwat) { + /* + * if num of pending transmits is more than hiwat, + * reclaim now and also enable ack bit. + */ + reclaim = B_TRUE; + need_intr = B_TRUE; + } else { + if (txpending >= ldcp->reclaim_lowat) { + /* + * if the num of pending transmits is more than lowat + * enable ack bit in the descr and reclaim in intr(). + */ + need_intr = B_TRUE; + } + } + + i = tbufp - ldcp->tbufp; + + rptr = bp ? 
(bp->b_rptr) : (mp->b_rptr); + ci = 0; + rv = ldc_mem_bind_handle(tbufp->memhandle, (caddr_t)rptr, size, + LDC_SHADOW_MAP, LDC_MEM_R, &(tbufp->memcookie[ci]), &ncookies); + if (rv != 0) { + DWARN((vnetp, "vgen_ldcsend: id(%lx)ldc_mem_bind_handle failed" + " rv(%d) tbufi(%d)\n", ldcp->ldc_id, rv, i)); + err = B_TRUE; + statsp->oerrors++; + goto vgen_tx_exit; + } + + if ((ncookies < 0) || (ncookies > (uint64_t)MAX_COOKIES)) { + DWARN((vnetp, + "vgen_ldcsend: id(%lx)ldc_mem_bind_handle returned" + " invalid cookies (%d)\n", ldcp->ldc_id, ncookies)); + err = B_TRUE; + statsp->oerrors++; + (void) ldc_mem_unbind_handle(tbufp->memhandle); + goto vgen_tx_exit; + } + + if (ncookies > 1) { + nc = ncookies - 1; + while (nc) { + ci++; + rv = ldc_mem_nextcookie(tbufp->memhandle, + &(tbufp->memcookie[ci])); + if (rv != 0) { + DWARN((vnetp, + "vgen_ldcsend: ldc_mem_nextcookie" + " err(%d)\n", rv)); + err = B_TRUE; + statsp->oerrors++; + (void) ldc_mem_unbind_handle(tbufp->memhandle); + goto vgen_tx_exit; + } + nc--; + } + } + + ehp = (struct ether_header *)rptr; + is_bcast = IS_BROADCAST(ehp); + is_mcast = IS_MULTICAST(ehp); + /* save the packet, free when the descr done flag is set */ + tbufp->mp = (bp ? 
bp : mp); + tbufp->flags = VGEN_PRIV_DESC_BUSY; + tbufp->datalen = datalen; + tbufp->ncookies = ncookies; + tbufp->seqnum = ldcp->next_txseq; + + /* initialize the corresponding public descriptor (txd) */ + txdp = tbufp->descp; + hdrp = &txdp->hdr; + hdrp->dstate = VIO_DESC_READY; + if (need_intr) + hdrp->ack = B_TRUE; + txdp->nbytes = datalen; + txdp->ncookies = ncookies; + bcopy((tbufp->memcookie), (txdp->memcookie), + ncookies * sizeof (ldc_mem_cookie_t)); + + /* send dring datamsg to the peer */ + start = end = i; + rv = vgen_send_dring_data(ldcp, start, end, ldcp->next_txseq); + if (rv != 0) { + /* vgen_send_dring_data() error: drop the packet */ + DWARN((vnetp, + "vgen_ldcsend: vgen_send_dring_data(): failed: " + "id(%lx) rv(%d) len (%d)\n", ldcp->ldc_id, rv, datalen)); + (void) ldc_mem_unbind_handle(tbufp->memhandle); + tbufp->flags = VGEN_PRIV_DESC_FREE; /* free tbuf */ + hdrp->dstate = VIO_DESC_FREE; /* free txd */ + hdrp->ack = B_FALSE; + statsp->oerrors++; + err = B_TRUE; + goto vgen_tx_exit; + } + + /* update next available tbuf in the ring */ + ldcp->next_tbufp = ntbufp; + /* update tx seqnum and index */ + ldcp->next_txseq++; + INCR_TXI(ldcp->next_txi, ldcp); + + /* update stats */ + statsp->opackets++; + statsp->obytes += datalen; + if (is_bcast) + statsp->brdcstxmt++; + else if (is_mcast) + statsp->multixmt++; + +vgen_tx_exit: + mutex_exit(&ldcp->txlock); + + if (reclaim) { + vgen_reclaim(ldcp); + } + DBG1((vnetp, "vgen_ldcsend: exit: ldcid (%lx)\n", ldcp->ldc_id)); + + if (err) { + if (bp) + freemsg(bp); +#ifdef VGEN_USE_MAC_TX_UPDATE + return (VGEN_TX_FAILURE); /* transmit failed */ +#else + freemsg(mp); /* drop the packet */ + return (VGEN_TX_SUCCESS); +#endif + } else { + if (bp) /* free original pkt, copy is in bp */ + freemsg(mp); + return (VGEN_TX_SUCCESS); + } +} + +/* register resources */ +static void +vgen_resources(void *arg) +{ + vgen_t *vgenp; + mac_rx_fifo_t mrf; + + vgenp = (vgen_t *)arg; + DBG1((vgenp->vnetp, "vgen_resources: 
enter\n")); + + mrf.mrf_type = MAC_RX_FIFO; + mrf.mrf_blank = NULL; + mrf.mrf_arg = NULL; + mrf.mrf_normal_blank_time = 0; + mrf.mrf_normal_pkt_count = 0; + vgenp->mrh = mac_resource_add(vgenp->vnetmacp, (mac_resource_t *)&mrf); + + DBG1((vgenp->vnetp, "vgen_resources: exit\n")); +} + +/* enable/disable a multicast address */ +static int +vgen_multicst(void *arg, boolean_t add, const uint8_t *mca) +{ + vgen_t *vgenp; + vnet_mcast_msg_t mcastmsg; + vio_msg_tag_t *tagp; + vgen_port_t *portp; + vgen_portlist_t *plistp; + vgen_ldc_t *ldcp; + vgen_ldclist_t *ldclp; + void *vnetp; + struct ether_addr *addrp; + int rv; + uint32_t i; + + vgenp = (vgen_t *)arg; + vnetp = vgenp->vnetp; + addrp = (struct ether_addr *)mca; + tagp = &mcastmsg.tag; + bzero(&mcastmsg, sizeof (mcastmsg)); + + mutex_enter(&vgenp->lock); + + plistp = &(vgenp->vgenports); + + READ_ENTER(&plistp->rwlock); + + portp = vgenp->vsw_portp; + if (portp == NULL) { + RW_EXIT(&plistp->rwlock); + goto vgen_mcast_exit; + } + ldclp = &portp->ldclist; + + READ_ENTER(&ldclp->rwlock); + + ldcp = ldclp->headp; + if (ldcp == NULL) { + RW_EXIT(&ldclp->rwlock); + RW_EXIT(&plistp->rwlock); + goto vgen_mcast_exit; + } + + mutex_enter(&ldcp->cblock); + + if (ldcp->hphase == VH_DONE) { + /* + * If handshake is done, send a msg to vsw to add/remove + * the multicast address. 
+ */ + tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VNET_MCAST_INFO; + tagp->vio_sid = ldcp->local_sid; + bcopy(mca, &(mcastmsg.mca), ETHERADDRL); + mcastmsg.set = add; + mcastmsg.count = 1; + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (mcastmsg), + B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_mutlicst: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + } + } else { + /* set the flag to send a msg to vsw after handshake is done */ + ldcp->need_mcast_sync = B_TRUE; + } + + mutex_exit(&ldcp->cblock); + + if (add) { + + /* expand multicast table if necessary */ + if (vgenp->mccount >= vgenp->mcsize) { + struct ether_addr *newtab; + uint32_t newsize; + + + newsize = vgenp->mcsize * 2; + + newtab = kmem_zalloc(newsize * + sizeof (struct ether_addr), KM_NOSLEEP); + + bcopy(vgenp->mctab, newtab, vgenp->mcsize * + sizeof (struct ether_addr)); + kmem_free(vgenp->mctab, + vgenp->mcsize * sizeof (struct ether_addr)); + + vgenp->mctab = newtab; + vgenp->mcsize = newsize; + } + + /* add address to the table */ + vgenp->mctab[vgenp->mccount++] = *addrp; + + } else { + + /* delete address from the table */ + for (i = 0; i < vgenp->mccount; i++) { + if (ether_cmp(addrp, &(vgenp->mctab[i])) == 0) { + + /* + * If there's more than one address in this + * table, delete the unwanted one by moving + * the last one in the list over top of it; + * otherwise, just remove it. 
+ */ + if (vgenp->mccount > 1) { + vgenp->mctab[i] = + vgenp->mctab[vgenp->mccount-1]; + } + vgenp->mccount--; + break; + } + } + } + + RW_EXIT(&ldclp->rwlock); + RW_EXIT(&plistp->rwlock); + +vgen_mcast_exit: + mutex_exit(&vgenp->lock); + return (DDI_SUCCESS); +} + +/* set or clear promiscuous mode on the device */ +static int +vgen_promisc(void *arg, boolean_t on) +{ + _NOTE(ARGUNUSED(arg, on)) + return (DDI_SUCCESS); +} + +/* set the unicast mac address of the device */ +static int +vgen_unicst(void *arg, const uint8_t *mca) +{ + _NOTE(ARGUNUSED(arg, mca)) + return (DDI_SUCCESS); +} + +/* get device statistics */ +static uint64_t +vgen_stat(void *arg, enum mac_stat stat) +{ + vgen_t *vgenp = (vgen_t *)arg; + vgen_port_t *portp; + vgen_portlist_t *plistp; + uint64_t val; + + val = 0; + + plistp = &(vgenp->vgenports); + READ_ENTER(&plistp->rwlock); + + for (portp = plistp->headp; portp != NULL; portp = portp->nextp) { + val += vgen_port_stat(portp, stat); + } + + RW_EXIT(&plistp->rwlock); + + return (val); +} + +static void +vgen_ioctl(void *arg, queue_t *wq, mblk_t *mp) +{ + _NOTE(ARGUNUSED(arg, wq, mp)) +} + +/* vgen internal functions */ +/* detach all ports from the device */ +static void +vgen_detach_ports(vgen_t *vgenp) +{ + vgen_port_t *portp; + vgen_portlist_t *plistp; + + plistp = &(vgenp->vgenports); + WRITE_ENTER(&plistp->rwlock); + + while ((portp = plistp->headp) != NULL) { + vgen_port_detach(portp); + } + + RW_EXIT(&plistp->rwlock); +} + +/* + * detach the given port. 
+ */ +static void +vgen_port_detach(vgen_port_t *portp) +{ + vgen_t *vgenp; + vgen_ldclist_t *ldclp; + int port_num; + + vgenp = portp->vgenp; + port_num = portp->port_num; + + DBG1((vgenp->vnetp, + "vgen_port_detach: enter: port_num(%d)\n", port_num)); + + /* remove it from port list */ + vgen_port_list_remove(portp); + + /* detach channels from this port */ + ldclp = &portp->ldclist; + WRITE_ENTER(&ldclp->rwlock); + while (ldclp->headp) { + vgen_ldc_detach(ldclp->headp); + } + RW_EXIT(&ldclp->rwlock); + + if (vgenp->vsw_portp == portp) { + vgenp->vsw_portp = NULL; + } + KMEM_FREE(portp); + + DBG1((vgenp->vnetp, + "vgen_port_detach: exit: port_num(%d)\n", port_num)); +} + +/* add a port to port list */ +static void +vgen_port_list_insert(vgen_port_t *portp) +{ + vgen_portlist_t *plistp; + vgen_t *vgenp; + + vgenp = portp->vgenp; + plistp = &(vgenp->vgenports); + + if (plistp->headp == NULL) { + plistp->headp = portp; + } else { + plistp->tailp->nextp = portp; + } + plistp->tailp = portp; + portp->nextp = NULL; +} + +/* remove a port from port list */ +static void +vgen_port_list_remove(vgen_port_t *portp) +{ + vgen_port_t *prevp; + vgen_port_t *nextp; + vgen_portlist_t *plistp; + vgen_t *vgenp; + + vgenp = portp->vgenp; + + plistp = &(vgenp->vgenports); + + if (plistp->headp == NULL) + return; + + if (portp == plistp->headp) { + plistp->headp = portp->nextp; + if (portp == plistp->tailp) + plistp->tailp = plistp->headp; + } else { + for (prevp = plistp->headp; ((nextp = prevp->nextp) != NULL) && + (nextp != portp); prevp = nextp); + if (nextp == portp) { + prevp->nextp = portp->nextp; + } + if (portp == plistp->tailp) + plistp->tailp = prevp; + } +} + +/* lookup a port in the list based on port_num */ +static vgen_port_t * +vgen_port_lookup(vgen_portlist_t *plistp, int port_num) +{ + vgen_port_t *portp = NULL; + + for (portp = plistp->headp; portp != NULL; portp = portp->nextp) { + if (portp->port_num == port_num) { + break; + } + } + + return (portp); +} + +/* 
enable ports for transmit/receive */ +static void +vgen_init_ports(vgen_t *vgenp) +{ + vgen_port_t *portp; + vgen_portlist_t *plistp; + + plistp = &(vgenp->vgenports); + READ_ENTER(&plistp->rwlock); + + for (portp = plistp->headp; portp != NULL; portp = portp->nextp) { + vgen_port_init(portp); + } + + RW_EXIT(&plistp->rwlock); +} + +static void +vgen_port_init(vgen_port_t *portp) +{ + vgen_t *vgenp; + + vgenp = portp->vgenp; + /* + * Create fdb entry in vnet, corresponding to the mac + * address of this port. Note that the port specified + * is vsw-port. This is done so that vsw-port acts + * as the route to reach this macaddr, until the + * channel for this port comes up (LDC_UP) and + * handshake is done successfully. + * eg, if the peer is OBP-vnet, it may not bring the + * channel up for this port and may communicate via + * vsw to reach this port. + * Later, when Solaris-vnet comes up at the other end + * of the channel for this port and brings up the channel, + * it is an indication that peer vnet is capable of + * distributed switching, so the direct route through this + * port is specified in fdb, using vnet_modify_fdb(macaddr); + */ + vnet_add_fdb(vgenp->vnetp, (uint8_t *)&portp->macaddr, + vgen_tx, vgenp->vsw_portp); + + if (portp == vgenp->vsw_portp) { + /* + * create the default route entry in vnet's fdb. + * This is the entry used by vnet to reach + * unknown destinations, which basically goes + * through vsw on domain0 and out through the + * physical device bound to vsw. 
+ */ + vnet_add_def_rte(vgenp->vnetp, vgen_tx, portp); + } + + /* Bring up the channels of this port */ + vgen_init_ldcs(portp); +} + +/* disable transmit/receive on ports */ +static void +vgen_uninit_ports(vgen_t *vgenp) +{ + vgen_port_t *portp; + vgen_portlist_t *plistp; + + plistp = &(vgenp->vgenports); + READ_ENTER(&plistp->rwlock); + + for (portp = plistp->headp; portp != NULL; portp = portp->nextp) { + vgen_port_uninit(portp); + } + + RW_EXIT(&plistp->rwlock); +} + +static void +vgen_port_uninit(vgen_port_t *portp) +{ + vgen_t *vgenp; + + vgenp = portp->vgenp; + + vgen_uninit_ldcs(portp); + /* delete the entry in vnet's fdb for this port */ + vnet_del_fdb(vgenp->vnetp, (uint8_t *)&portp->macaddr); + if (portp == vgenp->vsw_portp) { + /* + * if this is vsw-port, then delete the default + * route entry in vnet's fdb. + */ + vnet_del_def_rte(vgenp->vnetp); + } +} + +/* register with MD event generator */ +static int +vgen_mdeg_reg(vgen_t *vgenp) +{ + mdeg_prop_spec_t *pspecp; + mdeg_node_spec_t *parentp; + uint_t templatesz; + int rv; + mdeg_handle_t hdl; + int i; + void *vnetp = vgenp->vnetp; + + i = ddi_prop_get_int(DDI_DEV_T_ANY, vgenp->vnetdip, + DDI_PROP_DONTPASS, reg_propname, -1); + if (i == -1) { + return (DDI_FAILURE); + } + templatesz = sizeof (vgen_prop_template); + pspecp = kmem_zalloc(templatesz, KM_NOSLEEP); + if (pspecp == NULL) { + return (DDI_FAILURE); + } + parentp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_NOSLEEP); + if (parentp == NULL) { + kmem_free(pspecp, templatesz); + return (DDI_FAILURE); + } + + bcopy(vgen_prop_template, pspecp, templatesz); + + /* + * NOTE: The instance here refers to the value of "reg" property and + * not the dev_info instance (ddi_get_instance()) of vnet. 
+ */ + VGEN_SET_MDEG_PROP_INST(pspecp, i); + + parentp->namep = "virtual-device"; + parentp->specp = pspecp; + + /* save parentp in vgen_t */ + vgenp->mdeg_parentp = parentp; + + rv = mdeg_register(parentp, &vport_match, vgen_mdeg_cb, vgenp, &hdl); + if (rv != MDEG_SUCCESS) { + DERR((vnetp, "vgen_mdeg_reg: mdeg_register failed\n")); + KMEM_FREE(parentp); + kmem_free(pspecp, templatesz); + vgenp->mdeg_parentp = NULL; + return (DDI_FAILURE); + } + + /* save mdeg handle in vgen_t */ + vgenp->mdeg_hdl = hdl; + + return (DDI_SUCCESS); +} + +/* unregister with MD event generator */ +static void +vgen_mdeg_unreg(vgen_t *vgenp) +{ + (void) mdeg_unregister(vgenp->mdeg_hdl); + KMEM_FREE(vgenp->mdeg_parentp); + vgenp->mdeg_parentp = NULL; + vgenp->mdeg_hdl = NULL; +} + +/* callback function registered with MD event generator */ +static int +vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp) +{ + int idx; + int vsw_idx = -1; + uint64_t val; + vgen_t *vgenp; + + if ((resp == NULL) || (cb_argp == NULL)) { + return (MDEG_FAILURE); + } + + vgenp = (vgen_t *)cb_argp; + DBG1((vgenp->vnetp, "vgen_mdeg_cb: enter\n")); + + mutex_enter(&vgenp->lock); + + DBG1((vgenp->vnetp, + "vgen_mdeg_cb: ports: removed(%x), added(%x), updated(%x)\n", + resp->removed.nelem, resp->added.nelem, resp->match_curr.nelem)); + + for (idx = 0; idx < resp->removed.nelem; idx++) { + (void) vgen_remove_port(vgenp, resp->removed.mdp, + resp->removed.mdep[idx]); + } + + if (vgenp->vsw_portp == NULL) { + /* + * find vsw_port and add it first, because other ports need + * this when adding fdb entry (see vgen_port_init()). + */ + for (idx = 0; idx < resp->added.nelem; idx++) { + if (!(md_get_prop_val(resp->added.mdp, + resp->added.mdep[idx], swport_propname, &val))) { + if (val == 0) { + /* + * This port is connected to the + * vsw on dom0. 
+ */ + vsw_idx = idx; + (void) vgen_add_port(vgenp, + resp->added.mdp, + resp->added.mdep[idx]); + break; + } + } + } + if (vsw_idx == -1) { + DWARN((vgenp->vnetp, "vgen_mdeg_cb: " + "can't find vsw_port\n")); + return (MDEG_FAILURE); + } + } + + for (idx = 0; idx < resp->added.nelem; idx++) { + if ((vsw_idx != -1) && (vsw_idx == idx)) /* skip vsw_port */ + continue; + (void) vgen_add_port(vgenp, resp->added.mdp, + resp->added.mdep[idx]); + } + + for (idx = 0; idx < resp->match_curr.nelem; idx++) { + (void) vgen_update_port(vgenp, resp->match_curr.mdp, + resp->match_curr.mdep[idx], + resp->match_prev.mdp, + resp->match_prev.mdep[idx]); + } + + mutex_exit(&vgenp->lock); + DBG1((vgenp->vnetp, "vgen_mdeg_cb: exit\n")); + return (MDEG_SUCCESS); +} + +/* add a new port to the device */ +static int +vgen_add_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex) +{ + uint64_t port_num; + uint64_t *ldc_ids; + uint64_t macaddr; + uint64_t val; + int num_ldcs; + int vsw_port = B_FALSE; + int i; + int addrsz; + int num_nodes = 0; + int listsz = 0; + mde_cookie_t *listp = NULL; + uint8_t *addrp; + struct ether_addr ea; + + /* read "id" property to get the port number */ + if (md_get_prop_val(mdp, mdex, id_propname, &port_num)) { + DWARN((vgenp->vnetp, + "vgen_add_port: prop(%s) not found\n", id_propname)); + return (DDI_FAILURE); + } + + /* + * Find the channel endpoint node(s) under this port node. 
+ */ + if ((num_nodes = md_node_count(mdp)) <= 0) { + DWARN((vgenp->vnetp, + "vgen_add_port: invalid number of nodes found (%d)", + num_nodes)); + return (DDI_FAILURE); + } + + /* allocate space for node list */ + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_NOSLEEP); + if (listp == NULL) + return (DDI_FAILURE); + + num_ldcs = md_scan_dag(mdp, mdex, + md_find_name(mdp, channel_propname), + md_find_name(mdp, "fwd"), listp); + + if (num_ldcs <= 0) { + DWARN((vgenp->vnetp, + "vgen_add_port: can't find %s nodes", channel_propname)); + kmem_free(listp, listsz); + return (DDI_FAILURE); + } + + DBG2((vgenp->vnetp, "vgen_add_port: num_ldcs %d", num_ldcs)); + + ldc_ids = kmem_zalloc(num_ldcs * sizeof (uint64_t), KM_NOSLEEP); + if (ldc_ids == NULL) { + kmem_free(listp, listsz); + return (DDI_FAILURE); + } + + for (i = 0; i < num_ldcs; i++) { + /* read channel ids */ + if (md_get_prop_val(mdp, listp[i], id_propname, &ldc_ids[i])) { + DWARN((vgenp->vnetp, + "vgen_add_port: prop(%s) not found\n", + id_propname)); + kmem_free(listp, listsz); + kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t)); + return (DDI_FAILURE); + } + DBG2((vgenp->vnetp, "vgen_add_port: ldc_id 0x%llx", + ldc_ids[i])); + } + + kmem_free(listp, listsz); + + if (md_get_prop_data(mdp, mdex, rmacaddr_propname, &addrp, + &addrsz)) { + DWARN((vgenp->vnetp, + "vgen_add_port: prop(%s) not found\n", rmacaddr_propname)); + kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t)); + return (DDI_FAILURE); + } + + if (addrsz < ETHERADDRL) { + DWARN((vgenp->vnetp, + "vgen_add_port: invalid address size (%d)\n", addrsz)); + kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t)); + return (DDI_FAILURE); + } + + macaddr = *((uint64_t *)addrp); + + DBG2((vgenp->vnetp, "vgen_add_port: remote mac address 0x%llx\n", + macaddr)); + + for (i = ETHERADDRL - 1; i >= 0; i--) { + ea.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + + if (vgenp->vsw_portp == NULL) { + if (!(md_get_prop_val(mdp, mdex, 
swport_propname, &val))) { + if (val == 0) { + /* This port is connected to the vsw on dom0 */ + vsw_port = B_TRUE; + } + } + } + (void) vgen_port_attach_mdeg(vgenp, (int)port_num, ldc_ids, num_ldcs, + &ea, vsw_port); + + kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t)); + + return (DDI_SUCCESS); +} + +/* remove a port from the device */ +static int +vgen_remove_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex) +{ + uint64_t port_num; + vgen_port_t *portp; + vgen_portlist_t *plistp; + + /* read "id" property to get the port number */ + if (md_get_prop_val(mdp, mdex, id_propname, &port_num)) { + DWARN((vgenp->vnetp, + "vgen_remove_port: prop(%s) not found\n", id_propname)); + return (DDI_FAILURE); + } + + plistp = &(vgenp->vgenports); + + WRITE_ENTER(&plistp->rwlock); + portp = vgen_port_lookup(plistp, (int)port_num); + if (portp == NULL) { + DWARN((vgenp->vnetp, "vgen_remove_port: can't find port(%lx)\n", + port_num)); + RW_EXIT(&plistp->rwlock); + return (DDI_FAILURE); + } + + vgen_port_detach_mdeg(portp); + RW_EXIT(&plistp->rwlock); + + return (DDI_SUCCESS); +} + +/* attach a port to the device based on mdeg data */ +static int +vgen_port_attach_mdeg(vgen_t *vgenp, int port_num, uint64_t *ldcids, + int num_ids, struct ether_addr *macaddr, boolean_t vsw_port) +{ + vgen_port_t *portp; + vgen_portlist_t *plistp; + int i; + + portp = kmem_zalloc(sizeof (vgen_port_t), KM_NOSLEEP); + if (portp == NULL) { + return (DDI_FAILURE); + } + portp->vgenp = vgenp; + portp->port_num = port_num; + + DBG1((vgenp->vnetp, + "vgen_port_attach_mdeg: port_num(%d)\n", portp->port_num)); + + portp->ldclist.num_ldcs = 0; + portp->ldclist.headp = NULL; + rw_init(&portp->ldclist.rwlock, NULL, RW_DRIVER, NULL); + + ether_copy(macaddr, &portp->macaddr); + for (i = 0; i < num_ids; i++) { + DBG2((vgenp->vnetp, "vgen_port_attach_mdeg: ldcid (%lx)\n", + ldcids[i])); + (void) vgen_ldc_attach(portp, ldcids[i]); + } + + /* link it into the list of ports */ + plistp = &(vgenp->vgenports); + 
WRITE_ENTER(&plistp->rwlock); + vgen_port_list_insert(portp); + RW_EXIT(&plistp->rwlock); + + /* This port is connected to the vsw on domain0 */ + if (vsw_port) + vgenp->vsw_portp = portp; + + if (vgenp->flags & VGEN_STARTED) { /* interface is configured */ + vgen_port_init(portp); + } + + DBG1((vgenp->vnetp, + "vgen_port_attach_mdeg: exit: port_num(%d)\n", portp->port_num)); + return (DDI_SUCCESS); +} + +/* detach a port from the device based on mdeg data */ +static void +vgen_port_detach_mdeg(vgen_port_t *portp) +{ + vgen_t *vgenp = portp->vgenp; + + DBG1((vgenp->vnetp, + "vgen_port_detach_mdeg: enter: port_num(%d)\n", portp->port_num)); + /* stop the port if needed */ + if (vgenp->flags & VGEN_STARTED) { + vgen_port_uninit(portp); + } + vgen_port_detach(portp); + + DBG1((vgenp->vnetp, + "vgen_port_detach_mdeg: exit: port_num(%d)\n", portp->port_num)); +} + +static int +vgen_update_port(vgen_t *vgenp, md_t *curr_mdp, mde_cookie_t curr_mdex, + md_t *prev_mdp, mde_cookie_t prev_mdex) +{ + _NOTE(ARGUNUSED(vgenp, curr_mdp, curr_mdex, prev_mdp, prev_mdex)) + + /* XXX: TBD */ + return (DDI_SUCCESS); +} + +static uint64_t +vgen_port_stat(vgen_port_t *portp, enum mac_stat stat) +{ + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp; + uint64_t val; + + val = 0; + ldclp = &portp->ldclist; + + READ_ENTER(&ldclp->rwlock); + for (ldcp = ldclp->headp; ldcp != NULL; ldcp = ldcp->nextp) { + val += vgen_ldc_stat(ldcp, stat); + } + RW_EXIT(&ldclp->rwlock); + + return (val); +} + +/* attach the channel corresponding to the given ldc_id to the port */ +static int +vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id) +{ + vgen_t *vgenp; + vgen_ldclist_t *ldclp; + vgen_ldc_t *ldcp, **prev_ldcp; + ldc_attr_t attr; + int status; + ldc_status_t istatus; + enum {AST_init = 0x0, AST_ldc_alloc = 0x1, + AST_mutex_init = 0x2, AST_ldc_init = 0x4, + AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10} + attach_state; + + attach_state = AST_init; + vgenp = portp->vgenp; + ldclp = &portp->ldclist; + + ldcp = 
kmem_zalloc(sizeof (vgen_ldc_t), KM_NOSLEEP); + if (ldcp == NULL) { + goto ldc_attach_failed; + } + ldcp->ldc_id = ldc_id; + ldcp->portp = portp; + ldcp->reclaim_lowat = vnet_reclaim_lowat; + ldcp->reclaim_hiwat = vnet_reclaim_hiwat; + + attach_state |= AST_ldc_alloc; + + mutex_init(&ldcp->txlock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->cblock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->tclock, NULL, MUTEX_DRIVER, NULL); + + attach_state |= AST_mutex_init; + + attr.devclass = LDC_DEV_NT; + attr.instance = ddi_get_instance(vgenp->vnetdip); + attr.mode = LDC_MODE_UNRELIABLE; + attr.qlen = vnet_ldc_qlen; + status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); + if (status != 0) { + DWARN((vgenp->vnetp, "ldc_init failed, id (%lx) rv (%d)\n", + ldc_id, status)); + goto ldc_attach_failed; + } + attach_state |= AST_ldc_init; + + status = ldc_reg_callback(ldcp->ldc_handle, vgen_ldc_cb, (caddr_t)ldcp); + if (status != 0) { + DWARN((vgenp->vnetp, + "ldc_reg_callback failed, id (%lx) rv (%d)\n", + ldc_id, status)); + goto ldc_attach_failed; + } + attach_state |= AST_ldc_reg_cb; + + (void) ldc_status(ldcp->ldc_handle, &istatus); + ASSERT(istatus == LDC_INIT); + ldcp->ldc_status = istatus; + + /* allocate transmit resources */ + status = vgen_alloc_tx_ring(ldcp); + if (status != 0) { + goto ldc_attach_failed; + } + attach_state |= AST_alloc_tx_ring; + + /* Setup kstats for the channel */ + status = vgen_setup_kstats(ldcp); + if (status != VGEN_SUCCESS) { + goto ldc_attach_failed; + } + + /* initialize vgen_versions supported */ + bcopy(vgen_versions, ldcp->vgen_versions, sizeof (ldcp->vgen_versions)); + + /* link it into the list of channels for this port */ + WRITE_ENTER(&ldclp->rwlock); + prev_ldcp = (vgen_ldc_t **)(&ldclp->headp); + ldcp->nextp = *prev_ldcp; + *prev_ldcp = ldcp; + ldclp->num_ldcs++; + RW_EXIT(&ldclp->rwlock); + + ldcp->flags |= CHANNEL_ATTACHED; + return (DDI_SUCCESS); + +ldc_attach_failed: + if (attach_state & AST_alloc_tx_ring) { + 
vgen_free_tx_ring(ldcp); + } + if (attach_state & AST_ldc_reg_cb) { + (void) ldc_unreg_callback(ldcp->ldc_handle); + } + if (attach_state & AST_ldc_init) { + (void) ldc_fini(ldcp->ldc_handle); + } + if (attach_state & AST_mutex_init) { + mutex_destroy(&ldcp->tclock); + mutex_destroy(&ldcp->txlock); + mutex_destroy(&ldcp->cblock); + } + if (attach_state & AST_ldc_alloc) { + KMEM_FREE(ldcp); + } + return (DDI_FAILURE); +} + +/* detach a channel from the port */ +static void +vgen_ldc_detach(vgen_ldc_t *ldcp) +{ + vgen_port_t *portp; + vgen_t *vgenp; + vgen_ldc_t *pldcp; + vgen_ldc_t **prev_ldcp; + vgen_ldclist_t *ldclp; + + portp = ldcp->portp; + vgenp = portp->vgenp; + ldclp = &portp->ldclist; + + prev_ldcp = (vgen_ldc_t **)&ldclp->headp; + for (; (pldcp = *prev_ldcp) != NULL; prev_ldcp = &pldcp->nextp) { + if (pldcp == ldcp) { + break; + } + } + + if (pldcp == NULL) { + /* invalid ldcp? */ + return; + } + + if (ldcp->ldc_status != LDC_INIT) { + DWARN((vgenp->vnetp, + "vgen_ldc_detach: ldc_status is not INIT id(%lx)\n", + ldcp->ldc_id)); + } + + if (ldcp->flags & CHANNEL_ATTACHED) { + ldcp->flags &= ~(CHANNEL_ATTACHED); + + vgen_destroy_kstats(ldcp); + /* free transmit resources */ + vgen_free_tx_ring(ldcp); + (void) ldc_unreg_callback(ldcp->ldc_handle); + (void) ldc_fini(ldcp->ldc_handle); + mutex_destroy(&ldcp->tclock); + mutex_destroy(&ldcp->txlock); + mutex_destroy(&ldcp->cblock); + + /* unlink it from the list */ + *prev_ldcp = ldcp->nextp; + ldclp->num_ldcs--; + KMEM_FREE(ldcp); + } +} + +/* + * This function allocates transmit resources for the channel. + * The resources consist of a transmit descriptor ring and an associated + * transmit buffer ring. 
+ */ +static int +vgen_alloc_tx_ring(vgen_ldc_t *ldcp) +{ + void *tbufp; + ldc_mem_info_t minfo; + uint32_t txdsize; + uint32_t tbufsize; + int status; + void *vnetp = LDC_TO_VNET(ldcp); + + ldcp->num_txds = vnet_ntxds; + txdsize = sizeof (vnet_public_desc_t); + tbufsize = sizeof (vgen_private_desc_t); + + /* allocate transmit buffer ring */ + tbufp = kmem_zalloc(ldcp->num_txds * tbufsize, KM_NOSLEEP); + if (tbufp == NULL) { + return (DDI_FAILURE); + } + + /* create transmit descriptor ring */ + status = ldc_mem_dring_create(ldcp->num_txds, txdsize, + &ldcp->tx_dhandle); + if (status) { + DWARN((vnetp, "vgen_alloc_tx_ring: ldc_mem_dring_create() " + "failed, id(%lx)\n", ldcp->ldc_id)); + kmem_free(tbufp, ldcp->num_txds * tbufsize); + return (DDI_FAILURE); + } + + /* get the addr of descripror ring */ + status = ldc_mem_dring_info(ldcp->tx_dhandle, &minfo); + if (status) { + DWARN((vnetp, "vgen_alloc_tx_ring: ldc_mem_dring_info() " + "failed, id(%lx)\n", ldcp->ldc_id)); + kmem_free(tbufp, ldcp->num_txds * tbufsize); + (void) ldc_mem_dring_destroy(ldcp->tx_dhandle); + ldcp->tbufp = NULL; + return (DDI_FAILURE); + } + ldcp->txdp = (vnet_public_desc_t *)(minfo.vaddr); + ldcp->tbufp = tbufp; + + ldcp->txdendp = &((ldcp->txdp)[ldcp->num_txds]); + ldcp->tbufendp = &((ldcp->tbufp)[ldcp->num_txds]); + + return (DDI_SUCCESS); +} + +/* Free transmit resources for the channel */ +static void +vgen_free_tx_ring(vgen_ldc_t *ldcp) +{ + int tbufsize = sizeof (vgen_private_desc_t); + + /* free transmit descriptor ring */ + (void) ldc_mem_dring_destroy(ldcp->tx_dhandle); + + /* free transmit buffer ring */ + kmem_free(ldcp->tbufp, ldcp->num_txds * tbufsize); + ldcp->txdp = ldcp->txdendp = NULL; + ldcp->tbufp = ldcp->tbufendp = NULL; +} + +/* enable transmit/receive on the channels for the port */ +static void +vgen_init_ldcs(vgen_port_t *portp) +{ + vgen_ldclist_t *ldclp = &portp->ldclist; + vgen_ldc_t *ldcp; + + READ_ENTER(&ldclp->rwlock); + ldcp = ldclp->headp; + for (; ldcp != 
NULL; ldcp = ldcp->nextp) { + (void) vgen_ldc_init(ldcp); + } + RW_EXIT(&ldclp->rwlock); +} + +/* stop transmit/receive on the channels for the port */ +static void +vgen_uninit_ldcs(vgen_port_t *portp) +{ + vgen_ldclist_t *ldclp = &portp->ldclist; + vgen_ldc_t *ldcp; + + READ_ENTER(&ldclp->rwlock); + ldcp = ldclp->headp; + for (; ldcp != NULL; ldcp = ldcp->nextp) { + vgen_ldc_uninit(ldcp); + } + RW_EXIT(&ldclp->rwlock); +} + +/* enable transmit/receive on the channel */ +static int +vgen_ldc_init(vgen_ldc_t *ldcp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + ldc_status_t istatus; + int rv; + enum { ST_init = 0x0, ST_init_tbufs = 0x1, + ST_ldc_open = 0x2, ST_dring_bind = 0x4 + } + init_state; + uint32_t ncookies = 0; + + init_state = ST_init; + + LDC_LOCK(ldcp); + + rv = ldc_open(ldcp->ldc_handle); + if (rv != 0) { + DWARN((vnetp, + "vgen_ldcinit: ldc_open failed: id<%lx> rv(%d)\n", + ldcp->ldc_id, rv)); + goto ldcinit_failed; + } + init_state |= ST_ldc_open; + + (void) ldc_status(ldcp->ldc_handle, &istatus); + if (istatus != LDC_OPEN && istatus != LDC_READY) { + DWARN((vnetp, + "vgen_ldcinit: id (%lx) status(%d) is not OPEN/READY\n", + ldcp->ldc_id, istatus)); + goto ldcinit_failed; + } + ldcp->ldc_status = istatus; + + rv = vgen_init_tbufs(ldcp); + if (rv != 0) { + DWARN((vnetp, + "vgen_ldcinit: vgen_init_tbufs() failed: id(%lx)\n", + ldcp->ldc_id)); + goto ldcinit_failed; + } + init_state |= ST_init_tbufs; + + /* Bind descriptor ring to the channel */ + rv = ldc_mem_dring_bind(ldcp->ldc_handle, ldcp->tx_dhandle, + LDC_SHADOW_MAP, LDC_MEM_RW, &ldcp->tx_dcookie, &ncookies); + if (rv != 0) { + DWARN((vnetp, "vgen_ldcinit: id (%lx) " + "ldc_mem_dring_bind failed\n", ldcp->ldc_id)); + goto ldcinit_failed; + } + + ASSERT(ncookies == 1); + ldcp->num_txdcookies = ncookies; + + init_state |= ST_dring_bind; + + rv = ldc_up(ldcp->ldc_handle); + if (rv != 0) { + DBG2((vnetp, + "vgen_ldcinit: ldc_up err id(%lx) rv(%d)\n", + ldcp->ldc_id, rv)); + } + + (void) 
ldc_status(ldcp->ldc_handle, &istatus); + if (istatus != LDC_UP) { + DBG2((vnetp, "vgen_ldcinit: id(%lx) status(%d) is not UP\n", + ldcp->ldc_id, istatus)); + } + ldcp->ldc_status = istatus; + + /* initialize transmit watchdog timeout */ + ldcp->wd_tid = timeout(vgen_ldc_watchdog, (caddr_t)ldcp, + drv_usectohz(vnet_ldcwd_interval * 1000)); + + ldcp->flags |= CHANNEL_STARTED; + + LDC_UNLOCK(ldcp); + return (DDI_SUCCESS); + +ldcinit_failed: + if (init_state & ST_dring_bind) { + (void) ldc_mem_dring_unbind(ldcp->tx_dhandle); + } + if (init_state & ST_init_tbufs) { + vgen_uninit_tbufs(ldcp); + } + if (init_state & ST_ldc_open) { + (void) ldc_close(ldcp->ldc_handle); + } + LDC_UNLOCK(ldcp); + return (DDI_FAILURE); +} + +/* stop transmit/receive on the channel */ +static void +vgen_ldc_uninit(vgen_ldc_t *ldcp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + int rv; + + DBG1((vnetp, "vgen_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id)); + LDC_LOCK(ldcp); + + if ((ldcp->flags & CHANNEL_STARTED) == 0) { + LDC_UNLOCK(ldcp); + DWARN((vnetp, "vgen_ldc_uninit: id(%lx) CHANNEL_STARTED" + " flag is not set\n", ldcp->ldc_id)); + return; + } + + /* disable further callbacks */ + rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); + if (rv != 0) { + DWARN((vnetp, "vgen_ldc_uninit: id (%lx) " + "ldc_set_cb_mode failed\n", ldcp->ldc_id)); + } + + /* clear handshake done bit and wait for pending tx and cb to finish */ + ldcp->hphase &= ~(VH_DONE); + LDC_UNLOCK(ldcp); + drv_usecwait(1000); + LDC_LOCK(ldcp); + + vgen_reset_hphase(ldcp); + + /* reset transmit watchdog timeout */ + if (ldcp->wd_tid) { + (void) untimeout(ldcp->wd_tid); + ldcp->wd_tid = 0; + } + + /* unbind tx descriptor ring from the channel */ + rv = ldc_mem_dring_unbind(ldcp->tx_dhandle); + if (rv != 0) { + DWARN((vnetp, "vgen_ldcuninit: ldc_mem_dring_unbind " + "failed id(%lx)\n", ldcp->ldc_id)); + } + + vgen_uninit_tbufs(ldcp); + + rv = ldc_close(ldcp->ldc_handle); + if (rv != 0) { + DWARN((vnetp, "vgen_ldcuninit: ldc_close 
err id(%lx)\n", + ldcp->ldc_id)); + } + ldcp->ldc_status = LDC_INIT; + ldcp->flags &= ~(CHANNEL_STARTED); + + LDC_UNLOCK(ldcp); + + DBG1((vnetp, "vgen_ldc_uninit: exit: id(%lx)\n", ldcp->ldc_id)); +} + +/* Initialize the transmit buffer ring for the channel */ +static int +vgen_init_tbufs(vgen_ldc_t *ldcp) +{ + vgen_private_desc_t *tbufp; + vnet_public_desc_t *txdp; + vio_dring_entry_hdr_t *hdrp; + int i; + int rv; + + bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds)); + bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds)); + + /* + * for each tx buf (priv_desc), allocate a ldc mem_handle which is + * required to map the data during transmit, set the flags + * to free (available for use by transmit routine). + */ + + for (i = 0; i < ldcp->num_txds; i++) { + tbufp = &(ldcp->tbufp[i]); + rv = ldc_mem_alloc_handle(ldcp->ldc_handle, + &(tbufp->memhandle)); + if (rv) { + tbufp->memhandle = 0; + goto init_tbufs_failed; + } + tbufp->flags = VGEN_PRIV_DESC_FREE; + txdp = &(ldcp->txdp[i]); + hdrp = &txdp->hdr; + hdrp->dstate = VIO_DESC_FREE; + hdrp->ack = B_FALSE; + tbufp->descp = txdp; + } + + /* reset tbuf walking pointers */ + ldcp->next_tbufp = ldcp->tbufp; + ldcp->cur_tbufp = ldcp->tbufp; + + /* initialize tx seqnum and index */ + ldcp->next_txseq = VNET_ISS; + ldcp->next_txi = 0; + + return (DDI_SUCCESS); + +init_tbufs_failed:; + vgen_uninit_tbufs(ldcp); + return (DDI_FAILURE); +} + +/* Uninitialize transmit buffer ring for the channel */ +static void +vgen_uninit_tbufs(vgen_ldc_t *ldcp) +{ + vgen_private_desc_t *tbufp = ldcp->tbufp; + vnet_public_desc_t *txdp; + vio_dring_entry_hdr_t *hdrp; + int i; + + /* for each tbuf (priv_desc), free ldc mem_handle */ + for (i = 0; i < ldcp->num_txds; i++) { + + tbufp = &(ldcp->tbufp[i]); + txdp = tbufp->descp; + hdrp = &txdp->hdr; + + if (tbufp->flags != VGEN_PRIV_DESC_FREE) { + (void) ldc_mem_unbind_handle(tbufp->memhandle); + freemsg(tbufp->mp); + tbufp->mp = NULL; + tbufp->flags = VGEN_PRIV_DESC_FREE; + hdrp->dstate = 
VIO_DESC_FREE; + hdrp->ack = B_FALSE; + } + if (tbufp->memhandle) { + (void) ldc_mem_free_handle(tbufp->memhandle); + tbufp->memhandle = 0; + } + tbufp->descp = NULL; + } + + bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds)); + bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds)); +} + +/* clobber tx descriptor ring */ +static void +vgen_clobber_tbufs(vgen_ldc_t *ldcp) +{ + vnet_public_desc_t *txdp; + vgen_private_desc_t *tbufp; + vio_dring_entry_hdr_t *hdrp; + void *vnetp = LDC_TO_VNET(ldcp); + int i; +#ifdef DEBUG + int ndone = 0; +#endif + + for (i = 0; i < ldcp->num_txds; i++) { + + tbufp = &(ldcp->tbufp[i]); + txdp = tbufp->descp; + hdrp = &txdp->hdr; + + if (tbufp->flags & VGEN_PRIV_DESC_BUSY) { + (void) ldc_mem_unbind_handle(tbufp->memhandle); + freemsg(tbufp->mp); + tbufp->mp = NULL; + tbufp->flags = VGEN_PRIV_DESC_FREE; +#ifdef DEBUG + if (hdrp->dstate == VIO_DESC_DONE) + ndone++; +#endif + hdrp->dstate = VIO_DESC_FREE; + hdrp->ack = B_FALSE; + } + } + /* reset tbuf walking pointers */ + ldcp->next_tbufp = ldcp->tbufp; + ldcp->cur_tbufp = ldcp->tbufp; + + /* reset tx seqnum and index */ + ldcp->next_txseq = VNET_ISS; + ldcp->next_txi = 0; +#ifdef DEBUG + DBG2((vnetp, + "vgen_clobber_tbufs: id(0x%lx) num descrs done (%d)\n", + ldcp->ldc_id, ndone)); +#endif +} + +/* clobber receive descriptor ring */ +static void +vgen_clobber_rxds(vgen_ldc_t *ldcp) +{ + ldcp->rx_dhandle = 0; + bzero(&ldcp->rx_dcookie, sizeof (ldcp->rx_dcookie)); + ldcp->rxdp = NULL; + ldcp->next_rxi = 0; + ldcp->num_rxds = 0; + ldcp->next_rxseq = VNET_ISS; +} + +/* initialize receive descriptor ring */ +static int +vgen_init_rxds(vgen_ldc_t *ldcp, uint32_t num_desc, uint32_t desc_size, + ldc_mem_cookie_t *dcookie, uint32_t ncookies) +{ + int rv; + ldc_mem_info_t minfo; + + rv = ldc_mem_dring_map(ldcp->ldc_handle, dcookie, ncookies, num_desc, + desc_size, LDC_SHADOW_MAP, &(ldcp->rx_dhandle)); + if (rv != 0) { + return (DDI_FAILURE); + } + + /* + * sucessfully mapped, now try to + * get 
info about the mapped dring + */ + rv = ldc_mem_dring_info(ldcp->rx_dhandle, &minfo); + if (rv != 0) { + (void) ldc_mem_dring_unmap(ldcp->rx_dhandle); + return (DDI_FAILURE); + } + + /* + * save ring address, number of descriptors. + */ + ldcp->rxdp = (vnet_public_desc_t *)(minfo.vaddr); + bcopy(dcookie, &(ldcp->rx_dcookie), sizeof (*dcookie)); + ldcp->num_rxdcookies = ncookies; + ldcp->num_rxds = num_desc; + ldcp->next_rxi = 0; + ldcp->next_rxseq = VNET_ISS; + + return (DDI_SUCCESS); +} + +/* get channel statistics */ +static uint64_t +vgen_ldc_stat(vgen_ldc_t *ldcp, enum mac_stat stat) +{ + vgen_stats_t *statsp; + uint64_t val; + + val = 0; + statsp = ldcp->statsp; + switch (stat) { + + case MAC_STAT_MULTIRCV: + val = statsp->multircv; + break; + + case MAC_STAT_BRDCSTRCV: + val = statsp->brdcstrcv; + break; + + case MAC_STAT_MULTIXMT: + val = statsp->multixmt; + break; + + case MAC_STAT_BRDCSTXMT: + val = statsp->brdcstxmt; + break; + + case MAC_STAT_NORCVBUF: + val = statsp->norcvbuf; + break; + + case MAC_STAT_IERRORS: + val = statsp->ierrors; + break; + + case MAC_STAT_NOXMTBUF: + val = statsp->noxmtbuf; + break; + + case MAC_STAT_OERRORS: + val = statsp->oerrors; + break; + + case MAC_STAT_COLLISIONS: + break; + + case MAC_STAT_RBYTES: + val = statsp->rbytes; + break; + + case MAC_STAT_IPACKETS: + val = statsp->ipackets; + break; + + case MAC_STAT_OBYTES: + val = statsp->obytes; + break; + + case MAC_STAT_OPACKETS: + val = statsp->opackets; + break; + + /* stats not relevant to ldc, return 0 */ + case MAC_STAT_IFSPEED: + case MAC_STAT_ALIGN_ERRORS: + case MAC_STAT_FCS_ERRORS: + case MAC_STAT_FIRST_COLLISIONS: + case MAC_STAT_MULTI_COLLISIONS: + case MAC_STAT_DEFER_XMTS: + case MAC_STAT_TX_LATE_COLLISIONS: + case MAC_STAT_EX_COLLISIONS: + case MAC_STAT_MACXMT_ERRORS: + case MAC_STAT_CARRIER_ERRORS: + case MAC_STAT_TOOLONG_ERRORS: + case MAC_STAT_XCVR_ADDR: + case MAC_STAT_XCVR_ID: + case MAC_STAT_XCVR_INUSE: + case MAC_STAT_CAP_1000FDX: + case 
MAC_STAT_CAP_1000HDX: + case MAC_STAT_CAP_100FDX: + case MAC_STAT_CAP_100HDX: + case MAC_STAT_CAP_10FDX: + case MAC_STAT_CAP_10HDX: + case MAC_STAT_CAP_ASMPAUSE: + case MAC_STAT_CAP_PAUSE: + case MAC_STAT_CAP_AUTONEG: + case MAC_STAT_ADV_CAP_1000FDX: + case MAC_STAT_ADV_CAP_1000HDX: + case MAC_STAT_ADV_CAP_100FDX: + case MAC_STAT_ADV_CAP_100HDX: + case MAC_STAT_ADV_CAP_10FDX: + case MAC_STAT_ADV_CAP_10HDX: + case MAC_STAT_ADV_CAP_ASMPAUSE: + case MAC_STAT_ADV_CAP_PAUSE: + case MAC_STAT_ADV_CAP_AUTONEG: + case MAC_STAT_LP_CAP_1000FDX: + case MAC_STAT_LP_CAP_1000HDX: + case MAC_STAT_LP_CAP_100FDX: + case MAC_STAT_LP_CAP_100HDX: + case MAC_STAT_LP_CAP_10FDX: + case MAC_STAT_LP_CAP_10HDX: + case MAC_STAT_LP_CAP_ASMPAUSE: + case MAC_STAT_LP_CAP_PAUSE: + case MAC_STAT_LP_CAP_AUTONEG: + case MAC_STAT_LINK_ASMPAUSE: + case MAC_STAT_LINK_PAUSE: + case MAC_STAT_LINK_AUTONEG: + case MAC_STAT_LINK_DUPLEX: + default: + val = 0; + break; + + } + return (val); +} + +static void +vgen_init_macp(vgen_t *vgenp, mac_t *macp) +{ + macp->m_driver = (void *)vgenp; + macp->m_start = vgen_start; + macp->m_stop = vgen_stop; + macp->m_tx = vgen_tx; + macp->m_resources = vgen_resources; + macp->m_multicst = vgen_multicst; + macp->m_promisc = vgen_promisc; + macp->m_unicst = vgen_unicst; + macp->m_stat = vgen_stat; + macp->m_ioctl = vgen_ioctl; +} + +/* Interrupt handler for the channel */ +static uint_t +vgen_ldc_cb(uint64_t event, caddr_t arg) +{ + _NOTE(ARGUNUSED(event)) + vgen_ldc_t *ldcp; + void *vnetp; + vgen_t *vgenp; + size_t msglen; + ldc_status_t istatus; + uint64_t ldcmsg[7]; + int rv; + vio_msg_tag_t *tagp; + mblk_t *mp = NULL; + mblk_t *bp = NULL; + mblk_t *bpt = NULL; + mblk_t *headp = NULL; + mblk_t *tailp = NULL; + vgen_stats_t *statsp; + + ldcp = (vgen_ldc_t *)arg; + vgenp = LDC_TO_VGEN(ldcp); + vnetp = LDC_TO_VNET(ldcp); + statsp = ldcp->statsp; + + DBG1((vnetp, "vgen_ldc_cb enter: ldcid (%lx)\n", ldcp->ldc_id)); + + mutex_enter(&ldcp->cblock); + statsp->callbacks++; + if 
((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { + DWARN((vnetp, "vgen_ldc_cb: id(%lx), status(%d) is LDC_INIT\n", + ldcp->ldc_id, ldcp->ldc_status)); + mutex_exit(&ldcp->cblock); + return (LDC_SUCCESS); + } + + /* check ldc status change events first */ + (void) ldc_status(ldcp->ldc_handle, &istatus); + + if (istatus != ldcp->ldc_status) { + switch (istatus) { + case LDC_UP: + ldcp->ldc_status = istatus; + DBG1((vnetp, + "vgen_ldc_cb: id(%lx) status(%d) is LDC_UP\n", + ldcp->ldc_id, ldcp->ldc_status)); + + if (ldcp->portp != vgenp->vsw_portp) { + /* + * modify fdb entry to use this port as the + * channel is up, instead of going through the + * vsw-port (see comments in vgen_port_init()) + */ + vnet_modify_fdb(vnetp, + (uint8_t *)&ldcp->portp->macaddr, + vgen_tx, ldcp->portp); + } + /* Initialize local session id */ + ldcp->local_sid = ddi_get_lbolt(); + /* clear peer session id */ + ldcp->peer_sid = 0; + ldcp->hretries = 0; + /* Initiate Handshake process with peer ldc endpoint */ + vgen_handshake_reset(ldcp); + vgen_handshake(vh_nextphase(ldcp)); + break; + + case LDC_OPEN: + case LDC_READY: + ldcp->ldc_status = istatus; + if ((ldcp->portp != vgenp->vsw_portp) && + (vgenp->vsw_portp != NULL)) { + /* + * modify fdb entry to use vsw-port as the + * channel is reset and we don't have a direct + * link to the destination (see comments + * in vgen_port_init()). 
+ */ + vnet_modify_fdb(vnetp, + (uint8_t *)&ldcp->portp->macaddr, + vgen_tx, vgenp->vsw_portp); + } + /* clear sids */ + ldcp->local_sid = 0; + ldcp->peer_sid = 0; + if (ldcp->hphase != VH_PHASE0) { + vgen_handshake_reset(ldcp); + } + DBG1((vnetp, + "vgen_ldc_cb: id(%lx) status is (%d)\n", + ldcp->ldc_id, ldcp->ldc_status)); + break; + + default: + DWARN((vnetp, + "vgen_ldc_cb: id(%lx) istatus=(%d) status(%d) is" + " *UNKNOWN*\n", + ldcp->ldc_id, istatus, ldcp->ldc_status)); + break; + } + } + + if (istatus != LDC_UP) { + DBG1((vnetp, "vgen_ldc_cb: id(%lx) status(%d) is NOT LDC_UP\n", + ldcp->ldc_id, ldcp->ldc_status)); + mutex_exit(&ldcp->cblock); + return (LDC_SUCCESS); + } + + /* if ldc_status is UP, receive all packets */ + do { + msglen = sizeof (ldcmsg); + rv = ldc_read(ldcp->ldc_handle, (caddr_t)&ldcmsg, &msglen); + + if (rv != 0) { + DWARN((vnetp, + "vgen_ldc_cb:ldc_read err id(%lx) rv(%d) " + "len(%d)\n", ldcp->ldc_id, rv, msglen)); + break; + } + if (msglen == 0) { + DBG2((vnetp, "vgen_ldc_cb: ldc_read id(%lx) NODATA", + ldcp->ldc_id)); + break; + } + DBG2((vnetp, "vgen_ldc_cb: ldc_read id(%lx): msglen(%d)", + ldcp->ldc_id, msglen)); + + tagp = (vio_msg_tag_t *)ldcmsg; + + if (ldcp->peer_sid) { + /* + * check sid only after we have received peer's sid + * in the version negotiate msg. + */ +#ifdef DEBUG + if (vgen_hdbg & HDBG_BAD_SID) { + /* simulate bad sid condition */ + tagp->vio_sid = 0; + vgen_hdbg &= ~(HDBG_BAD_SID); + } +#endif + if (vgen_check_sid(ldcp, tagp) == VGEN_FAILURE) { + /* + * If sid mismatch is detected, + * reset the channel. 
+ */ + ldcp->need_ldc_reset = B_TRUE; + vgen_handshake_reset(ldcp); + mutex_exit(&ldcp->cblock); + return (LDC_SUCCESS); + } + } + + switch (tagp->vio_msgtype) { + case VIO_TYPE_CTRL: + vgen_handle_ctrlmsg(ldcp, tagp); + break; + + case VIO_TYPE_DATA: + headp = tailp = NULL; + vgen_handle_datamsg(ldcp, tagp, &headp, &tailp); + /* build a chain of received packets */ + if (headp != NULL) { + if (bp == NULL) { + bp = headp; + bpt = tailp; + } else { + bpt->b_next = headp; + bpt = tailp; + } + } + break; + + case VIO_TYPE_ERR: + vgen_handle_errmsg(ldcp, tagp); + break; + + default: + DWARN((vnetp, + "vgen_ldc_cb: Unknown VIO_TYPE(%x)\n", + tagp->vio_msgtype)); + break; + } + + } while (msglen); + + mutex_exit(&ldcp->cblock); + /* send up the received packets to MAC layer */ + while (bp != NULL) { + mp = bp; + bp = bp->b_next; + mp->b_next = mp->b_prev = NULL; + DBG2((vnetp, "vgen_ldc_cb: id(%lx) rx pkt len (%lx)\n", + ldcp->ldc_id, MBLKL(mp))); + mac_rx((mac_t *)vgenp->vnetmacp, vgenp->mrh, mp); + } + DBG1((vnetp, "vgen_ldc_cb exit: ldcid (%lx)\n", ldcp->ldc_id)); + + return (LDC_SUCCESS); +} + +/* vgen handshake functions */ + +/* change the hphase for the channel to the next phase */ +static vgen_ldc_t * +vh_nextphase(vgen_ldc_t *ldcp) +{ + if (ldcp->hphase == VH_PHASE3) { + ldcp->hphase = VH_DONE; + } else { + ldcp->hphase++; + } + return (ldcp); +} + +/* + * Check whether the given version is supported or not and + * return VGEN_SUCCESS if supported. + */ +static int +vgen_supported_version(vgen_ldc_t *ldcp, uint16_t ver_major, +uint16_t ver_minor) +{ + vgen_ver_t *versions = ldcp->vgen_versions; + int i = 0; + + while (i < VGEN_NUM_VER) { + if ((versions[i].ver_major == 0) && + (versions[i].ver_minor == 0)) { + break; + } + if ((versions[i].ver_major == ver_major) && + (versions[i].ver_minor == ver_minor)) { + return (VGEN_SUCCESS); + } + i++; + } + return (VGEN_FAILURE); +} + +/* + * Given a version, return VGEN_SUCCESS if a lower version is supported. 
+ */ +static int +vgen_next_version(vgen_ldc_t *ldcp, vgen_ver_t *verp) +{ + vgen_ver_t *versions = ldcp->vgen_versions; + int i = 0; + + while (i < VGEN_NUM_VER) { + if ((versions[i].ver_major == 0) && + (versions[i].ver_minor == 0)) { + break; + } + /* + * if we support a lower minor version within the same major + * version, or if we support a lower major version, + * update the verp parameter with this lower version and + * return success. + */ + if (((versions[i].ver_major == verp->ver_major) && + (versions[i].ver_minor < verp->ver_minor)) || + (versions[i].ver_major < verp->ver_major)) { + verp->ver_major = versions[i].ver_major; + verp->ver_minor = versions[i].ver_minor; + return (VGEN_SUCCESS); + } + i++; + } + + return (VGEN_FAILURE); +} + +/* + * wrapper routine to send the given message over ldc using ldc_write(). + */ +static int +vgen_sendmsg(vgen_ldc_t *ldcp, caddr_t msg, size_t msglen, + boolean_t caller_holds_lock) +{ + int rv; + size_t len; + void *vnetp = LDC_TO_VNET(ldcp); + uint32_t retries = 0; + + len = msglen; + if ((len == 0) || (msg == NULL)) + return (VGEN_FAILURE); + + if (!caller_holds_lock) { + mutex_enter(&ldcp->txlock); + } + + do { + len = msglen; + rv = ldc_write(ldcp->ldc_handle, (caddr_t)msg, &len); + if (retries++ >= vgen_ldcwr_retries) + break; + } while (rv == EWOULDBLOCK); + + if (!caller_holds_lock) { + mutex_exit(&ldcp->txlock); + } + + if ((rv != 0) || (len != msglen)) { + DWARN((vnetp, + "vgen_sendmsg: ldc_write failed: id(%lx) rv(%d)" + " msglen (%d)\n", ldcp->ldc_id, rv, msglen)); + return (VGEN_FAILURE); + } + return (VGEN_SUCCESS); +} + +/* send version negotiate message to the peer over ldc */ +static int +vgen_send_version_negotiate(vgen_ldc_t *ldcp) +{ + vio_ver_msg_t vermsg; + vio_msg_tag_t *tagp = &vermsg.tag; + void *vnetp = LDC_TO_VNET(ldcp); + int rv; + + bzero(&vermsg, sizeof (vermsg)); + + tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VIO_VER_INFO; + 
tagp->vio_sid = ldcp->local_sid; + + /* get version msg payload from ldcp->local */ + vermsg.ver_major = ldcp->local_hparams.ver_major; + vermsg.ver_minor = ldcp->local_hparams.ver_minor; + vermsg.dev_class = ldcp->local_hparams.dev_class; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (vermsg), B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_version_negotiate: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + ldcp->hstate |= VER_INFO_SENT; + DBG2((vnetp, + "vgen_send_version_negotiate: VER_INFO_SENT id (%lx) ver(%d,%d)\n", + ldcp->ldc_id, vermsg.ver_major, vermsg.ver_minor)); + + return (VGEN_SUCCESS); +} + +/* send attr info message to the peer over ldc */ +static int +vgen_send_attr_info(vgen_ldc_t *ldcp) +{ + vnet_attr_msg_t attrmsg; + vio_msg_tag_t *tagp = &attrmsg.tag; + void *vnetp = LDC_TO_VNET(ldcp); + int rv; + + bzero(&attrmsg, sizeof (attrmsg)); + + tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VIO_ATTR_INFO; + tagp->vio_sid = ldcp->local_sid; + + /* get attr msg payload from ldcp->local */ + attrmsg.mtu = ldcp->local_hparams.mtu; + attrmsg.addr = ldcp->local_hparams.addr; + attrmsg.addr_type = ldcp->local_hparams.addr_type; + attrmsg.xfer_mode = ldcp->local_hparams.xfer_mode; + attrmsg.ack_freq = ldcp->local_hparams.ack_freq; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (attrmsg), B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_attr_info: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + ldcp->hstate |= ATTR_INFO_SENT; + DBG2((vnetp, "vgen_send_attr_info: ATTR_INFO_SENT id (%lx)\n", + ldcp->ldc_id)); + + return (VGEN_SUCCESS); +} + +/* send descriptor ring register message to the peer over ldc */ +static int +vgen_send_dring_reg(vgen_ldc_t *ldcp) +{ + vio_dring_reg_msg_t msg; + vio_msg_tag_t *tagp = &msg.tag; + void *vnetp = LDC_TO_VNET(ldcp); + int rv; + + bzero(&msg, sizeof (msg)); + 
+ tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VIO_DRING_REG; + tagp->vio_sid = ldcp->local_sid; + + /* get dring info msg payload from ldcp->local */ + bcopy(&(ldcp->local_hparams.dring_cookie), (msg.cookie), + sizeof (ldc_mem_cookie_t)); + msg.ncookies = ldcp->local_hparams.num_dcookies; + msg.num_descriptors = ldcp->local_hparams.num_desc; + msg.descriptor_size = ldcp->local_hparams.desc_size; + + /* + * dring_ident is set to 0. After mapping the dring, peer sets this + * value and sends it in the ack, which is saved in + * vgen_handle_dring_reg(). + */ + msg.dring_ident = 0; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (msg), B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_dring_reg: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + ldcp->hstate |= DRING_INFO_SENT; + DBG2((vnetp, "vgen_send_dring_reg: DRING_INFO_SENT id (%lx)\n", + ldcp->ldc_id)); + + return (VGEN_SUCCESS); +} + +static int +vgen_send_rdx_info(vgen_ldc_t *ldcp) +{ + vio_rdx_msg_t rdxmsg; + vio_msg_tag_t *tagp = &rdxmsg.tag; + void *vnetp = LDC_TO_VNET(ldcp); + int rv; + + bzero(&rdxmsg, sizeof (rdxmsg)); + + tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VIO_RDX; + tagp->vio_sid = ldcp->local_sid; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (rdxmsg), B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_rdx_info: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + ldcp->hstate |= RDX_INFO_SENT; + DBG2((vnetp, "vgen_send_rdx_info: RDX_INFO_SENT id (%lx)\n", + ldcp->ldc_id)); + + return (VGEN_SUCCESS); +} + +/* send descriptor ring data message to the peer over ldc */ +static int +vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end, + uint64_t next_txseq) +{ + vio_dring_msg_t dringmsg, *msgp = &dringmsg; + vio_msg_tag_t *tagp = &msgp->tag; + void *vnetp = 
LDC_TO_VNET(ldcp); + int rv; + + bzero(msgp, sizeof (*msgp)); + + tagp->vio_msgtype = VIO_TYPE_DATA; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VIO_DRING_DATA; + tagp->vio_sid = ldcp->local_sid; + + msgp->seq_num = next_txseq; + msgp->dring_ident = ldcp->local_hparams.dring_ident; + msgp->start_idx = start; + msgp->end_idx = end; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (dringmsg), B_TRUE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_dring_data: vgen_sendmsg failed" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + DBG2((vnetp, "vgen_send_dring_data: DRING_DATA_SENT id (%lx)\n", + ldcp->ldc_id)); + + return (VGEN_SUCCESS); +} + +/* send multicast addr info message to vsw */ +static int +vgen_send_mcast_info(vgen_ldc_t *ldcp) +{ + vnet_mcast_msg_t mcastmsg; + vnet_mcast_msg_t *msgp; + vio_msg_tag_t *tagp; + vgen_t *vgenp; + void *vnetp; + struct ether_addr *mca; + int rv; + int i; + uint32_t size; + uint32_t mccount; + uint32_t n; + + msgp = &mcastmsg; + tagp = &msgp->tag; + vgenp = LDC_TO_VGEN(ldcp); + vnetp = LDC_TO_VNET(ldcp); + + mccount = vgenp->mccount; + i = 0; + + do { + tagp->vio_msgtype = VIO_TYPE_CTRL; + tagp->vio_subtype = VIO_SUBTYPE_INFO; + tagp->vio_subtype_env = VNET_MCAST_INFO; + tagp->vio_sid = ldcp->local_sid; + + n = ((mccount >= VNET_NUM_MCAST) ? 
VNET_NUM_MCAST : mccount); + size = n * sizeof (struct ether_addr); + + mca = &(vgenp->mctab[i]); + bcopy(mca, (msgp->mca), size); + msgp->set = B_TRUE; + msgp->count = n; + + rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msgp), + B_FALSE); + if (rv != VGEN_SUCCESS) { + DWARN((vnetp, "vgen_send_mcast_info: vgen_sendmsg err" + "id (%lx)\n", ldcp->ldc_id)); + return (VGEN_FAILURE); + } + + mccount -= n; + i += n; + + } while (mccount); + + return (VGEN_SUCCESS); +} + +/* Initiate Phase 2 of handshake */ +static int +vgen_handshake_phase2(vgen_ldc_t *ldcp) +{ + int rv; +#ifdef DEBUG + if (vgen_hdbg & HDBG_OUT_STATE) { + /* simulate out of state condition */ + vgen_hdbg &= ~(HDBG_OUT_STATE); + rv = vgen_send_rdx_info(ldcp); + return (rv); + } + if (vgen_hdbg & HDBG_TIMEOUT) { + /* simulate timeout condition */ + vgen_hdbg &= ~(HDBG_TIMEOUT); + return (VGEN_SUCCESS); + } +#endif + if ((rv = vgen_send_attr_info(ldcp)) != VGEN_SUCCESS) { + return (rv); + } + if ((rv = vgen_send_dring_reg(ldcp)) != VGEN_SUCCESS) { + return (rv); + } + + return (VGEN_SUCCESS); +} + +/* + * This function resets the handshake phase to VH_PHASE0(pre-handshake phase). + * This can happen after a channel comes up (status: LDC_UP) or + * when handshake gets terminated due to various conditions. + */ +static void +vgen_reset_hphase(vgen_ldc_t *ldcp) +{ + vgen_t *vgenp = LDC_TO_VGEN(ldcp); + void *vnetp = LDC_TO_VNET(ldcp); + ldc_status_t istatus; + + DBG2((vnetp, "vgen_reset_hphase: id(0x%lx)\n", ldcp->ldc_id)); + /* reset hstate and hphase */ + ldcp->hstate = 0; + ldcp->hphase = VH_PHASE0; + + /* reset handshake watchdog timeout */ + if (ldcp->htid) { + (void) untimeout(ldcp->htid); + ldcp->htid = 0; + } + + /* + * Unmap drings, if dring_ready is set. 
+ */ + if (ldcp->local_hparams.dring_ready) { + ldcp->local_hparams.dring_ready = B_FALSE; + /* do not unbind our dring */ + } + + if (ldcp->peer_hparams.dring_ready) { + ldcp->peer_hparams.dring_ready = B_FALSE; + /* Unmap peer's dring */ + (void) ldc_mem_dring_unmap(ldcp->rx_dhandle); + vgen_clobber_rxds(ldcp); + } + + vgen_clobber_tbufs(ldcp); + + /* + * clear local handshake params and initialize. + */ + bzero(&(ldcp->local_hparams), sizeof (ldcp->local_hparams)); + +#ifdef DEBUG +#if 0 + if (vgen_hdbg & HDBG_VERSION) { + bcopy(dbg_vgen_versions, ldcp->vgen_versions, + sizeof (ldcp->vgen_versions)); + } +#endif +#endif + /* set version to the highest version supported */ + ldcp->local_hparams.ver_major = + ldcp->vgen_versions[0].ver_major; + ldcp->local_hparams.ver_minor = + ldcp->vgen_versions[0].ver_minor; + ldcp->local_hparams.dev_class = VDEV_NETWORK; + + /* set attr_info params */ + ldcp->local_hparams.mtu = ETHERMAX; + ldcp->local_hparams.addr = + vgen_macaddr_strtoul(vgenp->macaddr); + ldcp->local_hparams.addr_type = ADDR_TYPE_MAC; + ldcp->local_hparams.xfer_mode = VIO_DRING_MODE; + ldcp->local_hparams.ack_freq = 0; /* don't need acks */ + +#ifdef DEBUG +#if 0 + vgen_print_attr_info(ldcp, VGEN_LOCAL); +#endif +#endif + + /* + * set dring_info params. + * Note: dring is already created and bound. + */ + bcopy(&(ldcp->tx_dcookie), &(ldcp->local_hparams.dring_cookie), + sizeof (ldc_mem_cookie_t)); + ldcp->local_hparams.num_dcookies = ldcp->num_txdcookies; + ldcp->local_hparams.num_desc = ldcp->num_txds; + ldcp->local_hparams.desc_size = sizeof (vnet_public_desc_t); + + /* + * dring_ident is set to 0. After mapping the dring, peer sets this + * value and sends it in the ack, which is saved in + * vgen_handle_dring_reg(). 
+ */ + ldcp->local_hparams.dring_ident = 0; + + /* clear peer_hparams */ + bzero(&(ldcp->peer_hparams), sizeof (ldcp->peer_hparams)); + + /* reset the channel if required */ + if (ldcp->need_ldc_reset) { + DWARN((vnetp, + "vgen_reset_hphase: id (%lx), Doing Channel Reset...\n", + ldcp->ldc_id)); + ldcp->need_ldc_reset = B_FALSE; + (void) ldc_reset(ldcp->ldc_handle); + (void) ldc_status(ldcp->ldc_handle, &istatus); + DBG2((vnetp, + "vgen_reset_hphase: id (%lx), RESET Done,ldc_status(%x)\n", + ldcp->ldc_id, istatus)); + ldcp->ldc_status = istatus; + /* clear sids */ + ldcp->local_sid = 0; + ldcp->peer_sid = 0; + (void) ldc_up(ldcp->ldc_handle); + } +} + +/* wrapper function for vgen_reset_hphase */ +static void +vgen_handshake_reset(vgen_ldc_t *ldcp) +{ + ASSERT(MUTEX_HELD(&ldcp->cblock)); + mutex_enter(&ldcp->txlock); + mutex_enter(&ldcp->tclock); + + vgen_reset_hphase(ldcp); + + mutex_exit(&ldcp->tclock); + mutex_exit(&ldcp->txlock); +} + +/* + * Initiate handshake with the peer by sending various messages + * based on the handshake-phase that the channel is currently in. + */ +static void +vgen_handshake(vgen_ldc_t *ldcp) +{ + uint32_t hphase = ldcp->hphase; + void *vnetp = LDC_TO_VNET(ldcp); + vgen_t *vgenp = LDC_TO_VGEN(ldcp); + + switch (hphase) { + + case VH_PHASE1: + + /* + * start timer, for entire handshake process, turn this timer + * off if all phases of handshake complete successfully and + * hphase goes to VH_DONE(below) or + * vgen_reset_hphase() gets called or + * channel is reset due to errors or + * vgen_ldc_uninit() is invoked(vgen_stop). 
+ */ + ldcp->htid = timeout(vgen_hwatchdog, (caddr_t)ldcp, + drv_usectohz(vgen_hwd_interval * 1000)); + + /* Phase 1 involves negotiating the version */ + if (vgen_send_version_negotiate(ldcp) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + } + break; + + case VH_PHASE2: + if (vgen_handshake_phase2(ldcp) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + } + break; + + case VH_PHASE3: + if (vgen_send_rdx_info(ldcp) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + } + break; + + case VH_DONE: + /* reset handshake watchdog timeout */ + if (ldcp->htid) { + (void) untimeout(ldcp->htid); + ldcp->htid = 0; + } + ldcp->hretries = 0; +#if 0 + vgen_print_ldcinfo(ldcp); +#endif + DBG1((vnetp, "vgen_handshake: id(0x%lx) Handshake Done\n", + ldcp->ldc_id)); + + if (ldcp->need_mcast_sync) { + /* need to sync multicast table with vsw */ + + ldcp->need_mcast_sync = B_FALSE; + mutex_exit(&ldcp->cblock); + + mutex_enter(&vgenp->lock); + (void) vgen_send_mcast_info(ldcp); + mutex_exit(&vgenp->lock); + + mutex_enter(&ldcp->cblock); + + } + break; + + default: + break; + } +} + +/* + * Check if the current handshake phase has completed successfully and + * return the status. + */ +static int +vgen_handshake_done(vgen_ldc_t *ldcp) +{ + uint32_t hphase = ldcp->hphase; + int status = 0; + void *vnetp = LDC_TO_VNET(ldcp); + + switch (hphase) { + + case VH_PHASE1: + /* + * Phase1 is done, if version negotiation + * completed successfully. + */ + status = ((ldcp->hstate & VER_NEGOTIATED) == + VER_NEGOTIATED); + break; + + case VH_PHASE2: + /* + * Phase 2 is done, if attr info and dring info + * have been exchanged successfully. 
+ */ + status = (((ldcp->hstate & ATTR_INFO_EXCHANGED) == + ATTR_INFO_EXCHANGED) && + ((ldcp->hstate & DRING_INFO_EXCHANGED) == + DRING_INFO_EXCHANGED)); + break; + + case VH_PHASE3: + /* Phase 3 is done, if rdx msg has been exchanged */ + status = ((ldcp->hstate & RDX_EXCHANGED) == + RDX_EXCHANGED); + break; + + default: + break; + } + + if (status == 0) { + return (VGEN_FAILURE); + } + DBG2((vnetp, "VNET_HANDSHAKE_DONE: PHASE(%d)\n", hphase)); + return (VGEN_SUCCESS); +} + +/* retry handshake on failure */ +static void +vgen_handshake_retry(vgen_ldc_t *ldcp) +{ + /* reset handshake phase */ + vgen_handshake_reset(ldcp); + if (vgen_max_hretries) { /* handshake retry is specified */ + if (ldcp->hretries++ < vgen_max_hretries) + vgen_handshake(vh_nextphase(ldcp)); + } +} + +/* + * Handle a version info msg from the peer or an ACK/NACK from the peer + * to a version info msg that we sent. + */ +static void +vgen_handle_version_negotiate(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + vio_ver_msg_t *vermsg = (vio_ver_msg_t *)tagp; + int ack = 0; + int failed = 0; + void *vnetp = LDC_TO_VNET(ldcp); + int idx; + vgen_ver_t *versions = ldcp->vgen_versions; + + DBG1((vnetp, "vgen_handle_version_negotiate: enter\n")); + switch (tagp->vio_subtype) { + case VIO_SUBTYPE_INFO: + + /* Cache sid of peer if this is the first time */ + if (ldcp->peer_sid == 0) { + DBG2((vnetp, + "vgen_handle_version_negotiate: id (%lx) Caching" + " peer_sid(%x)\n", ldcp->ldc_id, tagp->vio_sid)); + ldcp->peer_sid = tagp->vio_sid; + } + + if (ldcp->hphase != VH_PHASE1) { + /* + * If we are not already in VH_PHASE1, reset to + * pre-handshake state, and initiate handshake + * to the peer too. 
+ */ + vgen_handshake_reset(ldcp); + vgen_handshake(vh_nextphase(ldcp)); + } + ldcp->hstate |= VER_INFO_RCVD; + + /* save peer's requested values */ + ldcp->peer_hparams.ver_major = vermsg->ver_major; + ldcp->peer_hparams.ver_minor = vermsg->ver_minor; + ldcp->peer_hparams.dev_class = vermsg->dev_class; + + if ((vermsg->dev_class != VDEV_NETWORK) && + (vermsg->dev_class != VDEV_NETWORK_SWITCH)) { + /* unsupported dev_class, send NACK */ + + tagp->vio_subtype = VIO_SUBTYPE_NACK; + tagp->vio_sid = ldcp->local_sid; + /* send reply msg back to peer */ + (void) vgen_sendmsg(ldcp, (caddr_t)tagp, + sizeof (*vermsg), B_FALSE); + DWARN((vnetp, + "vgen_handle_version_negotiate: Version" + " Negotiation Failed id (%lx)\n", ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + + DBG2((vnetp, "vgen_handle_version_negotiate: VER_INFO_RCVD," + " id (%lx), ver(%d,%d)\n", ldcp->ldc_id, + vermsg->ver_major, vermsg->ver_minor)); + + idx = 0; + + for (;;) { + + if (vermsg->ver_major > versions[idx].ver_major) { + + /* nack with next lower version */ + tagp->vio_subtype = VIO_SUBTYPE_NACK; + vermsg->ver_major = versions[idx].ver_major; + vermsg->ver_minor = versions[idx].ver_minor; + break; + } + + if (vermsg->ver_major == versions[idx].ver_major) { + + /* major version match - ACK version */ + tagp->vio_subtype = VIO_SUBTYPE_ACK; + ack = 1; + + /* + * lower minor version to the one this endpt + * supports, if necessary + */ + if (vermsg->ver_minor > + versions[idx].ver_minor) { + vermsg->ver_minor = + versions[idx].ver_minor; + ldcp->peer_hparams.ver_minor = + versions[idx].ver_minor; + } + break; + } + + idx++; + + if (idx == VGEN_NUM_VER) { + + /* no version match - send NACK */ + tagp->vio_subtype = VIO_SUBTYPE_NACK; + vermsg->ver_major = 0; + vermsg->ver_minor = 0; + failed = 1; + break; + } + + } + + tagp->vio_sid = ldcp->local_sid; + + /* send reply msg back to peer */ + if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*vermsg), + B_FALSE) != VGEN_SUCCESS) { + 
vgen_handshake_reset(ldcp); + return; + } + + if (ack) { + ldcp->hstate |= VER_ACK_SENT; + DBG2((vnetp, "vgen_handle_version_negotiate:" + " VER_ACK_SENT, id (%lx) ver(%d,%d) \n", + ldcp->ldc_id, vermsg->ver_major, + vermsg->ver_minor)); + } + if (failed) { + DWARN((vnetp, "vgen_handle_version_negotiate:" + " Version Negotiation Failed id (%lx)\n", + ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + + /* VER_ACK_SENT and VER_ACK_RCVD */ + + /* local and peer versions match? */ + ASSERT((ldcp->local_hparams.ver_major == + ldcp->peer_hparams.ver_major) && + (ldcp->local_hparams.ver_minor == + ldcp->peer_hparams.ver_minor)); + + /* move to the next phase */ + vgen_handshake(vh_nextphase(ldcp)); + } + + break; + + case VIO_SUBTYPE_ACK: + + if (ldcp->hphase != VH_PHASE1) { + /* This should not happen. */ + DWARN((vnetp, + "vgen_handle_version_negotiate:" + " VER_ACK_RCVD id (%lx) Invalid Phase(%u)\n", + ldcp->ldc_id, ldcp->hphase)); + vgen_handshake_reset(ldcp); + return; + } + + /* SUCCESS - we have agreed on a version */ + ldcp->local_hparams.ver_major = vermsg->ver_major; + ldcp->local_hparams.ver_minor = vermsg->ver_minor; + ldcp->hstate |= VER_ACK_RCVD; + + DBG2((vnetp, "vgen_handle_version_negotiate:" + " VER_ACK_RCVD, id (%lx) ver(%d,%d) \n", + ldcp->ldc_id, vermsg->ver_major, vermsg->ver_minor)); + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + + /* VER_ACK_SENT and VER_ACK_RCVD */ + + /* local and peer versions match? */ + ASSERT((ldcp->local_hparams.ver_major == + ldcp->peer_hparams.ver_major) && + (ldcp->local_hparams.ver_minor == + ldcp->peer_hparams.ver_minor)); + + /* move to the next phase */ + vgen_handshake(vh_nextphase(ldcp)); + } + break; + + case VIO_SUBTYPE_NACK: + + if (ldcp->hphase != VH_PHASE1) { + /* This should not happen. 
*/ + DWARN((vnetp, + "vgen_handle_version_negotiate:" + " VER_NACK_RCVD id (%lx) Invalid Phase(%u)\n", + ldcp->ldc_id, ldcp->hphase)); + vgen_handshake_reset(ldcp); + return; + } + + DBG2((vnetp, "vgen_handle_version_negotiate:" + " VER_NACK_RCVD id(%lx) next ver(%d,%d)\n", + ldcp->ldc_id, vermsg->ver_major, vermsg->ver_minor)); + + /* check if version in NACK is zero */ + if (vermsg->ver_major == 0 && vermsg->ver_minor == 0) { + /* + * Version Negotiation has failed. + */ + DWARN((vnetp, "vgen_handle_version_negotiate:" + " Version Negotiation Failed id (%lx)\n", + ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + + idx = 0; + + for (;;) { + + if (vermsg->ver_major > versions[idx].ver_major) { + /* select next lower version */ + + ldcp->local_hparams.ver_major = + versions[idx].ver_major; + ldcp->local_hparams.ver_minor = + versions[idx].ver_minor; + break; + } + + if (vermsg->ver_major == versions[idx].ver_major) { + /* major version match */ + + ldcp->local_hparams.ver_major = + versions[idx].ver_major; + + ldcp->local_hparams.ver_minor = + versions[idx].ver_minor; + break; + } + + idx++; + + if (idx == VGEN_NUM_VER) { + /* + * no version match. + * Version Negotiation has failed. 
+ */ + DWARN((vnetp, "vgen_handle_version_negotiate:" + " Version Negotiation Failed id (%lx)\n", + ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + + } + + if (vgen_send_version_negotiate(ldcp) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + return; + } + + break; + } + DBG1((vnetp, "vgen_handle_version_negotiate: exit\n")); +} + +/* Check if the attributes are supported */ +static int +vgen_check_attr_info(vgen_ldc_t *ldcp, vnet_attr_msg_t *msg) +{ + _NOTE(ARGUNUSED(ldcp)) + +#if 0 + uint64_t port_macaddr; + port_macaddr = vgen_macaddr_strtoul((uint8_t *) + &(ldcp->portp->macaddr)); +#endif + /* + * currently, we support these attr values: + * mtu of ethernet, addr_type of mac, xfer_mode of + * ldc shared memory, ack_freq of 0 (data is acked if + * the ack bit is set in the descriptor) and the address should + * match the address in the port node. + */ + if ((msg->mtu != ETHERMAX) || + (msg->addr_type != ADDR_TYPE_MAC) || + (msg->xfer_mode != VIO_DRING_MODE) || + (msg->ack_freq > 64)) { +#if 0 + (msg->addr != port_macaddr)) +cmn_err(CE_CONT, "vgen_check_attr_info: msg->addr(%lx), port_macaddr(%lx)\n", + msg->addr, port_macaddr); +#endif + return (VGEN_FAILURE); + } + + return (VGEN_SUCCESS); +} + +/* + * Handle an attribute info msg from the peer or an ACK/NACK from the peer + * to an attr info msg that we sent. 
+ */ +static void +vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + vnet_attr_msg_t *attrmsg = (vnet_attr_msg_t *)tagp; + void *vnetp = LDC_TO_VNET(ldcp); + int ack = 0; + + DBG1((vnetp, "vgen_handle_attr_info: enter\n")); + if (ldcp->hphase != VH_PHASE2) { + DWARN((vnetp, + "vgen_handle_attr_info: Rcvd ATTR_INFO id(%lx)" + " subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id, + tagp->vio_subtype, ldcp->hphase)); + vgen_handshake_reset(ldcp); + return; + } + switch (tagp->vio_subtype) { + case VIO_SUBTYPE_INFO: + + DBG2((vnetp, "vgen_handle_attr_info: ATTR_INFO_RCVD id(%lx)\n", + ldcp->ldc_id)); + ldcp->hstate |= ATTR_INFO_RCVD; + + /* save peer's values */ + ldcp->peer_hparams.mtu = attrmsg->mtu; + ldcp->peer_hparams.addr = attrmsg->addr; + ldcp->peer_hparams.addr_type = attrmsg->addr_type; + ldcp->peer_hparams.xfer_mode = attrmsg->xfer_mode; + ldcp->peer_hparams.ack_freq = attrmsg->ack_freq; + + if (vgen_check_attr_info(ldcp, attrmsg) == VGEN_FAILURE) { + /* unsupported attr, send NACK */ + tagp->vio_subtype = VIO_SUBTYPE_NACK; + } else { + ack = 1; + tagp->vio_subtype = VIO_SUBTYPE_ACK; + } + tagp->vio_sid = ldcp->local_sid; + + /* send reply msg back to peer */ + if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*attrmsg), + B_FALSE) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + return; + } + + if (ack) { + ldcp->hstate |= ATTR_ACK_SENT; + DBG2((vnetp, "vgen_handle_attr_info:" + " ATTR_ACK_SENT id(%lx)\n", ldcp->ldc_id)); +#ifdef DEBUG +#if 0 + vgen_print_attr_info(ldcp, VGEN_PEER); +#endif +#endif + } else { + /* failed */ + DWARN((vnetp, "vgen_handle_attr_info:" + " ATTR_NACK_SENT id(%lx)\n", ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + + break; + + case VIO_SUBTYPE_ACK: + + ldcp->hstate |= ATTR_ACK_RCVD; + + DBG2((vnetp, "vgen_handle_attr_info: ATTR_ACK_RCVD id(%lx)\n", + ldcp->ldc_id)); + + if (vgen_handshake_done(ldcp) == 
VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + break; + + case VIO_SUBTYPE_NACK: + + DBG2((vnetp, "vgen_handle_attr_info: ATTR_NACK_RCVD id(%lx)\n", + ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + break; + } + DBG1((vnetp, "vgen_handle_attr_info: exit\n")); +} + +/* Check if the dring info msg is ok */ +static int +vgen_check_dring_reg(vio_dring_reg_msg_t *msg) +{ + /* check if msg contents are ok */ + if ((msg->num_descriptors < 128) || (msg->descriptor_size < + sizeof (vnet_public_desc_t))) { + return (VGEN_FAILURE); + } + return (VGEN_SUCCESS); +} + +/* + * Handle a descriptor ring register msg from the peer or an ACK/NACK from + * the peer to a dring register msg that we sent. + */ +static void +vgen_handle_dring_reg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + vio_dring_reg_msg_t *msg = (vio_dring_reg_msg_t *)tagp; + void *vnetp = LDC_TO_VNET(ldcp); + ldc_mem_cookie_t dcookie; + int ack = 0; + int rv = 0; + + DBG1((vnetp, "vgen_handle_dring_reg: enter\n")); + if (ldcp->hphase < VH_PHASE2) { + /* dring_info can be rcvd in any of the phases after Phase1 */ + DWARN((vnetp, + "vgen_handle_dring_reg: Rcvd DRING_INFO, id (%lx)" + " Subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id, + tagp->vio_subtype, ldcp->hphase)); + vgen_handshake_reset(ldcp); + return; + } + switch (tagp->vio_subtype) { + case VIO_SUBTYPE_INFO: + + DBG2((vnetp, "vgen_handle_dring_reg: DRING_INFO_RCVD id(%lx)\n", + ldcp->ldc_id)); + ldcp->hstate |= DRING_INFO_RCVD; + bcopy((msg->cookie), &dcookie, sizeof (dcookie)); + + ASSERT(msg->ncookies == 1); + + if (vgen_check_dring_reg(msg) == VGEN_SUCCESS) { + /* + * verified dring info msg to be ok, + * now try to map the remote dring. 
+ */ + rv = vgen_init_rxds(ldcp, msg->num_descriptors, + msg->descriptor_size, &dcookie, + msg->ncookies); + if (rv == DDI_SUCCESS) { + /* now we can ack the peer */ + ack = 1; + } + } + if (ack == 0) { + /* failed, send NACK */ + tagp->vio_subtype = VIO_SUBTYPE_NACK; + } else { + if (!(ldcp->peer_hparams.dring_ready)) { + + /* save peer's dring_info values */ + bcopy(&dcookie, + &(ldcp->peer_hparams.dring_cookie), + sizeof (dcookie)); + ldcp->peer_hparams.num_desc = + msg->num_descriptors; + ldcp->peer_hparams.desc_size = + msg->descriptor_size; + ldcp->peer_hparams.num_dcookies = + msg->ncookies; + + /* set dring_ident for the peer */ + ldcp->peer_hparams.dring_ident = + (uint64_t)ldcp->rxdp; + /* return the dring_ident in ack msg */ + msg->dring_ident = + (uint64_t)ldcp->rxdp; + + ldcp->peer_hparams.dring_ready = B_TRUE; + } + tagp->vio_subtype = VIO_SUBTYPE_ACK; + } + tagp->vio_sid = ldcp->local_sid; + /* send reply msg back to peer */ + if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msg), + B_FALSE) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + return; + } + + if (ack) { + ldcp->hstate |= DRING_ACK_SENT; + DBG2((vnetp, "vgen_handle_dring_reg: DRING_ACK_SENT" + " id (%lx)\n", ldcp->ldc_id)); + } else { + DWARN((vnetp, "vgen_handle_dring_reg: DRING_NACK_SENT" + " id (%lx)\n", ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + return; + } + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + + break; + + case VIO_SUBTYPE_ACK: + + ldcp->hstate |= DRING_ACK_RCVD; + + DBG2((vnetp, "vgen_handle_dring_reg: DRING_ACK_RCVD" + " id (%lx)\n", ldcp->ldc_id)); + + if (!(ldcp->local_hparams.dring_ready)) { + /* local dring is now ready */ + ldcp->local_hparams.dring_ready = B_TRUE; + + /* save dring_ident acked by peer */ + ldcp->local_hparams.dring_ident = + msg->dring_ident; + } + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + + break; + + case VIO_SUBTYPE_NACK: + + DBG2((vnetp, 
"vgen_handle_dring_reg: DRING_NACK_RCVD" + " id (%lx)\n", ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + break; + } + DBG1((vnetp, "vgen_handle_dring_reg: exit\n")); +} + +/* + * Handle a rdx info msg from the peer or an ACK/NACK + * from the peer to a rdx info msg that we sent. + */ +static void +vgen_handle_rdx_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + + DBG1((vnetp, "vgen_handle_rdx_info: enter\n")); + if (ldcp->hphase != VH_PHASE3) { + DWARN((vnetp, + "vgen_handle_rdx_info: Rcvd RDX_INFO, id (%lx)" + " Subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id, + tagp->vio_subtype, ldcp->hphase)); + vgen_handshake_reset(ldcp); + return; + } + switch (tagp->vio_subtype) { + case VIO_SUBTYPE_INFO: + + DBG2((vnetp, "vgen_handle_rdx_info: RDX_INFO_RCVD id (%lx)\n", + ldcp->ldc_id)); + ldcp->hstate |= RDX_INFO_RCVD; + + tagp->vio_subtype = VIO_SUBTYPE_ACK; + tagp->vio_sid = ldcp->local_sid; + /* send reply msg back to peer */ + if (vgen_sendmsg(ldcp, (caddr_t)tagp, + sizeof (vio_rdx_msg_t), B_FALSE) != VGEN_SUCCESS) { + vgen_handshake_reset(ldcp); + return; + } + + ldcp->hstate |= RDX_ACK_SENT; + DBG2((vnetp, "vgen_handle_rdx_info: RDX_ACK_SENT id (%lx)\n", + ldcp->ldc_id)); + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + + break; + + case VIO_SUBTYPE_ACK: + + ldcp->hstate |= RDX_ACK_RCVD; + + DBG2((vnetp, "vgen_handle_rdx_info: RDX_ACK_RCVD id (%lx)\n", + ldcp->ldc_id)); + + if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) { + vgen_handshake(vh_nextphase(ldcp)); + } + break; + + case VIO_SUBTYPE_NACK: + + DBG2((vnetp, "vgen_handle_rdx_info: RDX_NACK_RCVD id (%lx)\n", + ldcp->ldc_id)); + vgen_handshake_reset(ldcp); + break; + } + DBG1((vnetp, "vgen_handle_rdx_info: exit\n")); +} + +/* Handle ACK/NACK from vsw to a set multicast msg that we sent */ +static void +vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + vgen_t *vgenp = 
LDC_TO_VGEN(ldcp); + vnet_mcast_msg_t *msgp = (vnet_mcast_msg_t *)tagp; + struct ether_addr *addrp; + int count; + int i; + + DBG1((vnetp, "vgen_handle_mcast_info: enter\n")); + switch (tagp->vio_subtype) { + + case VIO_SUBTYPE_INFO: + + /* vnet shouldn't recv set mcast msg, only vsw handles it */ + DWARN((vnetp, + "vgen_handle_mcast_info: rcvd SET_MCAST_INFO id (%lx)\n", + ldcp->ldc_id)); + break; + + case VIO_SUBTYPE_ACK: + + /* success adding/removing multicast addr */ + DBG2((vnetp, + "vgen_handle_mcast_info: rcvd SET_MCAST_ACK id (%lx)\n", + ldcp->ldc_id)); + break; + + case VIO_SUBTYPE_NACK: + + DWARN((vnetp, + "vgen_handle_mcast_info: rcvd SET_MCAST_NACK id (%lx)\n", + ldcp->ldc_id)); + if (!(msgp->set)) { + /* multicast remove request failed */ + break; + } + + /* multicast add request failed */ + for (count = 0; count < msgp->count; count++) { + addrp = &(msgp->mca[count]); + + /* delete address from the table */ + for (i = 0; i < vgenp->mccount; i++) { + if (ether_cmp(addrp, + &(vgenp->mctab[i])) == 0) { + if (vgenp->mccount > 1) { + vgenp->mctab[i] = + vgenp->mctab[vgenp->mccount-1]; + } + vgenp->mccount--; + break; + } + } + } + break; + + } + DBG1((vnetp, "vgen_handle_mcast_info: exit\n")); +} + +/* handler for control messages received from the peer ldc end-point */ +static void +vgen_handle_ctrlmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + + DBG1((vnetp, "vgen_handle_ctrlmsg: enter\n")); + switch (tagp->vio_subtype_env) { + + case VIO_VER_INFO: + vgen_handle_version_negotiate(ldcp, tagp); + break; + + case VIO_ATTR_INFO: + vgen_handle_attr_info(ldcp, tagp); + break; + + case VIO_DRING_REG: + vgen_handle_dring_reg(ldcp, tagp); + break; + + case VIO_RDX: + vgen_handle_rdx_info(ldcp, tagp); + break; + + case VNET_MCAST_INFO: + vgen_handle_mcast_info(ldcp, tagp); + break; + + } + DBG1((vnetp, "vgen_handle_ctrlmsg: exit\n")); +} + +/* handler for data messages received from the peer ldc end-point */ +static void 
+vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, + mblk_t **headp, mblk_t **tailp) +{ + void *vnetp = LDC_TO_VNET(ldcp); + + DBG1((vnetp, "vgen_handle_datamsg: enter\n")); + + if (ldcp->hphase != VH_DONE) + return; + switch (tagp->vio_subtype_env) { + case VIO_DRING_DATA: + vgen_handle_dring_data(ldcp, tagp, headp, tailp); + break; + default: + break; + } + + DBG1((vnetp, "vgen_handle_datamsg: exit\n")); +} + +static void +vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, + mblk_t **headp, mblk_t **tailp) +{ + vio_dring_msg_t *dringmsg; + vnet_public_desc_t *rxdp; + vnet_public_desc_t *txdp; + vio_dring_entry_hdr_t *hdrp; + vgen_stats_t *statsp; + struct ether_header *ehp; + mblk_t *mp = NULL; + mblk_t *bp = NULL; + mblk_t *bpt = NULL; + size_t nbytes; + size_t nread; + uint64_t off = 0; + uint32_t start; + uint32_t end; + uint32_t datalen; + uint32_t ncookies; + uint32_t sync_start; + uint32_t sync_end; + uint32_t rxi; + uint32_t txi; + int rv; + boolean_t rxd_err = B_FALSE; + boolean_t sync_done = B_FALSE; +#ifdef VGEN_HANDLE_LOST_PKTS + int n; +#endif +#ifdef VGEN_REXMIT + uint64_t seqnum; + vgen_private_desc_t *tbufp; +#endif + void *vnetp = LDC_TO_VNET(ldcp); + + dringmsg = (vio_dring_msg_t *)tagp; + start = dringmsg->start_idx; + end = dringmsg->end_idx; + statsp = ldcp->statsp; + + DBG1((vnetp, "vgen_handle_dring_data: enter\n")); + switch (tagp->vio_subtype) { + + case VIO_SUBTYPE_INFO: + /* + * received a data msg, which contains the start and end + * indeces of the descriptors within the rx ring holding data, + * the seq_num of data packet corresponding to the start index, + * and the dring_ident. + * We can now read the contents of each of these descriptors + * and gather data from it. 
+ */ + DBG2((vnetp, + "vgen_handle_dring_data: INFO: start(%d), end(%d)\n", + start, end)); + + /* validate rx start and end indeces */ + if (!(CHECK_RXI(start, ldcp)) || !(CHECK_RXI(end, ldcp))) { + /* drop the message if invalid index */ + break; + } + + /* validate dring_ident */ + if (dringmsg->dring_ident != ldcp->peer_hparams.dring_ident) { + /* invalid dring_ident, drop the msg */ + break; + } +#ifdef DEBUG + if (vgen_trigger_rxlost) { + /* drop this msg to simulate lost pkts for debugging */ + vgen_trigger_rxlost = 0; + break; + } +#endif + +#ifdef VGEN_HANDLE_LOST_PKTS + + /* receive start index doesn't match expected index */ + if (ldcp->next_rxi != start) { + + DWARN((vnetp, "vgen_handle_dring_data: id(%lx) " + "next_rxi(%d) != start(%d)\n", + ldcp->ldc_id, ldcp->next_rxi, start)); + + /* calculate the number of pkts lost */ + if (start >= ldcp->next_rxi) { + n = start - ldcp->next_rxi; + } else { + n = ldcp->num_rxds - (ldcp->next_rxi - start); + } + + /* + * Starting sequence number of the received packets + * is less than the next sequence number that + * is expected: + * + * drop the message and the corresponding packets. + */ + if (ldcp->next_rxseq > dringmsg->seq_num) { + DWARN((vnetp, "vgen_handle_dring_data: id(%lx) " + "dropping pkts, expected rxseq(0x%lx) " + "> recvd(0x%lx)\n", + ldcp->ldc_id, ldcp->next_rxseq, + dringmsg->seq_num)); + /* + * duplicate/multiple retransmissions from + * sender?? drop this msg. + */ + break; + } + + /* + * Starting sequence number of the received packets + * is greater than the next expected sequence number + * + * send a NACK back to the peer to indicate lost + * packets. 
+ */ + if (dringmsg->seq_num > ldcp->next_rxseq) { + statsp->rx_lost_pkts += n; + tagp->vio_subtype = VIO_SUBTYPE_NACK; + tagp->vio_sid = ldcp->local_sid; + /* indicate the range of lost descriptors */ + dringmsg->start_idx = ldcp->next_rxi; + rxi = start; + DECR_RXI(rxi, ldcp); + dringmsg->end_idx = rxi; + /* dring ident is left unchanged */ + if (vgen_sendmsg(ldcp, (caddr_t)tagp, + sizeof (*dringmsg), B_FALSE)) { + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "vgen_sendmsg failed, " + "stype: NACK\n", ldcp->ldc_id)); + } +#ifdef VGEN_REXMIT + /* + * stop further processing until peer + * retransmits with the right index and seqnum. + */ + break; +#else /* VGEN_REXMIT */ + /* + * treat this range of descrs/pkts as dropped + * and set the new expected values for next_rxi + * and next_rxseq. continue(below) to process + * from the new start index. + */ + ldcp->next_rxi = start; + ldcp->next_rxseq += n; +#endif /* VGEN_REXMIT */ + + } else if (dringmsg->seq_num == ldcp->next_rxseq) { + /* + * expected and starting seqnums match, but + * the descriptor indeces don't? + * + * restart handshake with peer. + */ + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "next_rxseq(0x%lx) == seq_num(0x%lx)\n", + ldcp->ldc_id, ldcp->next_rxseq, + dringmsg->seq_num)); + +#if 0 + vgen_handshake_retry(ldcp); + break; +#endif + + } + + } else { + /* expected and start dring indeces match */ + + if (dringmsg->seq_num != ldcp->next_rxseq) { + + /* seqnums don't match */ + + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "next_rxseq(0x%lx) != seq_num(0x%lx)\n", + ldcp->ldc_id, ldcp->next_rxseq, + dringmsg->seq_num)); + +#if 0 + vgen_handshake_retry(ldcp); + break; +#endif + } + } + +#endif /* VGEN_HANDLE_LOST_PKTS */ + + /* + * Start processing the descriptor range, specified + * in the dring data msg. 
+ */ + if (ldc_mem_dring_acquire(ldcp->rx_dhandle, start, end)) { + DWARN((vnetp, "vgen_handle_dring_data: " + "id(%lx), ldc_mem_dring_acquire() failed\n", + ldcp->ldc_id)); + statsp->ierrors++; + } + rxi = start; + sync_start = start; + do { + /* recv packets from 'start' to 'end' */ + + rxdp = &(ldcp->rxdp[rxi]); + hdrp = &rxdp->hdr; + + datalen = rxdp->nbytes; + ncookies = rxdp->ncookies; + if ((datalen < ETHERMIN) || + (ncookies == 0) || + (ncookies > (uint64_t)MAX_COOKIES) || + (hdrp->dstate != VIO_DESC_READY)) { + rxd_err = B_TRUE; + } else { + /* + * The data buffer returned by allocb(9F) is + * 8byte aligned. We allocate extra 8 bytes to + * ensure size is multiple of 8 bytes for + * ldc_mem_copy(). + */ + mp = allocb(datalen + 8, BPRI_MED); + nbytes = (datalen + 7) & ~7; + } + if ((rxd_err) || (mp == NULL)) { + /* + * rxd_err or allocb() failure, + * drop this packet, get next. + */ + if (rxd_err) { + statsp->ierrors++; + rxd_err = B_FALSE; + } else { + statsp->rx_allocb_fail++; + } + + /* set descriptor done bit */ + hdrp->dstate = VIO_DESC_DONE; + + if (hdrp->ack) { + /* + * sender needs ack for this packet. + * sync pkts upto this index and + * send the ack to the peer. 
+ */ + sync_end = rxi; + (void) ldc_mem_dring_release( + ldcp->rx_dhandle, sync_start, + sync_end); + tagp->vio_subtype = VIO_SUBTYPE_ACK; + tagp->vio_sid = ldcp->local_sid; + dringmsg = (vio_dring_msg_t *)tagp; + dringmsg->start_idx = sync_start; + dringmsg->end_idx = sync_end; + if (vgen_sendmsg(ldcp, (caddr_t)tagp, + sizeof (*dringmsg), B_FALSE)) { + DWARN((vnetp, + "vgen_handle_dring_data: " + "id(%lx) vgen_sendmsg " + "failed, stype: ACK\n", + ldcp->ldc_id)); + } + /* save new sync index start */ + if (sync_end != end) { + INCR_RXI(sync_end, ldcp); + sync_start = sync_end; + } else + sync_done = B_TRUE; + } + goto vgen_next_rxi; + } + + nread = nbytes; + rv = ldc_mem_copy(ldcp->ldc_handle, + (caddr_t)mp->b_rptr, off, &nread, + rxdp->memcookie, ncookies, LDC_COPY_IN); + + /* set done bit irrespective of rv of ldc_mem_copy() */ + hdrp->dstate = VIO_DESC_DONE; + + if (hdrp->ack) { + /* + * sender needs ack for this packet. + * sync pkts upto this index and + * send the ack to the peer. 
+ */ + sync_end = rxi; + (void) ldc_mem_dring_release(ldcp->rx_dhandle, + sync_start, sync_end); + tagp->vio_subtype = VIO_SUBTYPE_ACK; + tagp->vio_sid = ldcp->local_sid; + dringmsg = (vio_dring_msg_t *)tagp; + dringmsg->start_idx = sync_start; + dringmsg->end_idx = sync_end; + if (vgen_sendmsg(ldcp, (caddr_t)tagp, + sizeof (*dringmsg), B_FALSE)) { + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "vgen_sendmsg failed stype: ACK\n", + ldcp->ldc_id)); + } + /* save new sync index start */ + if (sync_end != end) { + INCR_RXI(sync_end, ldcp); + sync_start = sync_end; + } else + sync_done = B_TRUE; + } + /* if ldc_mem_copy() failed */ + if (rv) { + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "ldc_mem_copy failed\n", ldcp->ldc_id)); + statsp->ierrors++; + freemsg(mp); + goto vgen_next_rxi; + } + if (nread != nbytes) { + DWARN((vnetp, + "vgen_handle_dring_data: id(%lx) " + "ldc_mem_copy nread(%lx), nbytes(%lx)\n", + ldcp->ldc_id, nread, nbytes)); + statsp->ierrors++; + freemsg(mp); + goto vgen_next_rxi; + } + + /* point to the actual end of data */ + mp->b_wptr = mp->b_rptr + datalen; + + /* update stats */ + statsp->ipackets++; + statsp->rbytes += datalen; + ehp = (struct ether_header *)mp->b_rptr; + if (IS_BROADCAST(ehp)) + statsp->brdcstrcv++; + else if (IS_MULTICAST(ehp)) + statsp->multircv++; + + /* build a chain of received packets */ + if (bp == NULL) { + /* first pkt */ + bp = mp; + bpt = bp; + bpt->b_next = NULL; + } else { + mp->b_next = NULL; + bpt->b_next = mp; + bpt = mp; + } + +vgen_next_rxi: if (rxi == end) { + break; + } + /* increment recv index */ + INCR_RXI(rxi, ldcp); + + _NOTE(CONSTCOND) + } while (1); + + if (!sync_done) { + /* sync remote descriptor range */ + sync_end = rxi; + (void) ldc_mem_dring_release(ldcp->rx_dhandle, + sync_start, sync_end); + DBG2((vnetp, + "vgen_handle_dring_data: not sending ACK\n")); + } + + /* save new recv index */ + INCR_RXI(rxi, ldcp); + ldcp->next_rxi = rxi; + ldcp->next_rxseq += ((end >= start) ? 
+ ((end - start) + 1) : (start - end)); + + /* try to reclaim transmit descrs also */ + vgen_reclaim(ldcp); + break; + + case VIO_SUBTYPE_ACK: + /* + * received an ack corresponding to a specific descriptor for + * which we had set the ACK bit in the descriptor (during + * transmit). This enables us to reclaim descriptors. + */ + DBG2((vnetp, + "vgen_handle_dring_data: ACK: start(%d), end(%d)\n", + start, end)); + + /* validate start and end indeces in the tx ack msg */ + if (!(CHECK_TXI(start, ldcp)) || !(CHECK_TXI(end, ldcp))) { + /* drop the message if invalid index */ + break; + } + /* validate dring_ident */ + if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) { + /* invalid dring_ident, drop the msg */ + break; + } + statsp->dring_data_acks++; + vgen_reclaim(ldcp); + break; + + case VIO_SUBTYPE_NACK: + /* + * peer sent a NACK msg to indicate lost packets. + * The start and end correspond to the range of descriptors + * for which the peer didn't receive a dring data msg and so + * didn't receive the corresponding data. + */ + DWARN((vnetp, + "vgen_handle_dring_data: NACK: start(%d), end(%d)\n", + start, end)); + + /* validate start and end indeces in the tx nack msg */ + if (!(CHECK_TXI(start, ldcp)) || !(CHECK_TXI(end, ldcp))) { + /* drop the message if invalid index */ + break; + } + /* validate dring_ident */ + if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) { + /* invalid dring_ident, drop the msg */ + break; + } + mutex_enter(&ldcp->txlock); + mutex_enter(&ldcp->tclock); + + if (ldcp->next_tbufp == ldcp->cur_tbufp) { + /* no busy descriptors, bogus nack ? 
*/ + mutex_exit(&ldcp->tclock); + mutex_exit(&ldcp->txlock); + break; + } + +#ifdef VGEN_REXMIT + /* send a new dring data msg including the lost descrs */ + end = ldcp->next_tbufp - ldcp->tbufp; + DECR_TXI(end, ldcp); + seqnum = ldcp->tbufp[start].seqnum; + /* no need to increment ldcp->next_txseq as this is rexmit */ + rv = vgen_send_dring_data(ldcp, start, end, seqnum); + if (rv != 0) { + /* + * vgen_send_dring_data() error: drop all packets + * in this descr range + */ + DWARN((vnetp, + "vgen_handle_dring_data: " + "vgen_send_dring_data failed :" + "id(%lx) rv(%d)\n", ldcp->ldc_id, rv)); + for (txi = start; txi <= end; ) { + tbufp = &(ldcp->tbufp[txi]); + txdp = tbufp->descp; + hdrp = &txdp->hdr; + (void) ldc_mem_unbind_handle(tbufp->memhandle); + freemsg(tbufp->mp); + tbufp->flags = VGEN_PRIV_DESC_FREE; + hdrp->dstate = VIO_DESC_FREE; + hdrp->ack = B_FALSE; + statsp->oerrors++; + } + + /* update next pointer */ + ldcp->next_tbufp = &(ldcp->tbufp[start]); + ldcp->next_txseq = seqnum; + ldcp->next_txi = start; + } + DBG2((vnetp, + "vgen_handle_dring_data: rexmit: start(%d) end(%d)\n", + start, end)); +#else /* VGEN_REXMIT */ + /* we just mark the descrs as done so they can be reclaimed */ + for (txi = start; txi <= end; ) { + txdp = &(ldcp->txdp[txi]); + hdrp = &txdp->hdr; + if (hdrp->dstate == VIO_DESC_READY) + hdrp->dstate = VIO_DESC_DONE; + INCR_TXI(txi, ldcp); + } +#endif /* VGEN_REXMIT */ + mutex_exit(&ldcp->tclock); + mutex_exit(&ldcp->txlock); + + vgen_reclaim(ldcp); + + break; + } + + DBG1((vnetp, "vgen_handle_dring_data: exit\n")); + *headp = bp; + *tailp = bpt; +} + +static void +vgen_reclaim(vgen_ldc_t *ldcp) +{ + if (mutex_tryenter(&ldcp->tclock) == 0) + return; /* already in progress */ + vgen_reclaim_dring(ldcp); + ldcp->reclaim_lbolt = ddi_get_lbolt(); + mutex_exit(&ldcp->tclock); +} + +/* + * transmit reclaim function. 
starting from the current reclaim index + * look for descriptors marked DONE and reclaim the descriptor and the + * corresponding buffers (tbuf). + */ +static void +vgen_reclaim_dring(vgen_ldc_t *ldcp) +{ + vnet_public_desc_t *txdp; + vgen_private_desc_t *tbufp; + vio_dring_entry_hdr_t *hdrp; +#ifdef VGEN_USE_MAC_TX_UPDATE + vgen_t *vgenp = (vgen_t *)ldcp->vgenp; +#endif + +#ifdef DEBUG + if (vgen_trigger_txtimeout) + return; +#endif + + tbufp = ldcp->cur_tbufp; + txdp = tbufp->descp; + hdrp = &txdp->hdr; + + while ((hdrp->dstate == VIO_DESC_DONE) && + (tbufp != ldcp->next_tbufp)) { + (void) ldc_mem_unbind_handle(tbufp->memhandle); + freemsg(tbufp->mp); + tbufp->mp = NULL; + tbufp->flags = VGEN_PRIV_DESC_FREE; + hdrp->dstate = VIO_DESC_FREE; + hdrp->ack = B_FALSE; + + tbufp = NEXTTBUF(ldcp, tbufp); + txdp = tbufp->descp; + hdrp = &txdp->hdr; + } + + ldcp->cur_tbufp = tbufp; + + /* + * Check if mac layer should be notified to restart transmissions + */ + if (ldcp->need_resched) { + ldcp->need_resched = B_FALSE; +#ifdef VGEN_USE_MAC_TX_UPDATE + mac_tx_update(vgenp->vnetmacp); +#endif + } +} + +/* return the number of pending transmits for the channel */ +static int +vgen_num_txpending(vgen_ldc_t *ldcp) +{ + int n; + + if (ldcp->next_tbufp >= ldcp->cur_tbufp) { + n = ldcp->next_tbufp - ldcp->cur_tbufp; + } else { + /* cur_tbufp > next_tbufp */ + n = ldcp->num_txds - (ldcp->cur_tbufp - ldcp->next_tbufp); + } + + return (n); +} + +/* determine if the transmit descriptor ring is full */ +static int +vgen_tx_dring_full(vgen_ldc_t *ldcp) +{ + vgen_private_desc_t *tbufp; + vgen_private_desc_t *ntbufp; + + tbufp = ldcp->next_tbufp; + ntbufp = NEXTTBUF(ldcp, tbufp); + if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */ +#if 0 + void *vnetp = LDC_TO_VNET(ldcp); + DWARN((vnetp, "vgen_tx_dring_full: id(%lx)\n", + ldcp->ldc_id)); +#endif + return (VGEN_SUCCESS); + } + return (VGEN_FAILURE); +} + +/* determine if timeout condition has occured */ +static int 
+vgen_ldc_txtimeout(vgen_ldc_t *ldcp) +{ + if (((ddi_get_lbolt() - ldcp->reclaim_lbolt) > + drv_usectohz(vnet_ldcwd_txtimeout * 1000)) && + (vnet_ldcwd_txtimeout) && + (vgen_tx_dring_full(ldcp) == VGEN_SUCCESS)) { +#if 0 + void *vnetp = LDC_TO_VNET(ldcp); + DWARN((vnetp, "vgen_ldc_txtimeout: id(%lx)\n", + ldcp->ldc_id)); +#endif + return (VGEN_SUCCESS); + } else { + return (VGEN_FAILURE); + } +} + +/* transmit watchdog timeout handler */ +static void +vgen_ldc_watchdog(void *arg) +{ + vgen_ldc_t *ldcp; + void *vnetp; + int rv; + + ldcp = (vgen_ldc_t *)arg; + vnetp = LDC_TO_VNET(ldcp); + + rv = vgen_ldc_txtimeout(ldcp); + if (rv == VGEN_SUCCESS) { + DWARN((vnetp, + "vgen_ldc_watchdog: transmit timeout ldcid(%lx)\n", + ldcp->ldc_id)); +#ifdef DEBUG + if (vgen_trigger_txtimeout) { + /* tx timeout triggered for debugging */ + vgen_trigger_txtimeout = 0; + } +#endif + mutex_enter(&ldcp->cblock); + vgen_handshake_retry(ldcp); + mutex_exit(&ldcp->cblock); + if (ldcp->need_resched) { + ldcp->need_resched = B_FALSE; +#ifdef VGEN_USE_MAC_TX_UPDATE + mac_tx_update(ldcp->vgenp->vnetmacp); +#endif + } + } + + ldcp->wd_tid = timeout(vgen_ldc_watchdog, (caddr_t)ldcp, + drv_usectohz(vnet_ldcwd_interval * 1000)); +} + +/* based on mcopymsg() */ +static void +vgen_copymsg(mblk_t *mp, void *bufp) +{ + caddr_t dest = bufp; + mblk_t *bp; + size_t n; + + for (bp = mp; bp != NULL; bp = bp->b_cont) { + n = MBLKL(bp); + bcopy(bp->b_rptr, dest, n); + dest += n; + } +} + +static int +vgen_setup_kstats(vgen_ldc_t *ldcp) +{ + vgen_t *vgenp; + struct kstat *ksp; + vgen_stats_t *statsp; + vgen_kstats_t *ldckp; + int instance; + size_t size; + char name[MAXNAMELEN]; + + vgenp = LDC_TO_VGEN(ldcp); + instance = ddi_get_instance(vgenp->vnetdip); + (void) sprintf(name, "vnetldc0x%lx", ldcp->ldc_id); + statsp = kmem_zalloc(sizeof (vgen_stats_t), KM_SLEEP); + if (statsp == NULL) { + return (VGEN_FAILURE); + } + size = sizeof (vgen_kstats_t) / sizeof (kstat_named_t); + ksp = kstat_create("vnet", 
instance, name, "net", KSTAT_TYPE_NAMED, + size, 0); + if (ksp == NULL) { + KMEM_FREE(statsp); + return (VGEN_FAILURE); + } + + ldckp = (vgen_kstats_t *)ksp->ks_data; + kstat_named_init(&ldckp->ipackets, "ipackets", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->ipackets64, "ipackets64", + KSTAT_DATA_ULONGLONG); + kstat_named_init(&ldckp->ierrors, "ierrors", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->opackets, "opackets", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->opackets64, "opackets64", + KSTAT_DATA_ULONGLONG); + kstat_named_init(&ldckp->oerrors, "oerrors", + KSTAT_DATA_ULONG); + + + /* MIB II kstat variables */ + kstat_named_init(&ldckp->rbytes, "rbytes", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->rbytes64, "rbytes64", + KSTAT_DATA_ULONGLONG); + kstat_named_init(&ldckp->obytes, "obytes", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->obytes64, "obytes64", + KSTAT_DATA_ULONGLONG); + kstat_named_init(&ldckp->multircv, "multircv", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->multixmt, "multixmt", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->brdcstrcv, "brdcstrcv", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->brdcstxmt, "brdcstxmt", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->norcvbuf, "norcvbuf", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->noxmtbuf, "noxmtbuf", + KSTAT_DATA_ULONG); + + /* Tx stats */ + kstat_named_init(&ldckp->tx_no_desc, "tx_no_desc", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->tx_allocb_fail, "tx_allocb_fail", + KSTAT_DATA_ULONG); + + /* Rx stats */ + kstat_named_init(&ldckp->rx_no_desc, "rx_no_desc", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->rx_allocb_fail, "rx_allocb_fail", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->rx_lost_pkts, "rx_lost_pkts", + KSTAT_DATA_ULONG); + + /* Interrupt stats */ + kstat_named_init(&ldckp->callbacks, "callbacks", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->dring_data_acks, "dring_data_acks", + KSTAT_DATA_ULONG); + + ksp->ks_update = vgen_kstat_update; + 
ksp->ks_private = (void *)ldcp; + kstat_install(ksp); + + ldcp->ksp = ksp; + ldcp->statsp = statsp; + return (VGEN_SUCCESS); +} + +static void +vgen_destroy_kstats(vgen_ldc_t *ldcp) +{ + if (ldcp->ksp) + kstat_delete(ldcp->ksp); + KMEM_FREE(ldcp->statsp); +} + +static int +vgen_kstat_update(kstat_t *ksp, int rw) +{ + vgen_ldc_t *ldcp; + vgen_stats_t *statsp; + vgen_kstats_t *ldckp; + + ldcp = (vgen_ldc_t *)ksp->ks_private; + statsp = ldcp->statsp; + ldckp = (vgen_kstats_t *)ksp->ks_data; + + if (rw == KSTAT_READ) { + ldckp->ipackets.value.ul = (uint32_t)statsp->ipackets; + ldckp->ipackets64.value.ull = statsp->ipackets; + ldckp->ierrors.value.ul = statsp->ierrors; + ldckp->opackets.value.ul = (uint32_t)statsp->opackets; + ldckp->opackets64.value.ull = statsp->opackets; + ldckp->oerrors.value.ul = statsp->oerrors; + + /* + * MIB II kstat variables + */ + ldckp->rbytes.value.ul = (uint32_t)statsp->rbytes; + ldckp->rbytes64.value.ull = statsp->rbytes; + ldckp->obytes.value.ul = (uint32_t)statsp->obytes; + ldckp->obytes64.value.ull = statsp->obytes; + ldckp->multircv.value.ul = statsp->multircv; + ldckp->multixmt.value.ul = statsp->multixmt; + ldckp->brdcstrcv.value.ul = statsp->brdcstrcv; + ldckp->brdcstxmt.value.ul = statsp->brdcstxmt; + ldckp->norcvbuf.value.ul = statsp->norcvbuf; + ldckp->noxmtbuf.value.ul = statsp->noxmtbuf; + + ldckp->tx_no_desc.value.ul = statsp->tx_no_desc; + ldckp->tx_allocb_fail.value.ul = statsp->tx_allocb_fail; + + ldckp->rx_no_desc.value.ul = statsp->rx_no_desc; + ldckp->rx_allocb_fail.value.ul = statsp->rx_allocb_fail; + ldckp->rx_lost_pkts.value.ul = statsp->rx_lost_pkts; + + ldckp->callbacks.value.ul = statsp->callbacks; + ldckp->dring_data_acks.value.ul = statsp->dring_data_acks; + } else { + statsp->ipackets = ldckp->ipackets64.value.ull; + statsp->ierrors = ldckp->ierrors.value.ul; + statsp->opackets = ldckp->opackets64.value.ull; + statsp->oerrors = ldckp->oerrors.value.ul; + + /* + * MIB II kstat variables + */ + statsp->rbytes = 
ldckp->rbytes64.value.ull; + statsp->obytes = ldckp->obytes64.value.ull; + statsp->multircv = ldckp->multircv.value.ul; + statsp->multixmt = ldckp->multixmt.value.ul; + statsp->brdcstrcv = ldckp->brdcstrcv.value.ul; + statsp->brdcstxmt = ldckp->brdcstxmt.value.ul; + statsp->norcvbuf = ldckp->norcvbuf.value.ul; + statsp->noxmtbuf = ldckp->noxmtbuf.value.ul; + + statsp->tx_no_desc = ldckp->tx_no_desc.value.ul; + statsp->tx_allocb_fail = ldckp->tx_allocb_fail.value.ul; + + statsp->rx_no_desc = ldckp->rx_no_desc.value.ul; + statsp->rx_allocb_fail = ldckp->rx_allocb_fail.value.ul; + statsp->rx_lost_pkts = ldckp->rx_lost_pkts.value.ul; + + statsp->callbacks = ldckp->callbacks.value.ul; + statsp->dring_data_acks = ldckp->dring_data_acks.value.ul; + } + + return (VGEN_SUCCESS); +} + +/* handler for error messages received from the peer ldc end-point */ +static void +vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + _NOTE(ARGUNUSED(ldcp, tagp)) +} + +/* Check if the session id in the received message is valid */ +static int +vgen_check_sid(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) +{ + if (tagp->vio_sid != ldcp->peer_sid) { + void *vnetp = LDC_TO_VNET(ldcp); + DWARN((vnetp, + "sid mismatch: expected(%x), rcvd(%x)\n", + ldcp->peer_sid, tagp->vio_sid)); + return (VGEN_FAILURE); + } + else + return (VGEN_SUCCESS); +} + +/* convert mac address from string to uint64_t */ +static uint64_t +vgen_macaddr_strtoul(const uint8_t *macaddr) +{ + uint64_t val = 0; + int i; + +#if 0 + for (i = ETHERADDRL - 1; i >= 0; i--) { +#endif + for (i = 0; i < ETHERADDRL; i++) { + val <<= 8; + val |= macaddr[i]; + } + +#if 0 + cmn_err(CE_CONT, "vgen_macaddr_strtoul: str(%x:%x:%x:%x:%x:%x)\n", + macaddr[0], macaddr[1], macaddr[2], + macaddr[3], macaddr[4], macaddr[5]); + cmn_err(CE_CONT, "vgen_macaddr_strtoul: val(0x%lx)\n", val); +#endif + return (val); +} + +/* convert mac address from uint64_t to string */ +static int +vgen_macaddr_ultostr(uint64_t val, uint8_t *macaddr) +{ + int i; + 
uint64_t value; + + value = val; +#if 0 + for (i = 0; i < ETHERADDRL; i++) { +#endif + for (i = ETHERADDRL - 1; i >= 0; i--) { + macaddr[i] = value & 0xFF; + value >>= 8; + } +#if 0 + cmn_err(CE_CONT, "vgen_macaddr_ultostr: val(0x%lx)\n", val); + cmn_err(CE_CONT, "vgen_macaddr_ultostr: str(%x:%x:%x:%x:%x:%x)\n", + macaddr[0], macaddr[1], macaddr[2], + macaddr[3], macaddr[4], macaddr[5]); +#endif + return (VGEN_SUCCESS); +} + +static caddr_t +vgen_print_ethaddr(uint8_t *a, char *ebuf) +{ + (void) sprintf(ebuf, + "%x:%x:%x:%x:%x:%x", a[0], a[1], a[2], a[3], a[4], a[5]); + return (ebuf); +} + +/* Handshake watchdog timeout handler */ +static void +vgen_hwatchdog(void *arg) +{ + vgen_ldc_t *ldcp = (vgen_ldc_t *)arg; + void *vnetp = LDC_TO_VNET(ldcp); + + DWARN((vnetp, + "vgen_hwatchdog: handshake timeout ldc(%lx) phase(%x) state(%x)\n", + ldcp->ldc_id, ldcp->hphase, ldcp->hstate)); + + mutex_enter(&ldcp->cblock); + ldcp->htid = 0; + vgen_handshake_retry(ldcp); + mutex_exit(&ldcp->cblock); +} + +static void +vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint) +{ + vgen_hparams_t *hp; + char ep[8]; + uint8_t addr[6]; + char ea[6]; + + if (endpoint == VGEN_LOCAL) { + hp = &ldcp->local_hparams; + (void) sprintf(ep, "Local"); + } else { + hp = &ldcp->peer_hparams; + (void) sprintf(ep, "Peer"); + } + (void) vgen_macaddr_ultostr(hp->addr, addr); + cmn_err(CE_CONT, "attr_info: %s: \n", ep); + cmn_err(CE_CONT, "\tMTU: %lx, addr: %s\n", hp->mtu, + vgen_print_ethaddr(addr, ea)); + cmn_err(CE_CONT, "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n", + hp->addr_type, hp->xfer_mode, hp->ack_freq); +} + +static void +vgen_print_hparams(vgen_hparams_t *hp) +{ + uint8_t addr[6]; + char ea[6]; + ldc_mem_cookie_t *dc; + + cmn_err(CE_CONT, "version_info:\n"); + cmn_err(CE_CONT, + "\tver_major: %d, ver_minor: %d, dev_class: %d\n", + hp->ver_major, hp->ver_minor, hp->dev_class); + + (void) vgen_macaddr_ultostr(hp->addr, addr); + cmn_err(CE_CONT, "attr_info:\n"); + cmn_err(CE_CONT, "\tMTU: 
%lx, addr: %s\n", hp->mtu, + vgen_print_ethaddr(addr, ea)); + cmn_err(CE_CONT, + "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n", + hp->addr_type, hp->xfer_mode, hp->ack_freq); + + dc = &hp->dring_cookie; + cmn_err(CE_CONT, "dring_info:\n"); + cmn_err(CE_CONT, + "\tlength: %d, dsize: %d\n", hp->num_desc, hp->desc_size); + cmn_err(CE_CONT, + "\tldc_addr: 0x%lx, ldc_size: %ld\n", + dc->addr, dc->size); + cmn_err(CE_CONT, "\tdring_ident: 0x%lx\n", hp->dring_ident); +} + +static void +vgen_print_ldcinfo(vgen_ldc_t *ldcp) +{ + vgen_hparams_t *hp; + + cmn_err(CE_CONT, "Channel Information:\n"); + cmn_err(CE_CONT, + "\tldc_id: 0x%lx, ldc_status: 0x%x\n", + ldcp->ldc_id, ldcp->ldc_status); + cmn_err(CE_CONT, + "\tlocal_sid: 0x%x, peer_sid: 0x%x\n", + ldcp->local_sid, ldcp->peer_sid); + cmn_err(CE_CONT, + "\thphase: 0x%x, hstate: 0x%x\n", + ldcp->hphase, ldcp->hstate); + + cmn_err(CE_CONT, "Local handshake params:\n"); + hp = &ldcp->local_hparams; + vgen_print_hparams(hp); + + cmn_err(CE_CONT, "Peer handshake params:\n"); + hp = &ldcp->peer_hparams; + vgen_print_hparams(hp); +} diff --git a/usr/src/uts/sun4v/io/vnex.c b/usr/src/uts/sun4v/io/vnex.c index 1eb0856a8e..e30506c23a 100644 --- a/usr/src/uts/sun4v/io/vnex.c +++ b/usr/src/uts/sun4v/io/vnex.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -96,7 +95,8 @@ static struct vnex_pil_map vnex_name_to_pil[] = { {"loop", PIL_3}, {"sunmc", PIL_3}, {"sunvts", PIL_3}, - {"explorer", PIL_3} + {"explorer", PIL_3}, + {"ncp", PIL_8} }; #define VNEX_MAX_DEVS (sizeof (vnex_name_to_pil) / \ @@ -423,6 +423,20 @@ vnex_disable_intr(dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp) return (DDI_SUCCESS); } +int +vnex_ino_to_inum(dev_info_t *dip, uint32_t ino) +{ + vnex_id_t *vid_p; + ddi_intr_handle_impl_t *hdlp; + + if ((vid_p = vnex_locate_id(dip, ino)) == NULL) + return (-1); + else if ((hdlp = vid_p->vid_ddi_hdlp) == NULL) + return (-1); + else + return (hdlp->ih_inum); +} + static int vnex_add_intr(dev_info_t *dip, dev_info_t *rdip, ddi_intr_handle_impl_t *hdlp) diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c new file mode 100644 index 0000000000..6038ea2874 --- /dev/null +++ b/usr/src/uts/sun4v/io/vsw.c @@ -0,0 +1,6959 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/strsubr.h> +#include <sys/cmn_err.h> +#include <sys/cpu.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/stat.h> +#include <sys/kstat.h> +#include <sys/vtrace.h> +#include <sys/strsun.h> +#include <sys/dlpi.h> +#include <sys/ethernet.h> +#include <net/if.h> +#include <sys/varargs.h> +#include <sys/machsystm.h> +#include <sys/modctl.h> +#include <sys/modhash.h> +#include <sys/mac.h> +#include <sys/taskq.h> +#include <sys/note.h> +#include <sys/mach_descrip.h> +#include <sys/mac.h> +#include <sys/mdeg.h> +#include <sys/ldc.h> +#include <sys/vsw_fdb.h> +#include <sys/vsw.h> +#include <sys/vio_mailbox.h> +#include <sys/vnet_mailbox.h> +#include <sys/vnet_common.h> + +/* + * Function prototypes. 
+ */ +static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); +static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); +static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static void vsw_get_md_properties(vsw_t *vswp); +static int vsw_setup_layer2(vsw_t *); +static int vsw_setup_layer3(vsw_t *); + +/* MAC layer routines */ +static int vsw_mac_attach(vsw_t *vswp); +static void vsw_mac_detach(vsw_t *vswp); +static void vsw_notify_cb(void *, mac_notify_type_t); +static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *); +static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); +static int vsw_mac_register(vsw_t *); +static int vsw_mac_unregister(vsw_t *); +static uint64_t vsw_m_stat(void *arg, enum mac_stat); +static void vsw_m_stop(void *arg); +static int vsw_m_start(void *arg); +static int vsw_m_unicst(void *arg, const uint8_t *); +static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *); +static int vsw_m_promisc(void *arg, boolean_t); +static mblk_t *vsw_m_tx(void *arg, mblk_t *); +static void vsw_m_resources(void *arg); +static void vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp); + +/* MDEG routines */ +static void vsw_mdeg_register(vsw_t *vswp); +static void vsw_mdeg_unregister(vsw_t *vswp); +static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *); + +/* Port add/deletion routines */ +static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); +static int vsw_port_attach(vsw_t *vswp, int p_instance, + uint64_t *ldcids, int nids, struct ether_addr *macaddr); +static int vsw_detach_ports(vsw_t *vswp); +static int vsw_port_detach(vsw_t *vswp, int p_instance); +static int vsw_port_delete(vsw_port_t *port); +static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id); +static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id); +static int vsw_init_ldcs(vsw_port_t *port); +static int vsw_uninit_ldcs(vsw_port_t *port); +static int vsw_ldc_init(vsw_ldc_t *ldcp); +static int vsw_ldc_uninit(vsw_ldc_t *ldcp); +static int 
vsw_drain_ldcs(vsw_port_t *port); +static int vsw_drain_port_taskq(vsw_port_t *port); +static void vsw_marker_task(void *); +static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); +static int vsw_plist_del_node(vsw_t *, vsw_port_t *port); + +/* Interrupt routines */ +static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg); + +/* Handshake routines */ +static void vsw_restart_handshake(vsw_ldc_t *); +static int vsw_check_flag(vsw_ldc_t *, int, uint64_t); +static void vsw_next_milestone(vsw_ldc_t *); +static int vsw_supported_version(vio_ver_msg_t *); + +/* Data processing routines */ +static void vsw_process_pkt(void *); +static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t); +static void vsw_process_ctrl_pkt(void *); +static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *); +static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *); +static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *); +static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *); +static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *); +static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *); +static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); +static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *); +static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *); +static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *); +static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); + +/* Switching/data transmit routines */ +static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t); +static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t); +static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port); +static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port); +static int vsw_portsend(vsw_port_t *, mblk_t *); +static int vsw_dringsend(vsw_ldc_t *, mblk_t *); +static 
int vsw_descrsend(vsw_ldc_t *, mblk_t *); + +/* Packet creation routines */ +static void vsw_send_ver(vsw_ldc_t *); +static void vsw_send_attr(vsw_ldc_t *); +static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *); +static void vsw_send_dring_info(vsw_ldc_t *); +static void vsw_send_rdx(vsw_ldc_t *); + +static void vsw_send_msg(vsw_ldc_t *, void *, int); + +/* Forwarding database (FDB) routines */ +static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); +static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); +static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); +static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); +static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); +static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); +static void vsw_del_addr(uint8_t, void *, uint64_t); +static void vsw_del_mcst_port(vsw_port_t *); +static void vsw_del_mcst_vsw(vsw_t *); + +/* Dring routines */ +static dring_info_t *vsw_create_dring(vsw_ldc_t *); +static void vsw_create_privring(vsw_ldc_t *); +static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp); +static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, + int *); +static void vsw_dring_priv2pub(vsw_private_desc_t *); +static dring_info_t *vsw_ident2dring(lane_t *, uint64_t); + +static void vsw_set_lane_attr(vsw_t *, lane_t *); +static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *); +static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg); +static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *); +static int vsw_check_dring_info(vio_dring_reg_msg_t *); + +/* Misc support routines */ +static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); + +static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); +static int vsw_free_ring(dring_info_t *); + +/* Debugging routines */ +static void dump_flags(uint64_t); +static void display_state(void); +static void display_lane(lane_t *); +static void 
display_ring(dring_info_t *); + +int vsw_num_handshakes = 3; /* # of handshake attempts */ +int vsw_wretries = 100; /* # of write attempts */ + +/* + * mode specific frame switching function + */ +void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *, + mac_resource_handle_t); + +static struct cb_ops vsw_cb_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* rev */ + nodev, /* int (*cb_aread)() */ + nodev /* int (*cb_awrite)() */ +}; + +static struct dev_ops vsw_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + vsw_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + vsw_attach, /* devo_attach */ + vsw_detach, /* devo_detach */ + nodev, /* devo_reset */ + &vsw_cb_ops, /* devo_cb_ops */ + (struct bus_ops *)NULL, /* devo_bus_ops */ + ddi_power /* devo_power */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv vswmodldrv = { + &mod_driverops, + "sun4v Virtual Switch Driver %I%", + &vsw_ops, +}; + +#define LDC_ENTER_LOCK(ldcp) \ + mutex_enter(&((ldcp)->ldc_cblock));\ + mutex_enter(&((ldcp)->ldc_txlock)); +#define LDC_EXIT_LOCK(ldcp) \ + mutex_exit(&((ldcp)->ldc_txlock));\ + mutex_exit(&((ldcp)->ldc_cblock)); + +/* Driver soft state ptr */ +static void *vsw_state; + +/* + * Linked list of "vsw_t" structures - one per instance. 
+ */ +vsw_t *vsw_head = NULL; +krwlock_t vsw_rw; + +/* + * Property names + */ +static char vdev_propname[] = "virtual-device"; +static char vsw_propname[] = "virtual-network-switch"; +static char physdev_propname[] = "vsw-phys-dev"; +static char smode_propname[] = "vsw-switch-mode"; +static char macaddr_propname[] = "local-mac-address"; +static char remaddr_propname[] = "remote-mac-address"; +static char ldcids_propname[] = "ldc-ids"; +static char chan_propname[] = "channel-endpoint"; +static char id_propname[] = "id"; +static char reg_propname[] = "reg"; + +/* supported versions */ +static ver_sup_t vsw_versions[] = { {1, 0} }; + +/* + * Matching criteria passed to the MDEG to register interest + * in changes to 'virtual-device-port' nodes identified by their + * 'id' property. + */ +static md_prop_match_t vport_prop_match[] = { + { MDET_PROP_VAL, "id" }, + { MDET_LIST_END, NULL } +}; + +static mdeg_node_match_t vport_match = { "virtual-device-port", + vport_prop_match }; + +/* + * Specification of an MD node passed to the MDEG to filter any + * 'vport' nodes that do not belong to the specified node. This + * template is copied for each vsw instance and filled in with + * the appropriate 'cfg-handle' value before being passed to the MDEG. + */ +static mdeg_prop_spec_t vsw_prop_template[] = { + { MDET_PROP_STR, "name", vsw_propname }, + { MDET_PROP_VAL, "cfg-handle", NULL }, + { MDET_LIST_END, NULL, NULL } +}; + +#define VSW_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); + +/* + * Print debug messages - set to 0x1f to enable all msgs + * or 0x0 to turn all off. + */ +int vswdbg = 0x0; + +/* + * debug levels: + * 0x01: Function entry/exit tracing + * 0x02: Internal function messages + * 0x04: Verbose internal messages + * 0x08: Warning messages + * 0x10: Error messages + */ + +static void +vswdebug(vsw_t *vswp, const char *fmt, ...) 
+{ + char buf[512]; + va_list ap; + + va_start(ap, fmt); + (void) vsprintf(buf, fmt, ap); + va_end(ap); + + if (vswp == NULL) + cmn_err(CE_CONT, "%s\n", buf); + else + cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf); +} + +/* + * For the moment the state dump routines have their own + * private flag. + */ +#define DUMP_STATE 0 + +#if DUMP_STATE + +#define DUMP_TAG(tag) \ +{ \ + D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \ + D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \ + D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \ +} + +#define DUMP_TAG_PTR(tag) \ +{ \ + D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \ + D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \ + D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \ +} + +#define DUMP_FLAGS(flags) dump_flags(flags); +#define DISPLAY_STATE() display_state() + +#else + +#define DUMP_TAG(tag) +#define DUMP_TAG_PTR(tag) +#define DUMP_FLAGS(state) +#define DISPLAY_STATE() + +#endif /* DUMP_STATE */ + +#ifdef DEBUG + +#define D1 \ +if (vswdbg & 0x01) \ + vswdebug + +#define D2 \ +if (vswdbg & 0x02) \ + vswdebug + +#define D3 \ +if (vswdbg & 0x04) \ + vswdebug + +#define DWARN \ +if (vswdbg & 0x08) \ + vswdebug + +#define DERR \ +if (vswdbg & 0x10) \ + vswdebug + +#else + +#define DERR if (0) vswdebug +#define DWARN if (0) vswdebug +#define D1 if (0) vswdebug +#define D2 if (0) vswdebug +#define D3 if (0) vswdebug + +#endif /* DEBUG */ + +static struct modlinkage modlinkage = { + MODREV_1, + &vswmodldrv, + NULL +}; + +int +_init(void) +{ + int status; + + rw_init(&vsw_rw, NULL, RW_DRIVER, NULL); + + status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1); + if (status != 0) { + return (status); + } + + mac_init_ops(&vsw_ops, "vsw"); + status = mod_install(&modlinkage); + if (status != 0) { + ddi_soft_state_fini(&vsw_state); + } + return (status); +} + +int +_fini(void) +{ + int status; + + status = mod_remove(&modlinkage); + if (status != 0) + return (status); 
+ mac_fini_ops(&vsw_ops); + ddi_soft_state_fini(&vsw_state); + + rw_destroy(&vsw_rw); + + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + vsw_t *vswp; + int smode, instance, i; + char hashname[MAXNAMELEN]; + char qname[TASKQ_NAMELEN]; + int rv = 1; + enum { PROG_init = 0x0, PROG_if_lock = 0x1, + PROG_fdb = 0x2, PROG_mfdb = 0x4, + PROG_report_dev = 0x8, PROG_plist = 0x10, + PROG_taskq = 0x20} + progress; + + progress = PROG_init; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + /* nothing to do for this non-device */ + return (DDI_SUCCESS); + case DDI_PM_RESUME: + default: + return (DDI_FAILURE); + } + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) { + DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance); + return (DDI_FAILURE); + } + vswp = ddi_get_soft_state(vsw_state, instance); + + if (vswp == NULL) { + DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance); + goto vsw_attach_fail; + } + + vswp->dip = dip; + vswp->instance = instance; + ddi_set_driver_private(dip, (caddr_t)vswp); + + rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); + + progress |= PROG_if_lock; + + /* + * User specifies (via MD) an array of switching modes in + * decreasing order of preference. Default mode is always + * layer 2 (mac switching), so init array with that value. + */ + vswp->smode_idx = 0; + for (i = 0; i < NUM_SMODES; i++) + vswp->smode[i] = VSW_LAYER2; + + /* + * Get the various properties such as physical device name + * (vsw-phys-dev), switch mode etc from the MD. 
+ */ + vsw_get_md_properties(vswp); + + /* setup the unicast forwarding database */ + (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d", + vswp->instance); + D2(vswp, "creating unicast hash table (%s)...", hashname); + vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, + mod_hash_null_valdtor, sizeof (void *)); + + progress |= PROG_fdb; + + /* setup the multicast fowarding database */ + (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d", + vswp->instance); + D2(vswp, "creating multicast hash table %s)...", hashname); + rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); + vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, + mod_hash_null_valdtor, sizeof (void *)); + + progress |= PROG_mfdb; + + /* + * create lock protecting list of multicast addresses + * which could come via m_multicst() entry point when plumbed. + */ + mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); + vswp->mcap = NULL; + + ddi_report_dev(vswp->dip); + + progress |= PROG_report_dev; + + WRITE_ENTER(&vsw_rw); + vswp->next = vsw_head; + vsw_head = vswp; + RW_EXIT(&vsw_rw); + + /* setup the port list */ + rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); + vswp->plist.head = NULL; + + progress |= PROG_plist; + + /* + * Create the taskq which will process all the VIO + * control messages. 
+ */ + (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance); + if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + cmn_err(CE_WARN, "Unable to create task queue"); + goto vsw_attach_fail; + } + + progress |= PROG_taskq; + + /* select best switching mode */ + for (i = 0; i < NUM_SMODES; i++) { + smode = vswp->smode[i]; + switch (smode) { + case VSW_LAYER2: + rv = vsw_setup_layer2(vswp); + break; + + case VSW_LAYER2_PROMISC: + rv = vsw_setup_layer2(vswp); + break; + + case VSW_LAYER3: + rv = vsw_setup_layer3(vswp); + break; + + default: + DERR(vswp, "unknown switch mode"); + break; + } + + if (rv == 0) { + vswp->smode_idx = i; + break; + } + } + + if (rv == 1) { + cmn_err(CE_WARN, "Unable to setup switching mode"); + goto vsw_attach_fail; + } + + D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]); + + /* + * Register with the MAC layer as a network device so + * we can be plumbed if desired. + * + * Do this in both layer 2 and layer 3 mode. + */ + vswp->if_state &= ~VSW_IF_UP; + vswp->if_macp = NULL; + vswp->if_mrh = NULL; + if (vswp->mdprops & VSW_MD_MACADDR) { + if (vsw_mac_register(vswp) != 0) { + cmn_err(CE_WARN, "Unable to register as provider " + " with MAC layer, continuing with attach"); + } + } + + /* + * Now we have everything setup, register for MD change + * events. 
+ */ + vsw_mdeg_register(vswp); + + return (DDI_SUCCESS); + +vsw_attach_fail: + DERR(NULL, "vsw_attach: failed"); + + if (progress & PROG_taskq) + ddi_taskq_destroy(vswp->taskq_p); + + if (progress & PROG_plist) + rw_destroy(&vswp->plist.lockrw); + + if (progress & PROG_report_dev) { + ddi_remove_minor_node(dip, NULL); + mutex_destroy(&vswp->mca_lock); + } + + if (progress & PROG_mfdb) { + mod_hash_destroy_hash(vswp->mfdb); + vswp->mfdb = NULL; + rw_destroy(&vswp->mfdbrw); + } + + if (progress & PROG_fdb) { + mod_hash_destroy_hash(vswp->fdb); + vswp->fdb = NULL; + } + + if (progress & PROG_if_lock) + rw_destroy(&vswp->if_lockrw); + + ddi_soft_state_free(vsw_state, instance); + return (DDI_FAILURE); +} + +static int +vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + vsw_t **vswpp, *vswp; + int instance; + + instance = ddi_get_instance(dip); + vswp = ddi_get_soft_state(vsw_state, instance); + + if (vswp == NULL) { + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + case DDI_PM_SUSPEND: + default: + return (DDI_FAILURE); + } + + D2(vswp, "detaching instance %d", instance); + + if (vswp->mdprops & VSW_MD_MACADDR) { + if (vsw_mac_unregister(vswp) != 0) { + cmn_err(CE_WARN, "Unable to detach from MAC layer"); + return (DDI_FAILURE); + } + } + rw_destroy(&vswp->if_lockrw); + + vsw_mdeg_unregister(vswp); + + if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || + (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) { + vsw_mac_detach(vswp); + } + + if (vsw_detach_ports(vswp) != 0) { + cmn_err(CE_WARN, "Unable to detach ports"); + return (DDI_FAILURE); + } + + /* + * Remove this instance from any entries it may be on in + * the hash table by using the list of addresses maintained + * in the vsw_t structure. 
+ */ + vsw_del_mcst_vsw(vswp); + + vswp->mcap = NULL; + mutex_destroy(&vswp->mca_lock); + + /* + * By now any pending tasks have finished and the underlying + * ldc's have been destroyed, so its safe to delete the control + * message taskq. + */ + if (vswp->taskq_p != NULL) + ddi_taskq_destroy(vswp->taskq_p); + + /* + * At this stage all the data pointers in the hash table + * should be NULL, as all the ports have been removed and will + * have deleted themselves from the port lists which the data + * pointers point to. Hence we can destroy the table using the + * default destructors. + */ + D2(vswp, "vsw_detach: destroying hash tables.."); + mod_hash_destroy_hash(vswp->fdb); + vswp->fdb = NULL; + + WRITE_ENTER(&vswp->mfdbrw); + mod_hash_destroy_hash(vswp->mfdb); + vswp->mfdb = NULL; + RW_EXIT(&vswp->mfdbrw); + rw_destroy(&vswp->mfdbrw); + + ddi_remove_minor_node(dip, NULL); + + rw_destroy(&vswp->plist.lockrw); + WRITE_ENTER(&vsw_rw); + for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) { + if (*vswpp == vswp) { + *vswpp = vswp->next; + break; + } + } + RW_EXIT(&vsw_rw); + ddi_soft_state_free(vsw_state, instance); + + return (DDI_SUCCESS); +} + +static int +vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + _NOTE(ARGUNUSED(dip)) + + vsw_t *vswp = NULL; + dev_t dev = (dev_t)arg; + int instance; + + instance = getminor(dev); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) { + *result = NULL; + return (DDI_FAILURE); + } + *result = vswp->dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + + default: + *result = NULL; + return (DDI_FAILURE); + } +} + +/* + * Get the properties from our MD node. 
+ */ +static void +vsw_get_md_properties(vsw_t *vswp) +{ + md_t *mdp = NULL; + int num_nodes = 0; + int len = 0, listsz = 0; + int num_vdev = 0; + int i, idx; + boolean_t found_node = B_FALSE; + char *smode = NULL; + char *curr_mode = NULL; + char *physname = NULL; + char *node_name = NULL; + char *dev; + uint64_t macaddr = 0; + uint64_t md_inst, obp_inst; + mde_cookie_t *listp = NULL; + mde_cookie_t rootnode; + + D1(vswp, "%s: enter", __func__); + + /* + * Further down we compare the obp 'reg' property to the + * 'cfg-handle' property in the vsw MD node to determine + * if the node refers to this particular instance. So if + * we can't read the obp value then there is no point + * in proceeding further. + */ + if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip, + DDI_PROP_DONTPASS, reg_propname) != 1) { + cmn_err(CE_WARN, "Unable to read %s property " + "from OBP device node", reg_propname); + return; + } + + obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, + DDI_PROP_DONTPASS, reg_propname, 0); + + D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst); + + if ((mdp = md_get_handle()) == NULL) { + DERR(vswp, "%s: unable to init MD", __func__); + return; + } + + if ((num_nodes = md_node_count(mdp)) <= 0) { + DERR(vswp, "%s: invalid number of nodes found %d", + __func__, num_nodes); + (void) md_fini_handle(mdp); + return; + } + + D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes); + + /* allocate enough space for node list */ + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_SLEEP); + + rootnode = md_root_node(mdp); + + /* Get the list of virtual devices */ + num_vdev = md_scan_dag(mdp, rootnode, + md_find_name(mdp, vdev_propname), + md_find_name(mdp, "fwd"), listp); + + if (num_vdev <= 0) { + DERR(vswp, "%s: didn't find any virtual-device nodes in MD", + __func__); + goto md_prop_exit; + } + + D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev); + + /* Look for the virtual switch nodes in the list */ + for (idx = 0; 
idx < num_vdev; idx++) { + if (md_get_prop_str(mdp, listp[idx], + "name", &node_name) != 0) { + DERR(vswp, "%s: unable to get node name", __func__); + continue; + + } + + if (strcmp(node_name, vsw_propname) == 0) { + /* Virtual switch node */ + if (md_get_prop_val(mdp, listp[idx], + "cfg-handle", &md_inst) != 0) { + DERR(vswp, "%s: unable to get cfg-handle from" + " node %d", __func__, idx); + goto md_prop_exit; + } else if (md_inst == obp_inst) { + D2(vswp, "%s: found matching node (%d)" + " 0x%llx == 0x%llx", __func__, idx, + md_inst, obp_inst); + found_node = B_TRUE; + break; + } + } + } + + if (!found_node) { + DWARN(vswp, "%s: couldn't find correct vsw node", __func__); + goto md_prop_exit; + } + + /* + * Now, having found the correct node, get the various properties. + */ + + if (md_get_prop_data(mdp, listp[idx], physdev_propname, + (uint8_t **)(&physname), &len) != 0) { + cmn_err(CE_WARN, "%s: unable to get name(s) of physical " + "device(s) from MD", __func__); + } else if ((strlen(physname) + 1) > LIFNAMSIZ) { + cmn_err(CE_WARN, "%s is too long a device name", physname); + } else { + (void) strncpy(vswp->physname, physname, strlen(physname) + 1); + vswp->mdprops |= VSW_MD_PHYSNAME; + D2(vswp, "%s: using first device specified (%s)", + __func__, vswp->physname); + } + + +#ifdef DEBUG + /* + * As a temporary measure to aid testing we check to see if there + * is a vsw.conf file present. If there is we use the value of the + * vsw_physname property in the file as the name of the physical + * device, overriding the value from the MD. + * + * There may be multiple devices listed, but for the moment + * we just use the first one. 
+ */ + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0, + "vsw_physname", &dev) == DDI_PROP_SUCCESS) { + if ((strlen(dev) + 1) > LIFNAMSIZ) { + cmn_err(CE_WARN, "%s is too long a device name", dev); + } else { + cmn_err(CE_NOTE, "%s: using device name (%s) from " + "config file", __func__, dev); + + (void) strncpy(vswp->physname, dev, strlen(dev) + 1); + vswp->mdprops |= VSW_MD_PHYSNAME; + } + + ddi_prop_free(dev); + + } +#endif + + /* local mac address */ + if (md_get_prop_val(mdp, listp[idx], + macaddr_propname, &macaddr) != 0) { + cmn_err(CE_WARN, "%s: unable to get local MAC address", + __func__); + } else { + READ_ENTER(&vswp->if_lockrw); + for (i = ETHERADDRL - 1; i >= 0; i--) { + vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + RW_EXIT(&vswp->if_lockrw); + vswp->mdprops |= VSW_MD_MACADDR; + } + + /* + * Get the switch-mode property. The modes are listed in + * decreasing order of preference, i.e. prefered mode is + * first item in list. + */ + len = 0; + if (md_get_prop_data(mdp, listp[idx], smode_propname, + (uint8_t **)(&smode), &len) != 0) { + /* + * Unable to get switch-mode property, so just use + * default values which vswp->smode[] array has already + * been pre-populated with, namely layer2. + */ + cmn_err(CE_WARN, "%s: unable to get switch mode property, " + "defaulting to layer 2 mode", __func__); + } else { + i = 0; + curr_mode = smode; + /* + * Modes of operation: + * 'switched' - layer 2 switching, underlying HW in + * non-promiscuous mode. + * 'promiscuous' - layer 2 switching, underlying HW in + * promiscuous mode. + * 'routed' - layer 3 (i.e. IP) routing, underlying HW + * in non-promiscuous mode. 
+ */ + while ((curr_mode < (smode + len)) && (i < NUM_SMODES)) { + D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); + if (strcmp(curr_mode, "switched") == 0) + vswp->smode[i] = VSW_LAYER2; + else if (strcmp(curr_mode, "promiscuous") == 0) + vswp->smode[i] = VSW_LAYER2_PROMISC; + else if (strcmp(curr_mode, "routed") == 0) + vswp->smode[i] = VSW_LAYER3; + else { + DERR(vswp, "%s: unknown mode %s", + __func__, curr_mode); + /* default to layer 2 */ + vswp->smode[i] = VSW_LAYER2; + } + curr_mode += strlen(curr_mode) + 1; + i++; + } + + vswp->mdprops |= VSW_MD_SMODE; + } + +md_prop_exit: + (void) md_fini_handle(mdp); + + kmem_free(listp, listsz); + + D1(vswp, "%s: exit", __func__); +} + +static int +vsw_setup_layer2(vsw_t *vswp) +{ + int rv = 0; + + D1(vswp, "%s: enter", __func__); + + vsw_switch_frame = vsw_switch_l2_frame; + + /* + * Attempt to link into the MAC layer so we can get + * and send packets out over the physical adapter. + */ + if (vswp->mdprops & VSW_MD_PHYSNAME) { + if (vsw_mac_attach(vswp) != 0) { + /* + * Registration with the MAC layer has failed, + * so return 1 so that can fall back to next + * prefered switching method. + */ + cmn_err(CE_WARN, "!unable to join as MAC layer " + "client, continuing with attach"); + rv = 1; + } + } else { + /* No physical device name found in MD */ + DERR(vswp, "%s: no physical device name specified", __func__); + rv = 1; + } + + D1(vswp, "%s: exit", __func__); + + return (rv); +} + +static int +vsw_setup_layer3(vsw_t *vswp) +{ + D1(vswp, "%s: enter", __func__); + + D2(vswp, "%s: operating in layer 3 mode", __func__); + vsw_switch_frame = vsw_switch_l3_frame; + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +/* + * Link into the MAC layer to gain access to the services provided by + * the underlying physical device driver (which should also have + * registered with the MAC layer). + * + * Only when in layer 2 mode. 
+ */ +static int +vsw_mac_attach(vsw_t *vswp) +{ + D1(vswp, "vsw_mac_attach: enter"); + + vswp->mh = NULL; + vswp->mrh = NULL; + vswp->mnh = NULL; + + ASSERT(vswp->mdprops & VSW_MD_PHYSNAME); + + if ((mac_open(vswp->physname, 0, &vswp->mh)) != 0) { + cmn_err(CE_WARN, "mac_open %s failed", vswp->physname); + goto mac_fail_exit; + } + + D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); + + /* register for changes in the interface */ + vswp->mnh = mac_notify_add(vswp->mh, vsw_notify_cb, (void *)vswp); + + /* register our rx callback function */ + vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); + + /* get the MAC tx fn */ + vswp->txinfo = mac_tx_get(vswp->mh); + + /* start the interface */ + if (mac_start(vswp->mh) != 0) { + cmn_err(CE_WARN, "could not start mac interface"); + goto mac_fail_exit; + } + + /* get and store original promisc setting */ + vswp->init_promisc = mac_promisc_get(vswp->mh, MAC_DEVPROMISC); + + /* + * FUTURE: When we have the ability to set multiple unicast + * mac address then we won't have to set the device into + * promisc mode, but for the moment its the only way we. + * can see pkts that logical domains we are serving are + * interested in. 
+ */ + if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) && + (vswp->init_promisc == B_FALSE)) { + DERR(vswp, "vsw_mac_attach: enabling promisc mode.."); + + if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { + DERR(vswp, "vsw_mac_attach: unable to set device" + " into promiscuous mode"); + goto mac_fail_exit; + } + } + + D1(vswp, "vsw_mac_attach: exit"); + return (0); + +mac_fail_exit: + if (vswp->mh != NULL) { + mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC); + if (vswp->mrh != NULL) + mac_rx_remove(vswp->mh, vswp->mrh); + + if (vswp->mnh != NULL) + mac_notify_remove(vswp->mh, vswp->mnh); + + mac_close(vswp->mh); + } + + vswp->mrh = NULL; + vswp->mnh = NULL; + vswp->mh = NULL; + vswp->txinfo = NULL; + + D1(vswp, "vsw_mac_attach: fail exit"); + return (1); +} + +static void +vsw_mac_detach(vsw_t *vswp) +{ + D1(vswp, "vsw_mac_detach: enter"); + + if (vswp->mh != NULL) { + /* restore promisc to original setting */ + mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC); + if (vswp->mrh != NULL) + mac_rx_remove(vswp->mh, vswp->mrh); + + if (vswp->mnh != NULL) + mac_notify_remove(vswp->mh, vswp->mnh); + + mac_close(vswp->mh); + } + + vswp->mrh = NULL; + vswp->mnh = NULL; + vswp->mh = NULL; + vswp->txinfo = NULL; + + D1(vswp, "vsw_mac_detach: exit"); +} + +/* + * Get notified of changes to the interface. + * + * For the moment we brute force the interface back + * into promisc mode if it is unset (e.g. by snoop). + * When we have the ability to set multiple mac addresses, + * we will need to see if this is necessary. 
+ */ +static void +vsw_notify_cb(void *arg, mac_notify_type_t type) +{ + vsw_t *vswp = (vsw_t *)arg; + + switch (type) { + case MAC_NOTE_PROMISC: + vswp->txinfo = mac_tx_get(vswp->mh); + if (mac_promisc_get(vswp->mh, MAC_DEVPROMISC) == B_TRUE) { + D2(vswp, "%s: still in PROMISC mode", __func__); + } else { + D2(vswp, "%s: now in NON-PROMISC mode", __func__); + D2(vswp, "...re-enabling"); + mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC); + } + break; + default: + break; + } +} + +/* + * receive callback routine. Invoked by MAC layer when there + * are pkts being passed up from physical device. + * + * PERF: It may be more efficient when the card is in promisc + * mode to check the dest address of the pkts here (against + * the FDB) rather than checking later. Needs to be investigated. + */ +static void +vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +{ + _NOTE(ARGUNUSED(mrh)) + + vsw_t *vswp = (vsw_t *)arg; + + ASSERT(vswp != NULL); + + D1(vswp, "vsw_rx_cb: enter"); + + /* switch the chain of packets received */ + vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); + + D1(vswp, "vsw_rx_cb: exit"); +} + +/* + * Send a message out over the physical device via the MAC layer. + * + * Returns any mblks that it was unable to transmit. + */ +static mblk_t * +vsw_tx_msg(vsw_t *vswp, mblk_t *mp) +{ + const mac_txinfo_t *mtp; + mblk_t *nextp; + + if (vswp->mh == NULL) { + DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); + return (mp); + } else { + for (;;) { + nextp = mp->b_next; + mp->b_next = NULL; + + mtp = vswp->txinfo; + if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { + mp->b_next = nextp; + break; + } + + if ((mp = nextp) == NULL) + break; + + } + + } + + return (mp); +} + +/* + * Register with the MAC layer as a network device, so we + * can be plumbed if necessary. 
+ */
+static int
+vsw_mac_register(vsw_t *vswp)
+{
+ mac_t *macp = NULL;
+ mac_info_t *mip = NULL;
+ int rv = 0;
+
+ D1(vswp, "%s: enter", __func__);
+
+ macp = kmem_zalloc(sizeof (mac_t), KM_SLEEP);
+
+ /*
+ * Setup the m_info fields.
+ */
+ mip = &(macp->m_info);
+ mip->mi_media = DL_ETHER;
+ mip->mi_sdu_min = 0;
+ mip->mi_sdu_max = ETHERMTU;
+ mip->mi_cksum = 0;
+ mip->mi_poll = DL_CAPAB_POLL;
+
+ mip->mi_addr_length = ETHERADDRL;
+ bcopy(&etherbroadcastaddr, mip->mi_brdcst_addr, ETHERADDRL);
+
+ READ_ENTER(&vswp->if_lockrw);
+ bcopy(&vswp->if_addr, mip->mi_unicst_addr, ETHERADDRL);
+ RW_EXIT(&vswp->if_lockrw);
+
+ MAC_STAT_MIB(mip->mi_stat);
+ MAC_STAT_ETHER(mip->mi_stat);
+
+ /* entry points */
+ macp->m_stat = vsw_m_stat;
+ macp->m_stop = vsw_m_stop;
+ macp->m_start = vsw_m_start;
+ macp->m_unicst = vsw_m_unicst;
+ macp->m_multicst = vsw_m_multicst;
+ macp->m_promisc = vsw_m_promisc;
+ macp->m_tx = vsw_m_tx;
+ macp->m_resources = vsw_m_resources;
+ macp->m_ioctl = vsw_m_ioctl;
+
+ macp->m_port = 0;
+ macp->m_dip = vswp->dip;
+ macp->m_ident = MAC_IDENT;
+ macp->m_driver = vswp;
+
+ vswp->if_macp = macp;
+
+ /* register */
+ rv = mac_register(macp);
+
+ D1(vswp, "%s: exit", __func__);
+
+ return (rv);
+}
+
+static int
+vsw_mac_unregister(vsw_t *vswp)
+{
+ int rv = 0;
+
+ D1(vswp, "%s: enter", __func__);
+
+ WRITE_ENTER(&vswp->if_lockrw);
+
+ if (vswp->if_macp != NULL) {
+ rv = mac_unregister(vswp->if_macp);
+ if (rv != 0) {
+ DWARN(vswp, "%s: unable to unregister from MAC "
+ "framework", __func__);
+
+ RW_EXIT(&vswp->if_lockrw);
+ D1(vswp, "%s: fail exit", __func__);
+ return (rv);
+ }
+
+ /* mark i/f as down and promisc off */
+ vswp->if_state &= ~VSW_IF_UP;
+
+ kmem_free(vswp->if_macp, sizeof (mac_t));
+ vswp->if_macp = NULL;
+ }
+ RW_EXIT(&vswp->if_lockrw);
+
+ D1(vswp, "%s: exit", __func__);
+
+ return (rv);
+}
+
+static uint64_t
+vsw_m_stat(void *arg, enum mac_stat stat)
+{
+ vsw_t *vswp = (vsw_t *)arg;
+ const mac_info_t *mip;
+
+ D1(vswp, "%s: enter", 
__func__); + + if (vswp->mh != NULL) + mip = mac_info(vswp->mh); + else + return (0); + + if (!mip->mi_stat[stat]) + return (0); + + /* return stats from underlying device */ + return (mac_stat_get(vswp->mh, stat)); + +} + +static void +vsw_m_stop(void *arg) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + WRITE_ENTER(&vswp->if_lockrw); + vswp->if_state &= ~VSW_IF_UP; + RW_EXIT(&vswp->if_lockrw); + + D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); +} + +static int +vsw_m_start(void *arg) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + WRITE_ENTER(&vswp->if_lockrw); + vswp->if_state |= VSW_IF_UP; + RW_EXIT(&vswp->if_lockrw); + + D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); + return (0); +} + +/* + * Change the local interface address. + */ +static int +vsw_m_unicst(void *arg, const uint8_t *macaddr) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + WRITE_ENTER(&vswp->if_lockrw); + ether_copy(macaddr, &vswp->if_addr); + RW_EXIT(&vswp->if_lockrw); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +static int +vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) +{ + vsw_t *vswp = (vsw_t *)arg; + mcst_addr_t *mcst_p = NULL; + uint64_t addr = 0x0; + int i; + + D1(vswp, "%s: enter", __func__); + + /* + * Convert address into form that can be used + * as hash table key. + */ + for (i = 0; i < ETHERADDRL; i++) { + addr = (addr << 8) | mca[i]; + } + + D2(vswp, "%s: addr = 0x%llx", __func__, addr); + + if (add) { + D2(vswp, "%s: adding multicast", __func__); + if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { + /* + * Update the list of multicast addresses + * contained within the vsw_t structure to + * include this new one. 
+ */ + mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); + if (mcst_p == NULL) { + DERR(vswp, "%s unable to alloc mem", __func__); + return (1); + } + mcst_p->addr = addr; + + mutex_enter(&vswp->mca_lock); + mcst_p->nextp = vswp->mcap; + vswp->mcap = mcst_p; + mutex_exit(&vswp->mca_lock); + + /* + * Call into the underlying driver to program the + * address into HW. + * + * Note: + * Can safely ignore the return value as the card + * will for the moment always be in promisc mode. + * When we can program multiple MAC addresses into the + * HW then we will need to care about the return + * value here. + */ + if (vswp->mh != NULL) + (void) mac_multicst_add(vswp->mh, mca); + } + } else { + D2(vswp, "%s: removing multicast", __func__); + /* + * Remove the address from the hash table.. + */ + if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { + + /* + * ..and then from the list maintained in the + * vsw_t structure. + */ + vsw_del_addr(VSW_LOCALDEV, vswp, addr); + + if (vswp->mh != NULL) + (void) mac_multicst_remove(vswp->mh, mca); + } + } + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +static int +vsw_m_promisc(void *arg, boolean_t on) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + WRITE_ENTER(&vswp->if_lockrw); + if (on) + vswp->if_state |= VSW_IF_PROMISC; + else + vswp->if_state &= ~VSW_IF_PROMISC; + RW_EXIT(&vswp->if_lockrw); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +static mblk_t * +vsw_m_tx(void *arg, mblk_t *mp) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); + + D1(vswp, "%s: exit", __func__); + + return (NULL); +} + +static void +vsw_m_resources(void *arg) +{ + vsw_t *vswp = (vsw_t *)arg; + mac_rx_fifo_t mrf; + + D1(vswp, "%s: enter", __func__); + + mrf.mrf_type = MAC_RX_FIFO; + mrf.mrf_blank = NULL; + mrf.mrf_arg = (void *)vswp; + mrf.mrf_normal_blank_time = 0; + mrf.mrf_normal_pkt_count = 0; + + 
WRITE_ENTER(&vswp->if_lockrw); + vswp->if_mrh = mac_resource_add(vswp->if_macp, (mac_resource_t *)&mrf); + RW_EXIT(&vswp->if_lockrw); + + D1(vswp, "%s: exit", __func__); +} + +static void +vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + vsw_t *vswp = (vsw_t *)arg; + + D1(vswp, "%s: enter", __func__); + + miocnak(q, mp, 0, ENOTSUP); + + D1(vswp, "%s: exit", __func__); +} + +/* + * Register for machine description (MD) updates. + */ +static void +vsw_mdeg_register(vsw_t *vswp) +{ + mdeg_prop_spec_t *pspecp; + mdeg_node_spec_t *inst_specp; + mdeg_handle_t mdeg_hdl; + size_t templatesz; + int inst, rv; + + D1(vswp, "%s: enter", __func__); + + inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, + DDI_PROP_DONTPASS, reg_propname, -1); + if (inst == -1) { + DERR(vswp, "%s: unable to get %s property", + __func__, reg_propname); + return; + } + + D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); + + /* + * Allocate and initialize a per-instance copy + * of the global property spec array that will + * uniquely identify this vsw instance. 
+ */ + templatesz = sizeof (vsw_prop_template); + pspecp = kmem_zalloc(templatesz, KM_SLEEP); + + bcopy(vsw_prop_template, pspecp, templatesz); + + VSW_SET_MDEG_PROP_INST(pspecp, inst); + + /* initialize the complete prop spec structure */ + inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); + inst_specp->namep = "virtual-device"; + inst_specp->specp = pspecp; + + /* perform the registration */ + rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb, + (void *)vswp, &mdeg_hdl); + + if (rv != MDEG_SUCCESS) { + DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); + kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); + kmem_free(pspecp, templatesz); + return; + } + + /* save off data that will be needed later */ + vswp->inst_spec = inst_specp; + vswp->mdeg_hdl = mdeg_hdl; + + D1(vswp, "%s: exit", __func__); +} + +static void +vsw_mdeg_unregister(vsw_t *vswp) +{ + D1(vswp, "vsw_mdeg_unregister: enter"); + + (void) mdeg_unregister(vswp->mdeg_hdl); + + if (vswp->inst_spec->specp != NULL) { + (void) kmem_free(vswp->inst_spec->specp, + sizeof (vsw_prop_template)); + vswp->inst_spec->specp = NULL; + } + + if (vswp->inst_spec != NULL) { + (void) kmem_free(vswp->inst_spec, + sizeof (mdeg_node_spec_t)); + vswp->inst_spec = NULL; + } + + D1(vswp, "vsw_mdeg_unregister: exit"); +} + +static int +vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) +{ + vsw_t *vswp; + int idx; + md_t *mdp; + mde_cookie_t node; + uint64_t inst; + + if (resp == NULL) + return (MDEG_FAILURE); + + vswp = (vsw_t *)cb_argp; + + D1(vswp, "%s: added %d : removed %d : matched %d", + __func__, resp->added.nelem, resp->removed.nelem, + resp->match_prev.nelem); + + /* process added ports */ + for (idx = 0; idx < resp->added.nelem; idx++) { + mdp = resp->added.mdp; + node = resp->added.mdep[idx]; + + D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); + + if (vsw_port_add(vswp, mdp, &node) != 0) { + cmn_err(CE_WARN, "Unable to add new port (0x%lx)", + node); + } + } + + /* process 
removed ports */ + for (idx = 0; idx < resp->removed.nelem; idx++) { + mdp = resp->removed.mdp; + node = resp->removed.mdep[idx]; + + if (md_get_prop_val(mdp, node, id_propname, &inst)) { + DERR(vswp, "%s: prop(%s) not found port(%d)", + __func__, id_propname, idx); + continue; + } + + D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); + + if (vsw_port_detach(vswp, inst) != 0) { + cmn_err(CE_WARN, "Unable to remove port %ld", inst); + } + } + + /* + * Currently no support for updating already active ports. + * So, ignore the match_curr and match_priv arrays for now. + */ + + D1(vswp, "%s: exit", __func__); + + return (MDEG_SUCCESS); +} + +/* + * Add a new port to the system. + * + * Returns 0 on success, 1 on failure. + */ +int +vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) +{ + uint64_t ldc_id; + uint8_t *addrp; + int i, addrsz; + int num_nodes = 0, nchan = 0; + int listsz = 0; + mde_cookie_t *listp = NULL; + struct ether_addr ea; + uint64_t macaddr; + uint64_t inst = 0; + vsw_port_t *port; + + if (md_get_prop_val(mdp, *node, id_propname, &inst)) { + DWARN(vswp, "%s: prop(%s) not found", __func__, + id_propname); + return (1); + } + + /* + * Find the channel endpoint node(s) (which should be under this + * port node) which contain the channel id(s). 
+ */ + if ((num_nodes = md_node_count(mdp)) <= 0) { + DERR(vswp, "%s: invalid number of nodes found (%d)", + __func__, num_nodes); + return (1); + } + + /* allocate enough space for node list */ + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_SLEEP); + + nchan = md_scan_dag(mdp, *node, + md_find_name(mdp, chan_propname), + md_find_name(mdp, "fwd"), listp); + + if (nchan <= 0) { + DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); + kmem_free(listp, listsz); + return (1); + } + + D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); + + /* use property from first node found */ + if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { + DWARN(vswp, "%s: prop(%s) not found\n", __func__, + id_propname); + kmem_free(listp, listsz); + return (1); + } + + /* don't need list any more */ + kmem_free(listp, listsz); + + D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); + + /* read mac-address property */ + if (md_get_prop_data(mdp, *node, remaddr_propname, + &addrp, &addrsz)) { + DWARN(vswp, "%s: prop(%s) not found", + __func__, remaddr_propname); + return (1); + } + + if (addrsz < ETHERADDRL) { + DWARN(vswp, "%s: invalid address size", __func__); + return (1); + } + + macaddr = *((uint64_t *)addrp); + D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); + + for (i = ETHERADDRL - 1; i >= 0; i--) { + ea.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + + if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { + DERR(vswp, "%s: failed to attach port", __func__); + return (1); + } + + port = vsw_lookup_port(vswp, (int)inst); + + /* just successfuly created the port, so it should exist */ + ASSERT(port != NULL); + + return (0); +} + +/* + * Attach the specified port. + * + * Returns 0 on success, 1 on failure. 
+ */ +static int +vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, +struct ether_addr *macaddr) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port, **prev_port; + int i; + + D1(vswp, "%s: enter : port %d", __func__, p_instance); + + /* port already exists? */ + READ_ENTER(&plist->lockrw); + for (port = plist->head; port != NULL; port = port->p_next) { + if (port->p_instance == p_instance) { + DWARN(vswp, "%s: port instance %d already attached", + __func__, p_instance); + RW_EXIT(&plist->lockrw); + return (1); + } + } + RW_EXIT(&plist->lockrw); + + port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); + port->p_vswp = vswp; + port->p_instance = p_instance; + port->p_ldclist.num_ldcs = 0; + port->p_ldclist.head = NULL; + + rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); + + mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); + + mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); + port->state = VSW_PORT_INIT; + + if (nids > VSW_PORT_MAX_LDCS) { + D2(vswp, "%s: using first of %d ldc ids", + __func__, nids); + nids = VSW_PORT_MAX_LDCS; + } + + D2(vswp, "%s: %d nids", __func__, nids); + for (i = 0; i < nids; i++) { + D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); + if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { + DERR(vswp, "%s: ldc_attach failed", __func__); + + rw_destroy(&port->p_ldclist.lockrw); + + cv_destroy(&port->ref_cv); + mutex_destroy(&port->ref_lock); + + cv_destroy(&port->state_cv); + mutex_destroy(&port->state_lock); + + mutex_destroy(&port->tx_lock); + mutex_destroy(&port->mca_lock); + kmem_free(port, sizeof (vsw_port_t)); + return (1); + } + } + + ether_copy(macaddr, &port->p_macaddr); + + WRITE_ENTER(&plist->lockrw); + + /* create the fdb entry for this port/mac address 
*/ + (void) vsw_add_fdb(vswp, port); + + /* link it into the list of ports for this vsw instance */ + prev_port = (vsw_port_t **)(&plist->head); + port->p_next = *prev_port; + *prev_port = port; + plist->num_ports++; + RW_EXIT(&plist->lockrw); + + /* + * Initialise the port and any ldc's under it. + */ + (void) vsw_init_ldcs(port); + + D1(vswp, "%s: exit", __func__); + return (0); +} + +/* + * Detach the specified port. + * + * Returns 0 on success, 1 on failure. + */ +static int +vsw_port_detach(vsw_t *vswp, int p_instance) +{ + vsw_port_t *port = NULL; + vsw_port_list_t *plist = &vswp->plist; + + D1(vswp, "%s: enter: port id %d", __func__, p_instance); + + WRITE_ENTER(&plist->lockrw); + + if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { + RW_EXIT(&plist->lockrw); + return (1); + } + + if (vsw_plist_del_node(vswp, port)) { + RW_EXIT(&plist->lockrw); + return (1); + } + + /* Remove the fdb entry for this port/mac address */ + (void) vsw_del_fdb(vswp, port); + + /* Remove any multicast addresses.. */ + vsw_del_mcst_port(port); + + /* + * No longer need to hold lock on port list now that we + * have unlinked the target port from the list. + */ + RW_EXIT(&plist->lockrw); + + if (vsw_port_delete(port)) { + return (1); + } + + D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); + return (0); +} + +/* + * Detach all active ports. + * + * Returns 0 on success, 1 on failure. + */ +static int +vsw_detach_ports(vsw_t *vswp) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port = NULL; + + D1(vswp, "%s: enter", __func__); + + WRITE_ENTER(&plist->lockrw); + + while ((port = plist->head) != NULL) { + if (vsw_plist_del_node(vswp, port)) { + DERR(vswp, "%s: Error deleting port %d" + " from port list", __func__, + port->p_instance); + RW_EXIT(&plist->lockrw); + return (1); + } + + /* Remove the fdb entry for this port/mac address */ + (void) vsw_del_fdb(vswp, port); + + /* Remove any multicast addresses.. 
*/
+ vsw_del_mcst_port(port);
+
+ /*
+ * No longer need to hold the lock on the port list
+ * now that we have unlinked the target port from the
+ * list.
+ */
+ RW_EXIT(&plist->lockrw);
+ if (vsw_port_delete(port)) {
+ DERR(vswp, "%s: Error deleting port %d",
+ __func__, port->p_instance);
+ return (1);
+ }
+ WRITE_ENTER(&plist->lockrw);
+ }
+ RW_EXIT(&plist->lockrw);
+
+ D1(vswp, "%s: exit", __func__);
+
+ return (0);
+}
+
+/*
+ * Delete the specified port.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_port_delete(vsw_port_t *port)
+{
+ vsw_ldc_list_t *ldcl;
+ vsw_t *vswp = port->p_vswp;
+
+ D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
+
+ (void) vsw_uninit_ldcs(port);
+
+ /*
+ * Wait for any pending ctrl msg tasks which reference this
+ * port to finish.
+ */
+ if (vsw_drain_port_taskq(port))
+ return (1);
+
+ /*
+ * Wait for port reference count to hit zero.
+ */
+ mutex_enter(&port->ref_lock);
+ while (port->ref_cnt != 0)
+ cv_wait(&port->ref_cv, &port->ref_lock);
+ mutex_exit(&port->ref_lock);
+
+ /*
+ * Wait for any active callbacks to finish
+ */
+ if (vsw_drain_ldcs(port))
+ return (1);
+
+ ldcl = &port->p_ldclist;
+ WRITE_ENTER(&ldcl->lockrw);
+ while (ldcl->num_ldcs > 0) {
+ if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
+ cmn_err(CE_WARN, "unable to detach ldc %ld",
+ ldcl->head->ldc_id);
+ RW_EXIT(&ldcl->lockrw);
+ return (1);
+ }
+ }
+ RW_EXIT(&ldcl->lockrw);
+
+ rw_destroy(&port->p_ldclist.lockrw);
+
+ mutex_destroy(&port->mca_lock);
+ mutex_destroy(&port->tx_lock);
+ cv_destroy(&port->ref_cv);
+ mutex_destroy(&port->ref_lock);
+
+ cv_destroy(&port->state_cv);
+ mutex_destroy(&port->state_lock);
+
+ kmem_free(port, sizeof (vsw_port_t));
+
+ D1(vswp, "%s: exit", __func__);
+
+ return (0);
+}
+
+/*
+ * Attach a logical domain channel (ldc) under a specified port.
+ *
+ * Returns 0 on success, 1 on failure.
+ */ +static int +vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) +{ + vsw_t *vswp = port->p_vswp; + vsw_ldc_list_t *ldcl = &port->p_ldclist; + vsw_ldc_t *ldcp = NULL; + ldc_attr_t attr; + ldc_status_t istatus; + int status = DDI_FAILURE; + + D1(vswp, "%s: enter", __func__); + + ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); + if (ldcp == NULL) { + DERR(vswp, "%s: kmem_zalloc failed", __func__); + return (1); + } + ldcp->ldc_id = ldc_id; + + mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); + + /* required for handshake with peer */ + ldcp->local_session = (uint64_t)ddi_get_lbolt(); + ldcp->peer_session = 0; + ldcp->session_status = 0; + + mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); + ldcp->hss_id = 1; /* Initial handshake session id */ + + /* only set for outbound lane, inbound set by peer */ + vsw_set_lane_attr(vswp, &ldcp->lane_out); + + attr.devclass = LDC_DEV_NT_SVC; + attr.instance = ddi_get_instance(vswp->dip); + attr.mode = LDC_MODE_UNRELIABLE; + attr.qlen = VSW_LDC_QLEN; + status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); + if (status != 0) { + DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", + __func__, ldc_id, status); + mutex_destroy(&ldcp->ldc_txlock); + mutex_destroy(&ldcp->ldc_cblock); + cv_destroy(&ldcp->drain_cv); + mutex_destroy(&ldcp->drain_cv_lock); + mutex_destroy(&ldcp->hss_lock); + kmem_free(ldcp, sizeof (vsw_ldc_t)); + return (1); + } + + status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); + if (status != 0) { + DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", + __func__, ldc_id, status); + mutex_destroy(&ldcp->ldc_txlock); + mutex_destroy(&ldcp->ldc_cblock); + cv_destroy(&ldcp->drain_cv); + mutex_destroy(&ldcp->drain_cv_lock); + mutex_destroy(&ldcp->hss_lock); + (void) ldc_fini(ldcp->ldc_handle); + kmem_free(ldcp, 
sizeof (vsw_ldc_t)); + return (1); + } + + + if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { + DERR(vswp, "%s: ldc_status failed", __func__); + return (1); + } + + ldcp->ldc_status = istatus; + ldcp->ldc_port = port; + ldcp->ldc_vswp = vswp; + + /* link it into the list of channels for this port */ + WRITE_ENTER(&ldcl->lockrw); + ldcp->ldc_next = ldcl->head; + ldcl->head = ldcp; + ldcl->num_ldcs++; + RW_EXIT(&ldcl->lockrw); + + D1(vswp, "%s: exit", __func__); + return (0); +} + +/* + * Detach a logical domain channel (ldc) belonging to a + * particular port. + * + * Returns 0 on success, 1 on failure. + */ +static int +vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) +{ + vsw_t *vswp = port->p_vswp; + vsw_ldc_t *ldcp, *prev_ldcp; + vsw_ldc_list_t *ldcl = &port->p_ldclist; + int rv; + + prev_ldcp = ldcl->head; + for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) { + if (ldcp->ldc_id == ldc_id) { + break; + } + } + + /* specified ldc id not found */ + if (ldcp == NULL) { + DERR(vswp, "%s: ldcp = NULL", __func__); + return (1); + } + + D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); + + /* + * Before we can close the channel we must release any mapped + * resources (e.g. drings). + */ + vsw_free_lane_resources(ldcp, INBOUND); + vsw_free_lane_resources(ldcp, OUTBOUND); + + /* + * If the close fails we are in serious trouble, as won't + * be able to delete the parent port. 
+ */ + if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { + DERR(vswp, "%s: error %d closing channel %lld", + __func__, rv, ldcp->ldc_id); + return (1); + } + + (void) ldc_fini(ldcp->ldc_handle); + + ldcp->ldc_status = LDC_INIT; + ldcp->ldc_handle = NULL; + ldcp->ldc_vswp = NULL; + mutex_destroy(&ldcp->ldc_txlock); + mutex_destroy(&ldcp->ldc_cblock); + cv_destroy(&ldcp->drain_cv); + mutex_destroy(&ldcp->drain_cv_lock); + mutex_destroy(&ldcp->hss_lock); + + /* unlink it from the list */ + prev_ldcp = ldcp->ldc_next; + ldcl->num_ldcs--; + kmem_free(ldcp, sizeof (vsw_ldc_t)); + + return (0); +} + +/* + * Open and attempt to bring up the channel. Note that channel + * can only be brought up if peer has also opened channel. + * + * Returns 0 if can open and bring up channel, otherwise + * returns 1. + */ +static int +vsw_ldc_init(vsw_ldc_t *ldcp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + ldc_status_t istatus = 0; + int rv; + + D1(vswp, "%s: enter", __func__); + + LDC_ENTER_LOCK(ldcp); + + /* don't start at 0 in case clients don't like that */ + ldcp->next_ident = 1; + + rv = ldc_open(ldcp->ldc_handle); + if (rv != 0) { + DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", + __func__, ldcp->ldc_id, rv); + LDC_EXIT_LOCK(ldcp); + return (1); + } + + if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { + DERR(vswp, "%s: unable to get status", __func__); + LDC_EXIT_LOCK(ldcp); + return (1); + + } else if (istatus != LDC_OPEN && istatus != LDC_READY) { + DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", + __func__, ldcp->ldc_id, istatus); + LDC_EXIT_LOCK(ldcp); + return (1); + } + + ldcp->ldc_status = istatus; + rv = ldc_up(ldcp->ldc_handle); + if (rv != 0) { + /* + * Not a fatal error for ldc_up() to fail, as peer + * end point may simply not be ready yet. 
+ */ + D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, + ldcp->ldc_id, rv); + LDC_EXIT_LOCK(ldcp); + return (1); + } + + /* + * ldc_up() call is non-blocking so need to explicitly + * check channel status to see if in fact the channel + * is UP. + */ + if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { + DERR(vswp, "%s: unable to get status", __func__); + LDC_EXIT_LOCK(ldcp); + return (1); + + } else if (istatus != LDC_UP) { + DERR(vswp, "%s: id(%lld) status(%d) is not UP", + __func__, ldcp->ldc_id, istatus); + } else { + ldcp->ldc_status = istatus; + } + + LDC_EXIT_LOCK(ldcp); + + D1(vswp, "%s: exit", __func__); + return (0); +} + +/* disable callbacks on the channel */ +static int +vsw_ldc_uninit(vsw_ldc_t *ldcp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + int rv; + + D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); + + LDC_ENTER_LOCK(ldcp); + + rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); + if (rv != 0) { + DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " + "interrupts (rv = %d)\n", ldcp->ldc_id, rv); + LDC_EXIT_LOCK(ldcp); + return (1); + } + + ldcp->ldc_status = LDC_INIT; + + LDC_EXIT_LOCK(ldcp); + + D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); + + return (0); +} + +static int +vsw_init_ldcs(vsw_port_t *port) +{ + vsw_ldc_list_t *ldcl = &port->p_ldclist; + vsw_ldc_t *ldcp; + + READ_ENTER(&ldcl->lockrw); + ldcp = ldcl->head; + for (; ldcp != NULL; ldcp = ldcp->ldc_next) { + (void) vsw_ldc_init(ldcp); + } + RW_EXIT(&ldcl->lockrw); + + return (0); +} + +static int +vsw_uninit_ldcs(vsw_port_t *port) +{ + vsw_ldc_list_t *ldcl = &port->p_ldclist; + vsw_ldc_t *ldcp; + + D1(NULL, "vsw_uninit_ldcs: enter\n"); + + READ_ENTER(&ldcl->lockrw); + ldcp = ldcl->head; + for (; ldcp != NULL; ldcp = ldcp->ldc_next) { + (void) vsw_ldc_uninit(ldcp); + } + RW_EXIT(&ldcl->lockrw); + + D1(NULL, "vsw_uninit_ldcs: exit\n"); + + return (0); +} + +/* + * Wait until the callback(s) associated with the ldcs under the specified + * port have completed. 
+ * + * Prior to this function being invoked each channel under this port + * should have been quiesced via ldc_set_cb_mode(DISABLE). + * + * A short explanation of what we are doing below: + * + * The simplest approach would be to have a reference counter in + * the ldc structure which is incremented/decremented by the callbacks as + * they use the channel. The drain function could then simply disable any + * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately + * there is a tiny window here - before the callback is able to get the lock + * on the channel it is interrupted and this function gets to execute. It + * sees that the ref count is zero and believes it is free to delete the + * associated data structures. + * + * We get around this by taking advantage of the fact that before the ldc + * framework invokes a callback it sets a flag to indicate that there is a + * callback active (or about to become active). If we attempt to + * unregister a callback while this active flag is set then the unregister + * will fail with EWOULDBLOCK. + * + * If the unregister fails we do a cv_timedwait. We will either be signaled + * by the callback as it is exiting (note we have to wait a short period to + * allow the callback to return fully to the ldc framework and it to clear + * the active flag), or by the timer expiring. In either case we again attempt + * the unregister. We repeat this until we can successfully unregister the + * callback. + * + * The reason we use a cv_timedwait rather than a simple cv_wait is to catch + * the case where the callback has finished but the ldc framework has not yet + * cleared the active flag. In this case we would never get a cv_signal. 
+ */ +static int +vsw_drain_ldcs(vsw_port_t *port) +{ + vsw_ldc_list_t *ldcl = &port->p_ldclist; + vsw_ldc_t *ldcp; + vsw_t *vswp = port->p_vswp; + + D1(vswp, "%s: enter", __func__); + + READ_ENTER(&ldcl->lockrw); + + ldcp = ldcl->head; + + for (; ldcp != NULL; ldcp = ldcp->ldc_next) { + /* + * If we can unregister the channel callback then we + * know that there is no callback either running or + * scheduled to run for this channel so move on to next + * channel in the list. + */ + mutex_enter(&ldcp->drain_cv_lock); + + /* prompt active callbacks to quit */ + ldcp->drain_state = VSW_LDC_DRAINING; + + if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { + D2(vswp, "%s: unreg callback for chan %ld", __func__, + ldcp->ldc_id); + mutex_exit(&ldcp->drain_cv_lock); + continue; + } else { + /* + * If we end up here we know that either 1) a callback + * is currently executing, 2) is about to start (i.e. + * the ldc framework has set the active flag but + * has not actually invoked the callback yet, or 3) + * has finished and has returned to the ldc framework + * but the ldc framework has not yet cleared the + * active bit. + * + * Wait for it to finish. + */ + while (ldc_unreg_callback(ldcp->ldc_handle) + == EWOULDBLOCK) + (void) cv_timedwait(&ldcp->drain_cv, + &ldcp->drain_cv_lock, lbolt + hz); + + mutex_exit(&ldcp->drain_cv_lock); + D2(vswp, "%s: unreg callback for chan %ld after " + "timeout", __func__, ldcp->ldc_id); + } + } + RW_EXIT(&ldcl->lockrw); + + D1(vswp, "%s: exit", __func__); + return (0); +} + +/* + * Wait until all tasks which reference this port have completed. + * + * Prior to this function being invoked each channel under this port + * should have been quiesced via ldc_set_cb_mode(DISABLE). 
+ */ +static int +vsw_drain_port_taskq(vsw_port_t *port) +{ + vsw_t *vswp = port->p_vswp; + + D1(vswp, "%s: enter", __func__); + + /* + * Mark the port as in the process of being detached, and + * dispatch a marker task to the queue so we know when all + * relevant tasks have completed. + */ + mutex_enter(&port->state_lock); + port->state = VSW_PORT_DETACHING; + + if ((vswp->taskq_p == NULL) || + (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, + port, DDI_NOSLEEP) != DDI_SUCCESS)) { + DERR(vswp, "%s: unable to dispatch marker task", + __func__); + mutex_exit(&port->state_lock); + return (1); + } + + /* + * Wait for the marker task to finish. + */ + while (port->state != VSW_PORT_DETACHABLE) + cv_wait(&port->state_cv, &port->state_lock); + + mutex_exit(&port->state_lock); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +static void +vsw_marker_task(void *arg) +{ + vsw_port_t *port = arg; + vsw_t *vswp = port->p_vswp; + + D1(vswp, "%s: enter", __func__); + + mutex_enter(&port->state_lock); + + /* + * No further tasks should be dispatched which reference + * this port so ok to mark it as safe to detach. + */ + port->state = VSW_PORT_DETACHABLE; + + cv_signal(&port->state_cv); + + mutex_exit(&port->state_lock); + + D1(vswp, "%s: exit", __func__); +} + +static vsw_port_t * +vsw_lookup_port(vsw_t *vswp, int p_instance) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port; + + for (port = plist->head; port != NULL; port = port->p_next) { + if (port->p_instance == p_instance) { + D2(vswp, "vsw_lookup_port: found p_instance\n"); + return (port); + } + } + + return (NULL); +} + +/* + * Search for and remove the specified port from the port + * list. Returns 0 if able to locate and remove port, otherwise + * returns 1. 
+ */ +static int +vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *curr_p, *prev_p; + + if (plist->head == NULL) + return (1); + + curr_p = prev_p = plist->head; + + while (curr_p != NULL) { + if (curr_p == port) { + if (prev_p == curr_p) { + plist->head = curr_p->p_next; + } else { + prev_p->p_next = curr_p->p_next; + } + plist->num_ports--; + break; + } else { + prev_p = curr_p; + curr_p = curr_p->p_next; + } + } + return (0); +} + +/* + * Interrupt handler for ldc messages. + */ +static uint_t +vsw_ldc_cb(uint64_t event, caddr_t arg) +{ + vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; + vsw_t *vswp = ldcp->ldc_vswp; + ldc_status_t lstatus; + int rv; + + D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); + + mutex_enter(&ldcp->ldc_cblock); + + if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { + mutex_exit(&ldcp->ldc_cblock); + return (LDC_SUCCESS); + } + + if (event & LDC_EVT_UP) { + /* + * Channel has come up, get the state and then start + * the handshake. + */ + rv = ldc_status(ldcp->ldc_handle, &lstatus); + if (rv != 0) { + cmn_err(CE_WARN, "Unable to read channel state"); + } + ldcp->ldc_status = lstatus; + + D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + + vsw_restart_handshake(ldcp); + + ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); + } + + if (event & LDC_EVT_READ) { + /* + * Data available for reading. 
+ */ + D2(vswp, "%s: id(ld) event(%llx) data READ", + __func__, ldcp->ldc_id, event); + + vsw_process_pkt(ldcp); + + ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); + + goto vsw_cb_exit; + } + + if (event & LDC_EVT_RESET) { + rv = ldc_status(ldcp->ldc_handle, &lstatus); + if (rv != 0) { + cmn_err(CE_WARN, "Unable to read channel state"); + } else { + ldcp->ldc_status = lstatus; + } + D2(vswp, "%s: id(%ld) event(%llx) RESET: status (%ld)", + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + } + + if (event & LDC_EVT_DOWN) { + rv = ldc_status(ldcp->ldc_handle, &lstatus); + if (rv != 0) { + cmn_err(CE_WARN, "Unable to read channel state"); + } else { + ldcp->ldc_status = lstatus; + } + + D2(vswp, "%s: id(%ld) event(%llx) DOWN: status (%ld)", + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + + } + + /* + * Catch either LDC_EVT_WRITE which we don't support or any + * unknown event. + */ + if (event & ~(LDC_EVT_UP | LDC_EVT_RESET + | LDC_EVT_DOWN | LDC_EVT_READ)) { + + DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", + __func__, ldcp->ldc_id, event, ldcp->ldc_status); + } + +vsw_cb_exit: + mutex_exit(&ldcp->ldc_cblock); + + /* + * Let the drain function know we are finishing if it + * is waiting. + */ + mutex_enter(&ldcp->drain_cv_lock); + if (ldcp->drain_state == VSW_LDC_DRAINING) + cv_signal(&ldcp->drain_cv); + mutex_exit(&ldcp->drain_cv_lock); + + return (LDC_SUCCESS); +} + +/* + * (Re)start a handshake with our peer by sending them + * our version info. 
+ */ +static void +vsw_restart_handshake(vsw_ldc_t *ldcp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + vsw_port_t *port; + vsw_ldc_list_t *ldcl; + + D1(vswp, "vsw_restart_handshake: enter"); + + port = ldcp->ldc_port; + ldcl = &port->p_ldclist; + + WRITE_ENTER(&ldcl->lockrw); + + D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, + ldcp->lane_in.lstate, ldcp->lane_out.lstate); + + vsw_free_lane_resources(ldcp, INBOUND); + vsw_free_lane_resources(ldcp, OUTBOUND); + RW_EXIT(&ldcl->lockrw); + + ldcp->lane_in.lstate = 0; + ldcp->lane_out.lstate = 0; + + /* + * Remove parent port from any multicast groups + * it may have registered with. Client must resend + * multicast add command after handshake completes. + */ + (void) vsw_del_fdb(vswp, port); + + vsw_del_mcst_port(port); + + ldcp->hphase = VSW_MILESTONE0; + + ldcp->peer_session = 0; + ldcp->session_status = 0; + + /* + * We now increment the transaction group id. This allows + * us to identify and disard any tasks which are still pending + * on the taskq and refer to the handshake session we are about + * to restart. These stale messages no longer have any real + * meaning. + */ + mutex_enter(&ldcp->hss_lock); + ldcp->hss_id++; + mutex_exit(&ldcp->hss_lock); + + if (ldcp->hcnt++ > vsw_num_handshakes) { + cmn_err(CE_WARN, "exceeded number of permitted " + "handshake attempts (%d) on channel %ld", + ldcp->hcnt, ldcp->ldc_id); + return; + } + + vsw_send_ver(ldcp); + + D1(vswp, "vsw_restart_handshake: exit"); +} + +/* + * returns 0 if legal for event signified by flag to have + * occured at the time it did. Otherwise returns 1. 
+ */ +int +vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) +{ + vsw_t *vswp = ldcp->ldc_vswp; + uint64_t state; + uint64_t phase; + + if (dir == INBOUND) + state = ldcp->lane_in.lstate; + else + state = ldcp->lane_out.lstate; + + phase = ldcp->hphase; + + switch (flag) { + case VSW_VER_INFO_RECV: + if (phase > VSW_MILESTONE0) { + DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" + " when in state %d\n", ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } + break; + + case VSW_VER_ACK_RECV: + case VSW_VER_NACK_RECV: + if (!(state & VSW_VER_INFO_SENT)) { + DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" + " or VER_NACK when in state %d\n", + ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } else + state &= ~VSW_VER_INFO_SENT; + break; + + case VSW_ATTR_INFO_RECV: + if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { + DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" + " when in state %d\n", ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } + break; + + case VSW_ATTR_ACK_RECV: + case VSW_ATTR_NACK_RECV: + if (!(state & VSW_ATTR_INFO_SENT)) { + DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" + " or ATTR_NACK when in state %d\n", + ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } else + state &= ~VSW_ATTR_INFO_SENT; + break; + + case VSW_DRING_INFO_RECV: + if (phase < VSW_MILESTONE1) { + DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" + " when in state %d\n", ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } + break; + + case VSW_DRING_ACK_RECV: + case VSW_DRING_NACK_RECV: + if (!(state & VSW_DRING_INFO_SENT)) { + DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" + " or DRING_NACK when in state %d\n", + ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } else + state &= ~VSW_DRING_INFO_SENT; + break; + + case VSW_RDX_INFO_RECV: + if (phase < VSW_MILESTONE3) { + DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" + " when in 
state %d\n", ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } + break; + + case VSW_RDX_ACK_RECV: + case VSW_RDX_NACK_RECV: + if (!(state & VSW_RDX_INFO_SENT)) { + DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" + " or RDX_NACK when in state %d\n", + ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } else + state &= ~VSW_RDX_INFO_SENT; + break; + + case VSW_MCST_INFO_RECV: + if (phase < VSW_MILESTONE3) { + DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" + " when in state %d\n", ldcp->ldc_id, phase); + vsw_restart_handshake(ldcp); + return (1); + } + break; + + default: + DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", + ldcp->ldc_id, flag); + return (1); + } + + if (dir == INBOUND) + ldcp->lane_in.lstate = state; + else + ldcp->lane_out.lstate = state; + + D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); + + return (0); +} + +void +vsw_next_milestone(vsw_ldc_t *ldcp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + + D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, + ldcp->ldc_id, ldcp->hphase); + + DUMP_FLAGS(ldcp->lane_in.lstate); + DUMP_FLAGS(ldcp->lane_out.lstate); + + switch (ldcp->hphase) { + + case VSW_MILESTONE0: + /* + * If we haven't started to handshake with our peer, + * start to do so now. + */ + if (ldcp->lane_out.lstate == 0) { + D2(vswp, "%s: (chan %lld) starting handshake " + "with peer", __func__, ldcp->ldc_id); + vsw_restart_handshake(ldcp); + } + + /* + * Only way to pass this milestone is to have successfully + * negotiated version info. + */ + if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && + (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { + + D2(vswp, "%s: (chan %lld) leaving milestone 0", + __func__, ldcp->ldc_id); + + /* + * Next milestone is passed when attribute + * information has been successfully exchanged. 
+ */ + ldcp->hphase = VSW_MILESTONE1; + vsw_send_attr(ldcp); + + } + break; + + case VSW_MILESTONE1: + /* + * Only way to pass this milestone is to have successfully + * negotiated attribute information. + */ + if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { + + ldcp->hphase = VSW_MILESTONE2; + + /* + * If the peer device has said it wishes to + * use descriptor rings then we send it our ring + * info, otherwise we just set up a private ring + * which we use an internal buffer + */ + if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) + vsw_send_dring_info(ldcp); + } + break; + + + case VSW_MILESTONE2: + /* + * If peer has indicated in its attribute message that + * it wishes to use descriptor rings then the only way + * to pass this milestone is for us to have received + * valid dring info. + * + * If peer is not using descriptor rings then just fall + * through. + */ + if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && + (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) + break; + + D2(vswp, "%s: (chan %lld) leaving milestone 2", + __func__, ldcp->ldc_id); + + ldcp->hphase = VSW_MILESTONE3; + vsw_send_rdx(ldcp); + break; + + case VSW_MILESTONE3: + /* + * Pass this milestone when all paramaters have been + * successfully exchanged and RDX sent in both directions. + * + * Mark outbound lane as available to transmit data. 
+ */ + if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) && + (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) { + + D2(vswp, "%s: (chan %lld) leaving milestone 3", + __func__, ldcp->ldc_id); + D2(vswp, "%s: ** handshake complete **", __func__); + ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; + ldcp->hphase = VSW_MILESTONE4; + ldcp->hcnt = 0; + DISPLAY_STATE(); + } + break; + + case VSW_MILESTONE4: + D2(vswp, "%s: (chan %lld) in milestone 4", __func__, + ldcp->ldc_id); + break; + + default: + DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, + ldcp->ldc_id, ldcp->hphase); + } + + D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, + ldcp->hphase); +} + +/* + * Check if major version is supported. + * + * Returns 0 if finds supported major number, and if necessary + * adjusts the minor field. + * + * Returns 1 if can't match major number exactly. Sets mjor/minor + * to next lowest support values, or to zero if no other values possible. + */ +static int +vsw_supported_version(vio_ver_msg_t *vp) +{ + int i; + + D1(NULL, "vsw_supported_version: enter"); + + for (i = 0; i < VSW_NUM_VER; i++) { + if (vsw_versions[i].ver_major == vp->ver_major) { + /* + * Matching or lower major version found. Update + * minor number if necessary. 
+ */ + if (vp->ver_minor > vsw_versions[i].ver_minor) { + D2(NULL, "%s: adjusting minor value" + " from %d to %d", __func__, + vp->ver_minor, + vsw_versions[i].ver_minor); + vp->ver_minor = vsw_versions[i].ver_minor; + } + + return (0); + } + + if (vsw_versions[i].ver_major < vp->ver_major) { + if (vp->ver_minor > vsw_versions[i].ver_minor) { + D2(NULL, "%s: adjusting minor value" + " from %d to %d", __func__, + vp->ver_minor, + vsw_versions[i].ver_minor); + vp->ver_minor = vsw_versions[i].ver_minor; + } + return (1); + } + } + + /* No match was possible, zero out fields */ + vp->ver_major = 0; + vp->ver_minor = 0; + + D1(NULL, "vsw_supported_version: exit"); + + return (1); +} + +/* + * Main routine for processing messages received over LDC. + */ +static void +vsw_process_pkt(void *arg) +{ + vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; + vsw_t *vswp = ldcp->ldc_vswp; + size_t msglen; + vio_msg_tag_t tag; + def_msg_t dmsg; + int rv = 0; + + D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); + + /* + * If channel is up read messages until channel is empty. + */ + do { + msglen = sizeof (dmsg); + rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); + + if (rv != 0) { + DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) " + "len(%d)\n", __func__, ldcp->ldc_id, + rv, msglen); + break; + } + + if (msglen == 0) { + D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, + ldcp->ldc_id); + break; + } + + D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, + ldcp->ldc_id, msglen); + + /* + * Figure out what sort of packet we have gotten by + * examining the msg tag, and then switch it appropriately. 
+ */ + bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); + + switch (tag.vio_msgtype) { + case VIO_TYPE_CTRL: + vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); + break; + case VIO_TYPE_DATA: + vsw_process_data_pkt(ldcp, &dmsg, tag); + break; + case VIO_TYPE_ERR: + vsw_process_err_pkt(ldcp, &dmsg, tag); + break; + default: + DERR(vswp, "%s: Unknown tag(%lx) ", __func__, + "id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id); + break; + } + } while (msglen); + + D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); +} + +/* + * Dispatch a task to process a VIO control message. + */ +static void +vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) +{ + vsw_ctrl_task_t *ctaskp = NULL; + vsw_port_t *port = ldcp->ldc_port; + vsw_t *vswp = port->p_vswp; + + D1(vswp, "%s: enter", __func__); + + /* + * We need to handle RDX ACK messages in-band as once they + * are exchanged it is possible that we will get an + * immediate (legitimate) data packet. + */ + if ((tag.vio_subtype_env == VIO_RDX) && + (tag.vio_subtype == VIO_SUBTYPE_ACK)) { + if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV)) + return; + + ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV; + vsw_next_milestone(ldcp); + D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__, + ldcp->ldc_id); + return; + } + + ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); + + if (ctaskp == NULL) { + DERR(vswp, "%s: unable to alloc space for ctrl" + " msg", __func__); + vsw_restart_handshake(ldcp); + return; + } + + ctaskp->ldcp = ldcp; + bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); + mutex_enter(&ldcp->hss_lock); + ctaskp->hss_id = ldcp->hss_id; + mutex_exit(&ldcp->hss_lock); + + /* + * Dispatch task to processing taskq if port is not in + * the process of being detached. 
+ */ + mutex_enter(&port->state_lock); + if (port->state == VSW_PORT_INIT) { + if ((vswp->taskq_p == NULL) || + (ddi_taskq_dispatch(vswp->taskq_p, + vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) + != DDI_SUCCESS)) { + DERR(vswp, "%s: unable to dispatch task to taskq", + __func__); + kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); + mutex_exit(&port->state_lock); + vsw_restart_handshake(ldcp); + return; + } + } else { + DWARN(vswp, "%s: port %d detaching, not dispatching " + "task", __func__, port->p_instance); + } + + mutex_exit(&port->state_lock); + + D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, + ldcp->ldc_id); + D1(vswp, "%s: exit", __func__); +} + +/* + * Process a VIO ctrl message. Invoked from taskq. + */ +static void +vsw_process_ctrl_pkt(void *arg) +{ + vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; + vsw_ldc_t *ldcp = ctaskp->ldcp; + vsw_t *vswp = ldcp->ldc_vswp; + vio_msg_tag_t tag; + uint16_t env; + + D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); + + bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); + env = tag.vio_subtype_env; + + /* stale pkt check */ + mutex_enter(&ldcp->hss_lock); + if (ctaskp->hss_id < ldcp->hss_id) { + DWARN(vswp, "%s: discarding stale packet belonging to" + " earlier (%ld) handshake session", __func__, + ctaskp->hss_id); + mutex_exit(&ldcp->hss_lock); + return; + } + mutex_exit(&ldcp->hss_lock); + + /* session id check */ + if (ldcp->session_status & VSW_PEER_SESSION) { + if (ldcp->peer_session != tag.vio_sid) { + DERR(vswp, "%s (chan %d): invalid session id (%llx)", + __func__, ldcp->ldc_id, tag.vio_sid); + kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); + vsw_restart_handshake(ldcp); + return; + } + } + + /* + * Switch on vio_subtype envelope, then let lower routines + * decide if its an INFO, ACK or NACK packet. 
+ */ + switch (env) { + case VIO_VER_INFO: + vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); + break; + case VIO_DRING_REG: + vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); + break; + case VIO_DRING_UNREG: + vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); + break; + case VIO_ATTR_INFO: + vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); + break; + case VNET_MCAST_INFO: + vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); + break; + case VIO_RDX: + vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); + break; + default: + DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", + __func__, env); + } + + kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); + D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); +} + +/* + * Version negotiation. We can end up here either because our peer + * has responded to a handshake message we have sent it, or our peer + * has initiated a handshake with us. If its the former then can only + * be ACK or NACK, if its the later can only be INFO. + * + * If its an ACK we move to the next stage of the handshake, namely + * attribute exchange. If its a NACK we see if we can specify another + * version, if we can't we stop. + * + * If it is an INFO we reset all params associated with communication + * in that direction over this channel (remember connection is + * essentially 2 independent simplex channels). + */ +void +vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) +{ + vio_ver_msg_t *ver_pkt; + vsw_t *vswp = ldcp->ldc_vswp; + + D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); + + /* + * We know this is a ctrl/version packet so + * cast it into the correct structure. + */ + ver_pkt = (vio_ver_msg_t *)pkt; + + switch (ver_pkt->tag.vio_subtype) { + case VIO_SUBTYPE_INFO: + D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); + + /* + * Record the session id, which we will use from now + * until we see another VER_INFO msg. Even then the + * session id in most cases will be unchanged, execpt + * if channel was reset. 
+ */ + if ((ldcp->session_status & VSW_PEER_SESSION) && + (ldcp->peer_session != ver_pkt->tag.vio_sid)) { + DERR(vswp, "%s: updating session id for chan %lld " + "from %llx to %llx", __func__, ldcp->ldc_id, + ldcp->peer_session, ver_pkt->tag.vio_sid); + } + + ldcp->peer_session = ver_pkt->tag.vio_sid; + ldcp->session_status |= VSW_PEER_SESSION; + + /* Legal message at this time ? */ + if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) + return; + + /* + * First check the device class. Currently only expect + * to be talking to a network device. In the future may + * also talk to another switch. + */ + if (ver_pkt->dev_class != VDEV_NETWORK) { + DERR(vswp, "%s: illegal device class %d", __func__, + ver_pkt->dev_class); + + ver_pkt->tag.vio_sid = ldcp->local_session; + ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; + + DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); + + vsw_send_msg(ldcp, (void *)ver_pkt, + sizeof (vio_ver_msg_t)); + + ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; + vsw_next_milestone(ldcp); + return; + } else { + ldcp->dev_class = ver_pkt->dev_class; + } + + /* + * Now check the version. + */ + if (vsw_supported_version(ver_pkt) == 0) { + /* + * Support this major version and possibly + * adjusted minor version. + */ + + D2(vswp, "%s: accepted ver %d:%d", __func__, + ver_pkt->ver_major, ver_pkt->ver_minor); + + /* Store accepted values */ + ldcp->lane_in.ver_major = ver_pkt->ver_major; + ldcp->lane_in.ver_minor = ver_pkt->ver_minor; + + ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; + + ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; + } else { + /* + * NACK back with the next lower major/minor + * pairing we support (if don't suuport any more + * versions then they will be set to zero. 
+ */ + + D2(vswp, "%s: replying with ver %d:%d", __func__, + ver_pkt->ver_major, ver_pkt->ver_minor); + + /* Store updated values */ + ldcp->lane_in.ver_major = ver_pkt->ver_major; + ldcp->lane_in.ver_minor = ver_pkt->ver_minor; + + ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; + + ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; + } + + DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); + ver_pkt->tag.vio_sid = ldcp->local_session; + vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t)); + + vsw_next_milestone(ldcp); + break; + + case VIO_SUBTYPE_ACK: + D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); + + if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) + return; + + /* Store updated values */ + ldcp->lane_in.ver_major = ver_pkt->ver_major; + ldcp->lane_in.ver_minor = ver_pkt->ver_minor; + + + ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; + vsw_next_milestone(ldcp); + + break; + + case VIO_SUBTYPE_NACK: + D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); + + if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) + return; + + /* + * If our peer sent us a NACK with the ver fields set to + * zero then there is nothing more we can do. Otherwise see + * if we support either the version suggested, or a lesser + * one. + */ + if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { + DERR(vswp, "%s: peer unable to negotiate any " + "further.", __func__); + ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; + vsw_next_milestone(ldcp); + return; + } + + /* + * Check to see if we support this major version or + * a lower one. If we don't then maj/min will be set + * to zero. 
+ */ + (void) vsw_supported_version(ver_pkt); + if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { + /* Nothing more we can do */ + DERR(vswp, "%s: version negotiation failed.\n", + __func__); + ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; + vsw_next_milestone(ldcp); + } else { + /* found a supported major version */ + ldcp->lane_out.ver_major = ver_pkt->ver_major; + ldcp->lane_out.ver_minor = ver_pkt->ver_minor; + + D2(vswp, "%s: resending with updated values (%x, %x)", + __func__, ver_pkt->ver_major, + ver_pkt->ver_minor); + + ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; + ver_pkt->tag.vio_sid = ldcp->local_session; + ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; + + DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); + + vsw_send_msg(ldcp, (void *)ver_pkt, + sizeof (vio_ver_msg_t)); + + vsw_next_milestone(ldcp); + + } + break; + + default: + DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, + ver_pkt->tag.vio_subtype); + } + + D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); +} + +/* + * Process an attribute packet. We can end up here either because our peer + * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our + * peer has sent us an attribute INFO message + * + * If its an ACK we then move to the next stage of the handshake which + * is to send our descriptor ring info to our peer. If its a NACK then + * there is nothing more we can (currently) do. + * + * If we get a valid/acceptable INFO packet (and we have already negotiated + * a version) we ACK back and set channel state to ATTR_RECV, otherwise we + * NACK back and reset channel state to INACTIV. + * + * FUTURE: in time we will probably negotiate over attributes, but for + * the moment unacceptable attributes are regarded as a fatal error. 
+ * + */ +void +vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) +{ + vnet_attr_msg_t *attr_pkt; + vsw_t *vswp = ldcp->ldc_vswp; + vsw_port_t *port = ldcp->ldc_port; + uint64_t macaddr = 0; + int i; + + D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); + + /* + * We know this is a ctrl/attr packet so + * cast it into the correct structure. + */ + attr_pkt = (vnet_attr_msg_t *)pkt; + + switch (attr_pkt->tag.vio_subtype) { + case VIO_SUBTYPE_INFO: + D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); + + if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) + return; + + /* + * If the attributes are unacceptable then we NACK back. + */ + if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { + + DERR(vswp, "%s (chan %d): invalid attributes", + __func__, ldcp->ldc_id); + + vsw_free_lane_resources(ldcp, INBOUND); + + attr_pkt->tag.vio_sid = ldcp->local_session; + attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; + + DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); + ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; + vsw_send_msg(ldcp, (void *)attr_pkt, + sizeof (vnet_attr_msg_t)); + + vsw_next_milestone(ldcp); + return; + } + + /* + * Otherwise store attributes for this lane and update + * lane state. 
+ */ + ldcp->lane_in.mtu = attr_pkt->mtu; + ldcp->lane_in.addr = attr_pkt->addr; + ldcp->lane_in.addr_type = attr_pkt->addr_type; + ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; + ldcp->lane_in.ack_freq = attr_pkt->ack_freq; + + macaddr = ldcp->lane_in.addr; + for (i = ETHERADDRL - 1; i >= 0; i--) { + port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; + macaddr >>= 8; + } + + /* create the fdb entry for this port/mac address */ + (void) vsw_add_fdb(vswp, port); + + /* setup device specifc xmit routines */ + mutex_enter(&port->tx_lock); + if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { + D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); + port->transmit = vsw_dringsend; + } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { + D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); + vsw_create_privring(ldcp); + port->transmit = vsw_descrsend; + } + mutex_exit(&port->tx_lock); + + attr_pkt->tag.vio_sid = ldcp->local_session; + attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; + + DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); + + ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; + + vsw_send_msg(ldcp, (void *)attr_pkt, + sizeof (vnet_attr_msg_t)); + + vsw_next_milestone(ldcp); + break; + + case VIO_SUBTYPE_ACK: + D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); + + if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) + return; + + ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; + vsw_next_milestone(ldcp); + break; + + case VIO_SUBTYPE_NACK: + D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); + + if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) + return; + + ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; + vsw_next_milestone(ldcp); + break; + + default: + DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, + attr_pkt->tag.vio_subtype); + } + + D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); +} + +/* + * Process a dring info packet. We can end up here either because our peer + * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our + * peer has sent us a dring INFO message. 
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and update the lane state, otherwise we NACK back.
 *
 * FUTURE: nothing to stop client from sending us info on multiple dring's
 * but for the moment we will just use the first one we are given.
 *
 */
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_dring_reg_msg_t	*dring_pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp, *dbp;
	int			dring_found = 0;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_reg_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
			return;

		/*
		 * If the dring params are unacceptable then we NACK back.
		 */
		if (vsw_check_dring_info(dring_pkt)) {

			DERR(vswp, "%s (%lld): invalid dring info",
			    __func__, ldcp->ldc_id);

			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;

			vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t));

			vsw_next_milestone(ldcp);
			return;
		}

		/*
		 * Otherwise, attempt to map in the dring using the
		 * cookie. If that succeeds we send back a unique dring
		 * identifier that the sending side will use in future
		 * to refer to this descriptor ring.
		 */
		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

		dp->num_descriptors = dring_pkt->num_descriptors;
		dp->descriptor_size = dring_pkt->descriptor_size;
		dp->options = dring_pkt->options;
		dp->ncookies = dring_pkt->ncookies;

		/*
		 * Note: should only get one cookie. Enforced in
		 * the ldc layer.
		 */
		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
		    sizeof (ldc_mem_cookie_t));

		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
		    dp->num_descriptors, dp->descriptor_size);
		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
		    dp->options, dp->ncookies);

		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
		    dp->ncookies, dp->num_descriptors,
		    dp->descriptor_size, LDC_SHADOW_MAP,
		    &(dp->handle))) != 0) {

			DERR(vswp, "%s: dring_map failed\n", __func__);

			kmem_free(dp, sizeof (dring_info_t));
			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
			vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t));

			vsw_next_milestone(ldcp);
			return;
		}

		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {

			DERR(vswp, "%s: dring_addr failed\n", __func__);

			kmem_free(dp, sizeof (dring_info_t));
			vsw_free_lane_resources(ldcp, INBOUND);

			dring_pkt->tag.vio_sid = ldcp->local_session;
			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);

			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
			vsw_send_msg(ldcp, (void *)dring_pkt,
			    sizeof (vio_dring_reg_msg_t));

			vsw_next_milestone(ldcp);
			return;
		} else {
			/* store the address of the pub part of ring */
			dp->pub_addr = minfo.vaddr;
		}

		/* no private section as we are importing */
		dp->priv_addr = NULL;

		/*
		 * Using simple mono increasing int for ident at
		 * the moment.
		 */
		dp->ident = ldcp->next_ident;
		ldcp->next_ident++;

		dp->end_idx = 0;
		dp->next = NULL;

		/*
		 * Link it onto the end of the list of drings
		 * for this lane.
		 */
		if (ldcp->lane_in.dringp == NULL) {
			D2(vswp, "%s: adding first INBOUND dring", __func__);
			ldcp->lane_in.dringp = dp;
		} else {
			dbp = ldcp->lane_in.dringp;

			while (dbp->next != NULL)
				dbp = dbp->next;

			dbp->next = dp;
		}

		/* acknowledge it */
		dring_pkt->tag.vio_sid = ldcp->local_session;
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->dring_ident = dp->ident;

		vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_reg_msg_t));

		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
			return;

		/*
		 * Peer is acknowledging our dring info and will have
		 * sent us a dring identifier which we will use to
		 * refer to this ring w.r.t. our peer.
		 */
		dp = ldcp->lane_out.dringp;
		if (dp != NULL) {
			/*
			 * Find the ring this ident should be associated
			 * with.
			 */
			if (vsw_dring_match(dp, dring_pkt)) {
				dring_found = 1;

			/* head didn't match; scan the rest of the list */
			} else while (dp != NULL) {
				if (vsw_dring_match(dp, dring_pkt)) {
					dring_found = 1;
					break;
				}
				dp = dp->next;
			}

			if (dring_found == 0) {
				DERR(NULL, "%s: unrecognised ring cookie",
				    __func__);
				vsw_restart_handshake(ldcp);
				return;
			}

		} else {
			DERR(vswp, "%s: DRING ACK received but no drings "
			    "allocated", __func__);
			vsw_restart_handshake(ldcp);
			return;
		}

		/* store ident */
		dp->ident = dring_pkt->dring_ident;
		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process a request from peer to unregister a dring.
 *
 * For the moment we just restart the handshake if our
 * peer endpoint attempts to unregister a dring.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_unreg_msg_t	*dring_pkt;

	/*
	 * We know this is a ctrl/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_unreg_msg_t *)pkt;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		DWARN(vswp, "%s: restarting handshake..", __func__);
		vsw_restart_handshake(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    dring_pkt->tag.vio_subtype);
		vsw_restart_handshake(ldcp);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * NOTE(review): multi-statement macro not wrapped in do { } while (0);
 * safe at the current call sites (always within braces) but fragile if
 * ever used as the body of an unbraced if/else.
 */
#define	SND_MCST_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t));

/*
 * Process a multicast request from a vnet.
 *
 * Vnet's specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
 *
 * If a vnet is no longer interested in a particular multicast grouping
 * we simply find the correct location in the hash table and then delete
 * the relevant port from the port list.
 *
 * To deal with the case whereby a port is being deleted without first
 * removing itself from the lists in the hash table, we maintain a list
 * of multicast addresses the port has registered an interest in, within
 * the port structure itself. We then simply walk that list of addresses
 * using them as keys into the hash table and remove the port from the
 * appropriate lists.
 */
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vnet_mcast_msg_t	*mcst_pkt;
	vsw_port_t		*port = ldcp->ldc_port;
	vsw_t			*vswp = ldcp->ldc_vswp;
	int			i;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a ctrl/mcast packet so
	 * cast it into the correct structure.
	 */
	mcst_pkt = (vnet_mcast_msg_t *)pkt;

	switch (mcst_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		/*
		 * Check if in correct state to receive a multicast
		 * message (i.e. handshake complete). If not reset
		 * the handshake.
		 */
		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
			return;

		/*
		 * Before attempting to add or remove address check
		 * that they are valid multicast addresses.
		 * If not, then NACK back.
		 */
		for (i = 0; i < mcst_pkt->count; i++) {
			/* multicast bit is the LSB of the first octet */
			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
				DERR(vswp, "%s: invalid multicast address",
				    __func__);
				SND_MCST_NACK(ldcp, mcst_pkt);
				return;
			}
		}

		/*
		 * Now add/remove the addresses. If this fails we
		 * NACK back.
		 */
		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
			SND_MCST_NACK(ldcp, mcst_pkt);
			return;
		}

		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		mcst_pkt->tag.vio_sid = ldcp->local_session;

		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);

		vsw_send_msg(ldcp, (void *)mcst_pkt,
		    sizeof (vnet_mcast_msg_t));
		break;

	case VIO_SUBTYPE_ACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

		/*
		 * We shouldn't ever get a multicast ACK message as
		 * at the moment we never request multicast addresses
		 * to be set on some other device. This may change in
		 * the future if we have cascading switches.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
			return;

		/* Do nothing */
		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We shouldn't get a multicast NACK packet for the
		 * same reasons as we shouldn't get a ACK packet.
		 */
		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
			return;

		/* Do nothing */
		break;

	default:
		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
		    mcst_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Process an RDX control message.
 */
static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
	vio_rdx_msg_t	*rdx_pkt;
	vsw_t		*vswp = ldcp->ldc_vswp;

	/*
	 * We know this is a ctrl/rdx packet so
	 * cast it into the correct structure.
	 */
	rdx_pkt = (vio_rdx_msg_t *)pkt;

	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

	switch (rdx_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
			return;

		rdx_pkt->tag.vio_sid = ldcp->local_session;
		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);

		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;

		vsw_send_msg(ldcp, (void *)rdx_pkt,
		    sizeof (vio_rdx_msg_t));

		vsw_next_milestone(ldcp);
		break;

	case VIO_SUBTYPE_ACK:
		/*
		 * Should be handled in-band by callback handler.
		 */
		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
		vsw_restart_handshake(ldcp);
		break;

	case VIO_SUBTYPE_NACK:
		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
			return;

		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
		vsw_next_milestone(ldcp);
		break;

	default:
		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
		    rdx_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Dispatch a data packet to the handler for its subtype envelope
 * (dring / raw / in-band) after validating the peer session id and
 * checking that the handshake has completed.
 */
static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
{
	uint16_t	env = tag.vio_subtype_env;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/* session id check */
	if (ldcp->session_status & VSW_PEER_SESSION) {
		if (ldcp->peer_session != tag.vio_sid) {
			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
			    __func__, ldcp->ldc_id, tag.vio_sid);
			vsw_restart_handshake(ldcp);
			return;
		}
	}

	/*
	 * It is an error for us to be getting data packets
	 * before the handshake has completed.
	 */
	if (ldcp->hphase != VSW_MILESTONE4) {
		DERR(vswp, "%s: got data packet before handshake complete "
		    "hphase %d (%x: %x)", __func__, ldcp->hphase,
		    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
		DUMP_FLAGS(ldcp->lane_in.lstate);
		DUMP_FLAGS(ldcp->lane_out.lstate);
		vsw_restart_handshake(ldcp);
		return;
	}

	/*
	 * Switch on vio_subtype envelope, then let lower routines
	 * decide if its an INFO, ACK or NACK packet.
	 */
	if (env == VIO_DRING_DATA) {
		vsw_process_data_dring_pkt(ldcp, dpkt);
	} else if (env == VIO_PKT_DATA) {
		vsw_process_data_raw_pkt(ldcp, dpkt);
	} else if (env == VIO_DESC_DATA) {
		vsw_process_data_ibnd_pkt(ldcp, dpkt);
	} else {
		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
		    __func__, env);
	}

	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * NOTE(review): multi-statement macro not wrapped in do { } while (0);
 * safe at the current call sites (always within braces) but fragile.
 */
#define	SND_DRING_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t));

/*
 * Process a dring data packet (INFO/ACK/NACK).
 */
static void
vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	dring_info_t		*dp = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	size_t			off = 0;
	uint64_t		ncookies = 0;
	uint64_t		chain = 0;
	uint64_t		j, len, num;
	uint32_t		start, end, datalen;
	int			i, last_sync, rv;
	boolean_t		ack_needed = B_FALSE;
	boolean_t		sync_needed = B_TRUE;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;

	/*
	 * Switch on the vio_subtype. If its INFO then we need to
	 * process the data. If its an ACK we need to make sure
	 * it makes sense (i.e did we send an earlier data/info),
	 * and if its a NACK then we maybe attempt a retry.
+ */ + switch (dring_pkt->tag.vio_subtype) { + case VIO_SUBTYPE_INFO: + D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); + + if ((dp = vsw_ident2dring(&ldcp->lane_in, + dring_pkt->dring_ident)) == NULL) { + + DERR(vswp, "%s(%lld): unable to find dring from " + "ident 0x%llx", __func__, ldcp->ldc_id, + dring_pkt->dring_ident); + + SND_DRING_NACK(ldcp, dring_pkt); + return; + } + + start = end = 0; + start = dring_pkt->start_idx; + end = dring_pkt->end_idx; + + D3(vswp, "%s(%lld): start index %ld : end %ld\n", + __func__, ldcp->ldc_id, start, end); + + /* basic sanity check */ + len = dp->num_descriptors; + if (end > len) { + DERR(vswp, "%s(%lld): endpoint %lld outside ring" + " length %lld", __func__, ldcp->ldc_id, + end, len); + + SND_DRING_NACK(ldcp, dring_pkt); + return; + } + + /* sync data */ + if ((rv = ldc_mem_dring_acquire(dp->handle, + start, end)) != 0) { + DERR(vswp, "%s(%lld): unable to acquire dring : err %d", + __func__, ldcp->ldc_id, rv); + return; + } + + pub_addr = (vnet_public_desc_t *)dp->pub_addr; + + j = num = 0; + + /* calculate # descriptors taking into a/c wrap around */ + num = end >= start ? end - start + 1: (len - start + 1) + end; + + last_sync = start; + + for (i = start; j < num; i = (i + 1) % len, j++) { + pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; + + /* + * Data is padded to align on 8 byte boundary, + * datalen is actual data length, i.e. minus that + * padding. + */ + datalen = pub_addr->nbytes; + + /* + * Does peer wish us to ACK when we have finished + * with this descriptor ? + */ + if (pub_addr->hdr.ack) + ack_needed = B_TRUE; + + D2(vswp, "%s(%lld): processing desc %lld at pos" + " 0x%llx : dstate 0x%lx : datalen 0x%lx", + __func__, ldcp->ldc_id, i, pub_addr, + pub_addr->hdr.dstate, datalen); + + /* + * XXXX : Is it a fatal error to be told to + * process a packet when the READY bit is not + * set ? 
+ */ + if (pub_addr->hdr.dstate != VIO_DESC_READY) { + DERR(vswp, "%s(%d): descriptor %lld at pos " + " 0x%llx not READY (0x%lx)", __func__, + ldcp->ldc_id, i, pub_addr, + pub_addr->hdr.dstate); + + SND_DRING_NACK(ldcp, dring_pkt); + (void) ldc_mem_dring_release(dp->handle, + start, end); + return; + } + + /* + * Mark that we are starting to process descriptor. + */ + pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; + + /* + * allocb(9F) returns an aligned data block. We + * need to ensure that we ask ldc for an aligned + * number of bytes also. + */ + nbytes = datalen; + if (nbytes & 0x7) { + off = 8 - (nbytes & 0x7); + nbytes += off; + } + mp = allocb(datalen, BPRI_MED); + if (mp == NULL) { + DERR(vswp, "%s(%lld): allocb failed", + __func__, ldcp->ldc_id); + (void) ldc_mem_dring_release(dp->handle, + start, end); + return; + } + + ncookies = pub_addr->ncookies; + rv = ldc_mem_copy(ldcp->ldc_handle, + (caddr_t)mp->b_rptr, 0, &nbytes, + pub_addr->memcookie, ncookies, + LDC_COPY_IN); + + if (rv != 0) { + DERR(vswp, "%s(%d): unable to copy in " + "data from %d cookies", __func__, + ldcp->ldc_id, ncookies); + freemsg(mp); + (void) ldc_mem_dring_release(dp->handle, + start, end); + return; + } else { + D2(vswp, "%s(%d): copied in %ld bytes" + " using %d cookies", __func__, + ldcp->ldc_id, nbytes, ncookies); + } + + /* point to the actual end of data */ + mp->b_wptr = mp->b_rptr + datalen; + + /* build a chain of received packets */ + if (bp == NULL) { + /* first pkt */ + bp = mp; + bp->b_next = bp->b_prev = NULL; + bpt = bp; + chain = 1; + } else { + mp->b_next = NULL; + mp->b_prev = bpt; + bpt->b_next = mp; + bpt = mp; + chain++; + } + + /* mark we are finished with this descriptor */ + pub_addr->hdr.dstate = VIO_DESC_DONE; + + /* + * Send an ACK back to peer if requested, and sync + * the rings up to this point so the remote side sees + * the descriptor flag in a consistent state. 
			 */
			if (ack_needed) {
				if ((rv = ldc_mem_dring_release(
				    dp->handle, last_sync, i)) != 0) {
					DERR(vswp, "%s(%lld): unable to sync"
					    " from %d to %d", __func__,
					    ldcp->ldc_id, last_sync, i);
				}

				ack_needed = B_FALSE;

				/* only re-sync later if descriptors remain */
				if (i == end)
					sync_needed = B_FALSE;
				else
					sync_needed = B_TRUE;

				last_sync = (i + 1) % len;

				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
				dring_pkt->tag.vio_sid = ldcp->local_session;
				vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t));
			}
		}

		/* sync any descriptors not covered by a per-ACK release */
		if (sync_needed) {
			if ((rv = ldc_mem_dring_release(dp->handle,
			    last_sync, end)) != 0) {
				DERR(vswp, "%s(%lld): unable to sync"
				    " from %d to %d", __func__,
				    ldcp->ldc_id, last_sync, end);
			}
		}

		/* send the chain of packets to be switched */
		D3(vswp, "%s(%lld): switching chain of %d msgs", __func__,
		    ldcp->ldc_id, chain);
		vsw_switch_frame(vswp, bp, VSW_VNETPORT,
		    ldcp->ldc_port, NULL);

		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
		/*
		 * Verify that the relevant descriptors are all
		 * marked as DONE
		 */
		if ((dp = vsw_ident2dring(&ldcp->lane_out,
		    dring_pkt->dring_ident)) == NULL) {
			DERR(vswp, "%s: unknown ident in ACK", __func__);
			return;
		}

		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
		priv_addr = (vsw_private_desc_t *)dp->priv_addr;

		start = end = 0;
		start = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;


		j = num = 0;
		/* calculate # descriptors taking into a/c wrap around */
		num = end >= start ?
		    end - start + 1: (len - start + 1) + end;

		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
		    __func__, ldcp->ldc_id, start, end, num);

		for (i = start; j < num; i = (i + 1) % len, j++) {
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
				DERR(vswp, "%s: descriptor %lld at pos "
				    " 0x%llx not DONE (0x%lx)\n", __func__,
				    i, pub_addr, pub_addr->hdr.dstate);
				return;
			} else {
				/* clear all the fields */
				bzero(priv_addr->datap, priv_addr->datalen);
				priv_addr->datalen = 0;

				pub_addr->hdr.dstate = VIO_DESC_FREE;
				pub_addr->hdr.ack = 0;
				priv_addr->dstate = VIO_DESC_FREE;

				D3(vswp, "clearing descp %d : pub state "
				    "0x%llx : priv state 0x%llx", i,
				    pub_addr->hdr.dstate,
				    priv_addr->dstate);
			}
		}

		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
		    __func__, ldcp->ldc_id);
		/*
		 * Something is badly wrong if we are getting NACK's
		 * for our data pkts. So reset the channel.
		 */
		vsw_restart_handshake(ldcp);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * VIO_PKT_DATA (a.k.a raw data mode )
 *
 * Note - currently not supported. Do nothing.
 */
static void
vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
{
	_NOTE(ARGUNUSED(dpkt))

	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);

	DERR(NULL, "%s (%lld): currently not supported",
	    __func__, ldcp->ldc_id);

	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}

/*
 * NOTE(review): in-band descriptors access their tag via hdr.tag
 * elsewhere in this file; verify the pkt->tag field access here against
 * the type this macro is actually invoked on.
 */
#define	SND_IBND_DESC_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t));

/*
 * Process an in-band descriptor message (most likely from
 * OBP).
+ */ +static void +vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) +{ + vio_ibnd_desc_t *ibnd_desc; + dring_info_t *dp = NULL; + vsw_private_desc_t *priv_addr = NULL; + vsw_t *vswp = ldcp->ldc_vswp; + mblk_t *mp = NULL; + size_t nbytes = 0; + size_t off = 0; + uint64_t idx = 0; + uint32_t datalen = 0; + uint64_t ncookies = 0; + int rv; + + D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); + + ibnd_desc = (vio_ibnd_desc_t *)pkt; + + switch (ibnd_desc->hdr.tag.vio_subtype) { + case VIO_SUBTYPE_INFO: + D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); + + if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) + return; + + /* + * Data is padded to align on a 8 byte boundary, + * nbytes is actual data length, i.e. minus that + * padding. + */ + datalen = ibnd_desc->nbytes; + + D2(vswp, "%s(%lld): processing inband desc : " + ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); + + ncookies = ibnd_desc->ncookies; + + /* + * allocb(9F) returns an aligned data block. We + * need to ensure that we ask ldc for an aligned + * number of bytes also. 
+ */ + nbytes = datalen; + if (nbytes & 0x7) { + off = 8 - (nbytes & 0x7); + nbytes += off; + } + + mp = allocb(datalen, BPRI_MED); + if (mp == NULL) { + DERR(vswp, "%s(%lld): allocb failed", + __func__, ldcp->ldc_id); + return; + } + + rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, + 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, + LDC_COPY_IN); + + if (rv != 0) { + DERR(vswp, "%s(%d): unable to copy in data from " + "%d cookie(s)", __func__, + ldcp->ldc_id, ncookies); + freemsg(mp); + return; + } else { + D2(vswp, "%s(%d): copied in %ld bytes using %d " + "cookies", __func__, ldcp->ldc_id, nbytes, + ncookies); + } + + /* point to the actual end of data */ + mp->b_wptr = mp->b_rptr + datalen; + + /* + * We ACK back every in-band descriptor message we process + */ + ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; + ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; + vsw_send_msg(ldcp, (void *)ibnd_desc, + sizeof (vio_ibnd_desc_t)); + + /* send the packet to be switched */ + vsw_switch_frame(vswp, mp, VSW_VNETPORT, + ldcp->ldc_port, NULL); + + break; + + case VIO_SUBTYPE_ACK: + D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); + + /* Verify the ACK is valid */ + idx = ibnd_desc->hdr.desc_handle; + + if (idx >= VSW_RING_NUM_EL) { + cmn_err(CE_WARN, "%s: corrupted ACK received " + "(idx %ld)", __func__, idx); + return; + } + + if ((dp = ldcp->lane_out.dringp) == NULL) { + DERR(vswp, "%s: no dring found", __func__); + return; + } + + priv_addr = (vsw_private_desc_t *)dp->priv_addr; + + /* move to correct location in ring */ + priv_addr += idx; + + /* + * When we sent the in-band message to our peer we + * marked the copy in our private ring as READY. We now + * check that the descriptor we are being ACK'ed for is in + * fact READY, i.e. it is one we have shared with our peer. 
		 */
		if (priv_addr->dstate != VIO_DESC_READY) {
			cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
			    "READY (0x%lx)", __func__, ldcp->ldc_id, idx,
			    priv_addr->dstate);
			cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n",
			    __func__, priv_addr->bound,
			    priv_addr->ncookies);
			cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen);
			return;
		} else {
			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
			    ldcp->ldc_id, idx);

			/* release resources associated with sent msg */
			bzero(priv_addr->datap, priv_addr->datalen);
			priv_addr->datalen = 0;
			priv_addr->dstate = VIO_DESC_FREE;
		}
		break;

	case VIO_SUBTYPE_NACK:
		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

		/*
		 * We should only get a NACK if our peer doesn't like
		 * something about a message we have sent it. If this
		 * happens we just release the resources associated with
		 * the message. (We are relying on higher layers to decide
		 * whether or not to resend.)
		 */

		/* limit check */
		idx = ibnd_desc->hdr.desc_handle;

		if (idx >= VSW_RING_NUM_EL) {
			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
			    __func__, idx);
			return;
		}

		if ((dp = ldcp->lane_out.dringp) == NULL) {
			DERR(vswp, "%s: no dring found", __func__);
			return;
		}

		priv_addr = (vsw_private_desc_t *)dp->priv_addr;

		/* move to correct location in ring */
		priv_addr += idx;

		/* release resources associated with sent msg */
		bzero(priv_addr->datap, priv_addr->datalen);
		priv_addr->datalen = 0;
		priv_addr->dstate = VIO_DESC_FREE;

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process an error packet.  Currently just logs the envelope.
 */
static void
vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
{
	_NOTE(ARGUNUSED(epkt))

	vsw_t		*vswp = ldcp->ldc_vswp;
	uint16_t	env = tag.vio_subtype_env;

	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);

	/*
	 * Error vio_subtypes have yet to be defined. So for
	 * the moment we can't do anything.
	 */
	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);

	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}

/*
 * Switch the given ethernet frame when operating in layer 2 mode.
 *
 * vswp: pointer to the vsw instance
 * mp: pointer to chain of ethernet frame(s) to be switched
 * caller: identifies the source of this frame as:
 * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
 * 		2. VSW_PHYSDEV - the physical ethernet device
 * 		3. VSW_LOCALDEV - vsw configured as a virtual interface
 * arg: argument provided by the caller.
 * 		1. for VNETPORT - pointer to the corresponding vsw_port_t.
 * 		2. for PHYSDEV - NULL
 * 		3. for LOCALDEV - pointer to this vsw_t(self)
 */
void
vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	vsw_port_t		*port = NULL;
	mblk_t			*bp, *ret_m;
	mblk_t			*nmp = NULL;
	vsw_port_list_t		*plist = &vswp->plist;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * PERF: rather than breaking up the chain here, scan it
	 * to find all mblks heading to same destination and then
	 * pass that sub-chain to the lower transmit functions.
	 */

	/* process the chain of packets */
	bp = mp;
	while (bp) {
		mp = bp;
		bp = bp->b_next;
		mp->b_next = mp->b_prev = NULL;
		ehp = (struct ether_header *)mp->b_rptr;

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		READ_ENTER(&vswp->if_lockrw);
		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
			/*
			 * If destination is VSW_LOCALDEV (vsw as an eth
			 * interface) and if the device is up & running,
			 * send the packet up the stack on this host.
			 * If the virtual interface is down, drop the packet.
			 */
			if (caller != VSW_LOCALDEV) {
				if (vswp->if_state & VSW_IF_UP) {
					RW_EXIT(&vswp->if_lockrw);
					mac_rx(vswp->if_macp, mrh, mp);
				} else {
					RW_EXIT(&vswp->if_lockrw);
					/* Interface down, drop pkt */
					freemsg(mp);
				}
			} else {
				RW_EXIT(&vswp->if_lockrw);
				/* frame from local interface to itself */
				freemsg(mp);
			}
			continue;
		}
		RW_EXIT(&vswp->if_lockrw);

		READ_ENTER(&plist->lockrw);
		port = vsw_lookup_fdb(vswp, ehp);
		if (port) {
			/*
			 * Mark the port as in-use.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt++;
			mutex_exit(&port->ref_lock);
			RW_EXIT(&plist->lockrw);

			/*
			 * If plumbed and in promisc mode then copy msg
			 * and send up the stack.
			 */
			READ_ENTER(&vswp->if_lockrw);
			if (VSW_U_P(vswp->if_state)) {
				RW_EXIT(&vswp->if_lockrw);
				nmp = copymsg(mp);
				if (nmp)
					mac_rx(vswp->if_macp, mrh, nmp);
			} else {
				RW_EXIT(&vswp->if_lockrw);
			}

			/*
			 * If the destination is in FDB, the packet
			 * should be forwarded to the corresponding
			 * vsw_port (connected to a vnet device -
			 * VSW_VNETPORT)
			 */
			(void) vsw_portsend(port, mp);

			/*
			 * Decrement use count in port and check if
			 * should wake delete thread.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt--;
			if (port->ref_cnt == 0)
				cv_signal(&port->ref_cv);
			mutex_exit(&port->ref_lock);
		} else {
			RW_EXIT(&plist->lockrw);
			/*
			 * Destination not in FDB.
			 *
			 * If the destination is broadcast or
			 * multicast forward the packet to all
			 * (VNETPORTs, PHYSDEV, LOCALDEV),
			 * except the caller.
			 */
			if (IS_BROADCAST(ehp)) {
				D3(vswp, "%s: BROADCAST pkt", __func__);
				(void) vsw_forward_all(vswp, mp,
				    caller, arg);
			} else if (IS_MULTICAST(ehp)) {
				D3(vswp, "%s: MULTICAST pkt", __func__);
				(void) vsw_forward_grp(vswp, mp,
				    caller, arg);
			} else {
				/*
				 * If the destination is unicast, and came
				 * from either a logical network device or
				 * the switch itself when it is plumbed, then
				 * send it out on the physical device and also
				 * up the stack if the logical interface is
				 * in promiscious mode.
				 *
				 * NOTE: The assumption here is that if we
				 * cannot find the destination in our fdb, its
				 * a unicast address, and came from either a
				 * vnet or down the stack (when plumbed) it
				 * must be destined for an ethernet device
				 * outside our ldoms.
				 */
				if (caller == VSW_VNETPORT) {
					READ_ENTER(&vswp->if_lockrw);
					if (VSW_U_P(vswp->if_state)) {
						RW_EXIT(&vswp->if_lockrw);
						nmp = copymsg(mp);
						if (nmp)
							mac_rx(vswp->if_macp,
							    mrh, nmp);
					} else {
						RW_EXIT(&vswp->if_lockrw);
					}
					if ((ret_m = vsw_tx_msg(vswp, mp))
					    != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsg(ret_m);
					}

				} else if (caller == VSW_PHYSDEV) {
					/*
					 * Pkt seen because card in promisc
					 * mode. Send up stack if plumbed in
					 * promisc mode, else drop it.
					 */
					READ_ENTER(&vswp->if_lockrw);
					if (VSW_U_P(vswp->if_state)) {
						RW_EXIT(&vswp->if_lockrw);
						mac_rx(vswp->if_macp, mrh, mp);
					} else {
						RW_EXIT(&vswp->if_lockrw);
						freemsg(mp);
					}

				} else if (caller == VSW_LOCALDEV) {
					/*
					 * Pkt came down the stack, send out
					 * over physical device.
					 */
					if ((ret_m = vsw_tx_msg(vswp, mp))
					    != NULL) {
						DERR(vswp, "%s: drop mblks to "
						    "phys dev", __func__);
						freemsg(ret_m);
					}
				}
			}
		}
	}
	D1(vswp, "%s: exit\n", __func__);
}

/*
 * Switch ethernet frame when in layer 3 mode (i.e. using IP
 * layer to do the routing).
 *
 * There is a large amount of overlap between this function and
 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
 * both these functions.
 */
void
vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *arg, mac_resource_handle_t mrh)
{
	struct ether_header	*ehp;
	vsw_port_t		*port = NULL;
	mblk_t			*bp = NULL;
	vsw_port_list_t		*plist = &vswp->plist;

	D1(vswp, "%s: enter (caller %d)", __func__, caller);

	/*
	 * In layer 3 mode should only ever be switching packets
	 * between IP layer and vnet devices. So make sure thats
	 * who is invoking us.
	 */
	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
		freemsgchain(mp);
		return;
	}

	/* process the chain of packets */
	bp = mp;
	while (bp) {
		mp = bp;
		bp = bp->b_next;
		mp->b_next = mp->b_prev = NULL;
		ehp = (struct ether_header *)mp->b_rptr;

		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
		    __func__, MBLKSIZE(mp), MBLKL(mp));

		READ_ENTER(&plist->lockrw);
		port = vsw_lookup_fdb(vswp, ehp);
		if (port) {
			/*
			 * Mark port as in-use.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt++;
			mutex_exit(&port->ref_lock);
			RW_EXIT(&plist->lockrw);

			D2(vswp, "%s: sending to target port", __func__);
			(void) vsw_portsend(port, mp);

			/*
			 * Finished with port so decrement ref count and
			 * check if should wake delete thread.
			 */
			mutex_enter(&port->ref_lock);
			port->ref_cnt--;
			if (port->ref_cnt == 0)
				cv_signal(&port->ref_cv);
			mutex_exit(&port->ref_lock);
		} else {
			RW_EXIT(&plist->lockrw);
			/*
			 * Destination not in FDB
			 *
			 * If the destination is broadcast or
			 * multicast forward the packet to all
			 * (VNETPORTs, PHYSDEV, LOCALDEV),
			 * except the caller.
+ */ + if (IS_BROADCAST(ehp)) { + D2(vswp, "%s: BROADCAST pkt", __func__); + (void) vsw_forward_all(vswp, mp, + caller, arg); + } else if (IS_MULTICAST(ehp)) { + D2(vswp, "%s: MULTICAST pkt", __func__); + (void) vsw_forward_grp(vswp, mp, + caller, arg); + } else { + /* + * Unicast pkt from vnet that we don't have + * an FDB entry for, so must be destinded for + * the outside world. Attempt to send up to the + * IP layer to allow it to deal with it. + */ + if (caller == VSW_VNETPORT) { + READ_ENTER(&vswp->if_lockrw); + if (vswp->if_state & VSW_IF_UP) { + RW_EXIT(&vswp->if_lockrw); + D2(vswp, "%s: sending up", + __func__); + mac_rx(vswp->if_macp, mrh, mp); + } else { + RW_EXIT(&vswp->if_lockrw); + /* Interface down, drop pkt */ + D2(vswp, "%s I/F down", + __func__); + freemsg(mp); + } + } + } + } + } + + D1(vswp, "%s: exit", __func__); +} + +/* + * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), + * except the caller (port on which frame arrived). + */ +static int +vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *portp; + mblk_t *nmp = NULL; + mblk_t *ret_m = NULL; + int skip_port = 0; + + D1(vswp, "vsw_forward_all: enter\n"); + + /* + * Broadcast message from inside ldoms so send to outside + * world if in either of layer 2 modes. + */ + if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || + (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { + + nmp = dupmsg(mp); + if (nmp) { + if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + DERR(vswp, "%s: dropping pkt(s) " + "consisting of %ld bytes of data for" + " physical device", __func__, MBLKL(ret_m)); + freemsg(ret_m); + } + } + } + + if (caller == VSW_VNETPORT) + skip_port = 1; + + /* + * Broadcast message from other vnet (layer 2 or 3) or outside + * world (layer 2 only), send up stack if plumbed. 
+ */ + if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { + READ_ENTER(&vswp->if_lockrw); + if (vswp->if_state & VSW_IF_UP) { + RW_EXIT(&vswp->if_lockrw); + nmp = copymsg(mp); + if (nmp) + mac_rx(vswp->if_macp, vswp->if_mrh, nmp); + } else { + RW_EXIT(&vswp->if_lockrw); + } + } + + /* send it to all VNETPORTs */ + READ_ENTER(&plist->lockrw); + for (portp = plist->head; portp != NULL; portp = portp->p_next) { + D2(vswp, "vsw_forward_all: port %d", portp->p_instance); + /* + * Caution ! - don't reorder these two checks as arg + * will be NULL if the caller is PHYSDEV. skip_port is + * only set if caller is VNETPORT. + */ + if ((skip_port) && (portp == arg)) + continue; + else { + nmp = dupmsg(mp); + if (nmp) { + (void) vsw_portsend(portp, nmp); + } else { + DERR(vswp, "vsw_forward_all: nmp NULL"); + } + } + } + RW_EXIT(&plist->lockrw); + + freemsg(mp); + + D1(vswp, "vsw_forward_all: exit\n"); + return (0); +} + +/* + * Forward pkts to any devices or interfaces which have registered + * an interest in them (i.e. multicast groups). + */ +static int +vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) +{ + struct ether_header *ehp = (struct ether_header *)mp->b_rptr; + mfdb_ent_t *entp = NULL; + mfdb_ent_t *tpp = NULL; + vsw_port_t *port; + uint64_t key = 0; + mblk_t *nmp = NULL; + mblk_t *ret_m = NULL; + boolean_t check_if = B_TRUE; + + /* + * Convert address to hash table key + */ + KEY_HASH(key, ehp->ether_dhost); + + D1(vswp, "%s: key 0x%llx", __func__, key); + + /* + * If pkt came from either a vnet or down the stack (if we are + * plumbed) and we are in layer 2 mode, then we send the pkt out + * over the physical adapter, and then check to see if any other + * vnets are interested in it. 
+ */ + if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || + (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { + nmp = dupmsg(mp); + if (nmp) { + if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + DERR(vswp, "%s: dropping pkt(s) " + "consisting of %ld bytes of " + "data for physical device", + __func__, MBLKL(ret_m)); + freemsg(ret_m); + } + } + } + + READ_ENTER(&vswp->mfdbrw); + if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, + (mod_hash_val_t *)&entp) != 0) { + D3(vswp, "%s: no table entry found for addr 0x%llx", + __func__, key); + } else { + /* + * Send to list of devices associated with this address... + */ + for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { + + /* dont send to ourselves */ + if ((caller == VSW_VNETPORT) && + (tpp->d_addr == (void *)arg)) { + port = (vsw_port_t *)tpp->d_addr; + D3(vswp, "%s: not sending to ourselves" + " : port %d", __func__, + port->p_instance); + continue; + + } else if ((caller == VSW_LOCALDEV) && + (tpp->d_type == VSW_LOCALDEV)) { + D3(vswp, "%s: not sending back up stack", + __func__); + continue; + } + + if (tpp->d_type == VSW_VNETPORT) { + port = (vsw_port_t *)tpp->d_addr; + D3(vswp, "%s: sending to port %ld for " + " addr 0x%llx", __func__, + port->p_instance, key); + + nmp = dupmsg(mp); + if (nmp) + (void) vsw_portsend(port, nmp); + } else { + if (vswp->if_state & VSW_IF_UP) { + nmp = copymsg(mp); + if (nmp) + mac_rx(vswp->if_macp, + vswp->if_mrh, nmp); + check_if = B_FALSE; + D3(vswp, "%s: sending up stack" + " for addr 0x%llx", __func__, + key); + } + } + } + } + + RW_EXIT(&vswp->mfdbrw); + + /* + * If the pkt came from either a vnet or from physical device, + * and if we havent already sent the pkt up the stack then we + * check now if we can/should (i.e. the interface is plumbed + * and in promisc mode). 
+ */ + if ((check_if) && + ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { + READ_ENTER(&vswp->if_lockrw); + if (VSW_U_P(vswp->if_state)) { + RW_EXIT(&vswp->if_lockrw); + D3(vswp, "%s: (caller %d) finally sending up stack" + " for addr 0x%llx", __func__, caller, key); + nmp = copymsg(mp); + if (nmp) + mac_rx(vswp->if_macp, vswp->if_mrh, nmp); + } else { + RW_EXIT(&vswp->if_lockrw); + } + } + + freemsg(mp); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +/* transmit the packet over the given port */ +static int +vsw_portsend(vsw_port_t *port, mblk_t *mp) +{ + vsw_ldc_list_t *ldcl = &port->p_ldclist; + vsw_ldc_t *ldcp; + int status = 0; + + + READ_ENTER(&ldcl->lockrw); + /* + * Note for now, we have a single channel. + */ + ldcp = ldcl->head; + if (ldcp == NULL) { + DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); + freemsg(mp); + RW_EXIT(&ldcl->lockrw); + return (1); + } + + /* + * Send the message out using the appropriate + * transmit function which will free mblock when it + * is finished with it. + */ + mutex_enter(&port->tx_lock); + if (port->transmit != NULL) + status = (*port->transmit)(ldcp, mp); + else { + freemsg(mp); + } + mutex_exit(&port->tx_lock); + + RW_EXIT(&ldcl->lockrw); + + return (status); +} + +/* + * Send packet out via descriptor ring to a logical device. 
+ */ +static int +vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) +{ + vio_dring_msg_t dring_pkt; + dring_info_t *dp = NULL; + vsw_private_desc_t *priv_desc = NULL; + vsw_t *vswp = ldcp->ldc_vswp; + mblk_t *bp; + size_t n, size; + caddr_t bufp; + int idx; + int status = LDC_TX_SUCCESS; + + D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); + + /* TODO: make test a macro */ + if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || + (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { + DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " + "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, + ldcp->lane_out.lstate); + freemsg(mp); + return (LDC_TX_FAILURE); + } + + /* + * Note - using first ring only, this may change + * in the future. + */ + if ((dp = ldcp->lane_out.dringp) == NULL) { + DERR(vswp, "%s(%lld): no dring for outbound lane on" + " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); + freemsg(mp); + return (LDC_TX_FAILURE); + } + + mutex_enter(&dp->dlock); + + size = msgsize(mp); + if (size > (size_t)ETHERMAX) { + DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, + ldcp->ldc_id, size); + status = LDC_TX_FAILURE; + goto vsw_dringsend_free_exit; + } + + /* + * Find a free descriptor + * + * Note: for the moment we are assuming that we will only + * have one dring going from the switch to each of its + * peers. This may change in the future. 
+ */ + if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { + DERR(vswp, "%s(%lld): no descriptor available for ring " + "at 0x%llx", __func__, ldcp->ldc_id, dp); + + /* nothing more we can do */ + status = LDC_TX_NORESOURCES; + goto vsw_dringsend_free_exit; + } else { + D2(vswp, "%s(%lld): free private descriptor found at pos " + "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, + priv_desc); + } + + /* copy data into the descriptor */ + bufp = priv_desc->datap; + for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { + n = MBLKL(bp); + bcopy(bp->b_rptr, bufp, n); + bufp += n; + } + + priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; + priv_desc->dstate = VIO_DESC_READY; + + /* + * Copy relevant sections of private descriptor + * to public section + */ + vsw_dring_priv2pub(priv_desc); + + /* + * Send a vio_dring_msg to peer to prompt them to read + * the updated descriptor ring. + */ + dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; + dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; + dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; + dring_pkt.tag.vio_sid = ldcp->local_session; + + /* Note - for now using first ring */ + dring_pkt.dring_ident = dp->ident; + + /* + * Access to the seq_num is implicitly protected by the + * fact that we have only one dring associated with the + * lane currently and we hold the associated dring lock. 
+ */ + dring_pkt.seq_num = ldcp->lane_out.seq_num++; + + /* Note - only updating single descrip at time at the moment */ + dring_pkt.start_idx = idx; + dring_pkt.end_idx = idx; + + D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, + ldcp->ldc_id, dp, dring_pkt.dring_ident); + D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__, + ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx, + dring_pkt.seq_num); + + vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t)); + +vsw_dringsend_free_exit: + + mutex_exit(&dp->dlock); + + /* free the message block */ + freemsg(mp); + + D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); + return (status); +} + +/* + * Send an in-band descriptor message over ldc. + */ +static int +vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + vio_ibnd_desc_t ibnd_msg; + vsw_private_desc_t *priv_desc = NULL; + dring_info_t *dp = NULL; + size_t n, size = 0; + caddr_t bufp; + mblk_t *bp; + int idx, i; + int status = LDC_TX_SUCCESS; + static int warn_msg = 1; + + D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); + + ASSERT(mp != NULL); + + if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || + (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { + DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", + __func__, ldcp->ldc_id, ldcp->ldc_status, + ldcp->lane_out.lstate); + freemsg(mp); + return (LDC_TX_FAILURE); + } + + /* + * only expect single dring to exist, which we use + * as an internal buffer, rather than a transfer channel. 
+ */ + if ((dp = ldcp->lane_out.dringp) == NULL) { + DERR(vswp, "%s(%lld): no dring for outbound lane", + __func__, ldcp->ldc_id); + DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", + __func__, ldcp->ldc_id, ldcp->ldc_status, + ldcp->lane_out.lstate); + freemsg(mp); + return (LDC_TX_FAILURE); + } + + mutex_enter(&dp->dlock); + + size = msgsize(mp); + if (size > (size_t)ETHERMAX) { + DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, + ldcp->ldc_id, size); + status = LDC_TX_FAILURE; + goto vsw_descrsend_free_exit; + } + + /* + * Find a free descriptor in our buffer ring + */ + if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { + if (warn_msg) { + DERR(vswp, "%s(%lld): no descriptor available for ring " + "at 0x%llx", __func__, ldcp->ldc_id, dp); + warn_msg = 0; + } + + /* nothing more we can do */ + status = LDC_TX_NORESOURCES; + goto vsw_descrsend_free_exit; + } else { + D2(vswp, "%s(%lld): free private descriptor found at pos " + "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, + priv_desc); + warn_msg = 1; + } + + /* copy data into the descriptor */ + bufp = priv_desc->datap; + for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { + n = MBLKL(bp); + bcopy(bp->b_rptr, bufp, n); + bufp += n; + } + + priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; + priv_desc->dstate = VIO_DESC_READY; + + /* create and send the in-band descp msg */ + ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; + ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; + ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; + ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; + + /* + * Access to the seq_num is implicitly protected by the + * fact that we have only one dring associated with the + * lane currently and we hold the associated dring lock. + */ + ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; + + /* + * Copy the mem cookies describing the data from the + * private region of the descriptor ring into the inband + * descriptor. 
+	 */
+	for (i = 0; i < priv_desc->ncookies; i++) {
+		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
+			sizeof (ldc_mem_cookie_t));
+	}
+
+	ibnd_msg.hdr.desc_handle = idx;
+	ibnd_msg.ncookies = priv_desc->ncookies;
+	ibnd_msg.nbytes = size;
+
+	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
+
+vsw_descrsend_free_exit:
+
+	mutex_exit(&dp->dlock);
+
+	/* free the allocated message blocks */
+	freemsg(mp);
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+	return (status);
+}
+
+static void
+vsw_send_ver(vsw_ldc_t *ldcp)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	lane_t		*lp = &ldcp->lane_out;
+	vio_ver_msg_t	ver_msg;
+
+	D1(vswp, "%s enter", __func__);
+
+	ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+	ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
+	ver_msg.tag.vio_sid = ldcp->local_session;
+
+	ver_msg.ver_major = vsw_versions[0].ver_major;
+	ver_msg.ver_minor = vsw_versions[0].ver_minor;
+	ver_msg.dev_class = VDEV_NETWORK_SWITCH;
+
+	lp->lstate |= VSW_VER_INFO_SENT;
+	lp->ver_major = ver_msg.ver_major;
+	lp->ver_minor = ver_msg.ver_minor;
+
+	DUMP_TAG(ver_msg.tag);
+
+	vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t));
+
+	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
+}
+
+static void
+vsw_send_attr(vsw_ldc_t *ldcp)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	lane_t			*lp = &ldcp->lane_out;
+	vnet_attr_msg_t		attr_msg;
+
+	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * Subtype is set to INFO by default
+	 */
+	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
+	attr_msg.tag.vio_sid = ldcp->local_session;
+
+	/* payload copied from default settings for lane */
+	attr_msg.mtu = lp->mtu;
+	attr_msg.addr_type = lp->addr_type;
+	attr_msg.xfer_mode = lp->xfer_mode;
+	attr_msg.ack_freq = lp->ack_freq;
+
+	READ_ENTER(&vswp->if_lockrw);
+	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
+
RW_EXIT(&vswp->if_lockrw); + + ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; + + DUMP_TAG(attr_msg.tag); + + vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); + + D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); +} + +/* + * Create dring info msg (which also results in the creation of + * a dring). + */ +static vio_dring_reg_msg_t * +vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) +{ + vio_dring_reg_msg_t *mp; + dring_info_t *dp; + vsw_t *vswp = ldcp->ldc_vswp; + + D1(vswp, "vsw_create_dring_info_pkt enter\n"); + + /* + * If we can't create a dring, obviously no point sending + * a message. + */ + if ((dp = vsw_create_dring(ldcp)) == NULL) + return (NULL); + + mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); + + mp->tag.vio_msgtype = VIO_TYPE_CTRL; + mp->tag.vio_subtype = VIO_SUBTYPE_INFO; + mp->tag.vio_subtype_env = VIO_DRING_REG; + mp->tag.vio_sid = ldcp->local_session; + + /* payload */ + mp->num_descriptors = dp->num_descriptors; + mp->descriptor_size = dp->descriptor_size; + mp->options = dp->options; + mp->ncookies = dp->ncookies; + bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); + + mp->dring_ident = 0; + + D1(vswp, "vsw_create_dring_info_pkt exit\n"); + + return (mp); +} + +static void +vsw_send_dring_info(vsw_ldc_t *ldcp) +{ + vio_dring_reg_msg_t *dring_msg; + vsw_t *vswp = ldcp->ldc_vswp; + + D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); + + dring_msg = vsw_create_dring_info_pkt(ldcp); + if (dring_msg == NULL) { + cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); + return; + } + + ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; + + DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); + + vsw_send_msg(ldcp, dring_msg, + sizeof (vio_dring_reg_msg_t)); + + kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); + + D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); +} + +static void +vsw_send_rdx(vsw_ldc_t *ldcp) +{ + vsw_t *vswp = ldcp->ldc_vswp; + vio_rdx_msg_t rdx_msg; + + D1(vswp, "%s (%ld) enter", __func__, 
ldcp->ldc_id);
+
+	rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+	rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	rdx_msg.tag.vio_subtype_env = VIO_RDX;
+	rdx_msg.tag.vio_sid = ldcp->local_session;
+
+	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
+
+	DUMP_TAG(rdx_msg.tag);
+
+	vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t));
+
+	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Generic routine to send message out over ldc channel.
+ */
+static void
+vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
+{
+	int		rv;
+	int		retries = vsw_wretries;
+	size_t		msglen = size;
+	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
+	vsw_t		*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
+		ldcp->ldc_id, size);
+
+	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
+	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
+	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
+
+	mutex_enter(&ldcp->ldc_txlock);
+	do {
+		msglen = size;
+		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
+	} while (rv == EWOULDBLOCK && --retries > 0);
+
+	mutex_exit(&ldcp->ldc_txlock);
+
+	if ((rv != 0) || (msglen != size)) {
+		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
+			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
+			rv, size, msglen);
+	}
+
+	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
+		ldcp->ldc_id, msglen);
+}
+
+/*
+ * Add an entry into FDB, for the given mac address and port_id.
+ * Returns 0 on success, 1 on failure.
+ *
+ * Lock protecting FDB must be held by calling process.
+ */
+static int
+vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
+{
+	uint64_t	addr = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	KEY_HASH(addr, port->p_macaddr);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, addr);
+
+	/*
+	 * Note: duplicate keys will be rejected by mod_hash.
+	 */
+	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr,
+		(mod_hash_val_t)port) != 0) {
+		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
+		return (1);
+	}
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Remove an entry from FDB.
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
+{
+	uint64_t	addr = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	KEY_HASH(addr, port->p_macaddr);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, addr);
+
+	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Search fdb for a given mac address.
+ * Returns pointer to the entry if found, else returns NULL.
+ */
+static vsw_port_t *
+vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
+{
+	uint64_t	key = 0;
+	vsw_port_t	*port = NULL;
+
+	D1(vswp, "%s: enter", __func__);
+
+	KEY_HASH(key, ehp->ether_dhost);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, key);
+
+	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key,
+		(mod_hash_val_t *)&port) != 0) {
+		return (NULL);
+	}
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (port);
+}
+
+/*
+ * Add or remove multicast address(es).
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
+{
+	mcst_addr_t	*mcst_p = NULL;
+	vsw_t		*vswp = port->p_vswp;
+	uint64_t	addr = 0x0;
+	int		i;
+
+	D1(vswp, "%s: enter", __func__);
+
+	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
+
+	for (i = 0; i < mcst_pkt->count; i++) {
+		/*
+		 * Convert address into form that can be used
+		 * as hash table key.
+		 */
+		KEY_HASH(addr, mcst_pkt->mca[i]);
+
+		/*
+		 * Add or delete the specified address/port combination.
+ */ + if (mcst_pkt->set == 0x1) { + D3(vswp, "%s: adding multicast address 0x%llx for " + "port %ld", __func__, addr, port->p_instance); + if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { + /* + * Update the list of multicast + * addresses contained within the + * port structure to include this new + * one. + */ + mcst_p = kmem_alloc(sizeof (mcst_addr_t), + KM_NOSLEEP); + if (mcst_p == NULL) { + DERR(vswp, "%s: unable to alloc mem", + __func__); + return (1); + } + + mcst_p->nextp = NULL; + mcst_p->addr = addr; + + mutex_enter(&port->mca_lock); + mcst_p->nextp = port->mcap; + port->mcap = mcst_p; + mutex_exit(&port->mca_lock); + + /* + * Program the address into HW. If the addr + * has already been programmed then the MAC + * just increments a ref counter (which is + * used when the address is being deleted) + * + * Note: + * For the moment we dont care if this + * succeeds because the card must be in + * promics mode. When we have the ability + * to program multiple unicst address into + * the card then we will need to check this + * return value. + */ + if (vswp->mh != NULL) + (void) mac_multicst_add(vswp->mh, + (uchar_t *)&mcst_pkt->mca[i]); + + } else { + DERR(vswp, "%s: error adding multicast " + "address 0x%llx for port %ld", + __func__, addr, port->p_instance); + return (1); + } + } else { + /* + * Delete an entry from the multicast hash + * table and update the address list + * appropriately. + */ + if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { + D3(vswp, "%s: deleting multicast address " + "0x%llx for port %ld", __func__, addr, + port->p_instance); + + vsw_del_addr(VSW_VNETPORT, port, addr); + + /* + * Remove the address from HW. The address + * will actually only be removed once the ref + * count within the MAC layer has dropped to + * zero. I.e. we can safely call this fn even + * if other ports are interested in this + * address. 
+ */ + if (vswp->mh != NULL) + (void) mac_multicst_remove(vswp->mh, + (uchar_t *)&mcst_pkt->mca[i]); + + } else { + DERR(vswp, "%s: error deleting multicast " + "addr 0x%llx for port %ld", + __func__, addr, port->p_instance); + return (1); + } + } + } + D1(vswp, "%s: exit", __func__); + return (0); +} + +/* + * Add a new multicast entry. + * + * Search hash table based on address. If match found then + * update associated val (which is chain of ports), otherwise + * create new key/val (addr/port) pair and insert into table. + */ +static int +vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) +{ + int dup = 0; + int rv = 0; + mfdb_ent_t *ment = NULL; + mfdb_ent_t *tmp_ent = NULL; + mfdb_ent_t *new_ent = NULL; + void *tgt = NULL; + + if (devtype == VSW_VNETPORT) { + /* + * Being invoked from a vnet. + */ + ASSERT(arg != NULL); + tgt = arg; + D2(NULL, "%s: port %d : address 0x%llx", __func__, + ((vsw_port_t *)arg)->p_instance, addr); + } else { + /* + * We are being invoked via the m_multicst mac entry + * point. + */ + D2(NULL, "%s: address 0x%llx", __func__, addr); + tgt = (void *)vswp; + } + + WRITE_ENTER(&vswp->mfdbrw); + if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, + (mod_hash_val_t *)&ment) != 0) { + + /* address not currently in table */ + ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); + ment->d_addr = (void *)tgt; + ment->d_type = devtype; + ment->nextp = NULL; + + if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, + (mod_hash_val_t)ment) != 0) { + DERR(vswp, "%s: hash table insertion failed", __func__); + kmem_free(ment, sizeof (mfdb_ent_t)); + rv = 1; + } else { + D2(vswp, "%s: added initial entry for 0x%llx to " + "table", __func__, addr); + } + } else { + /* + * Address in table. Check to see if specified port + * is already associated with the address. If not add + * it now. 
+ */ + tmp_ent = ment; + while (tmp_ent != NULL) { + if (tmp_ent->d_addr == (void *)tgt) { + if (devtype == VSW_VNETPORT) { + DERR(vswp, "%s: duplicate port entry " + "found for portid %ld and key " + "0x%llx", __func__, + ((vsw_port_t *)arg)->p_instance, + addr); + } else { + DERR(vswp, "%s: duplicate entry found" + "for key 0x%llx", + __func__, addr); + } + rv = 1; + dup = 1; + break; + } + tmp_ent = tmp_ent->nextp; + } + + /* + * Port not on list so add it to end now. + */ + if (0 == dup) { + D2(vswp, "%s: added entry for 0x%llx to table", + __func__, addr); + new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); + new_ent->d_addr = (void *)tgt; + new_ent->d_type = devtype; + new_ent->nextp = NULL; + + tmp_ent = ment; + while (tmp_ent->nextp != NULL) + tmp_ent = tmp_ent->nextp; + + tmp_ent->nextp = new_ent; + } + } + + RW_EXIT(&vswp->mfdbrw); + return (rv); +} + +/* + * Remove a multicast entry from the hashtable. + * + * Search hash table based on address. If match found, scan + * list of ports associated with address. If specified port + * found remove it from list. 
+ */ +static int +vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) +{ + mfdb_ent_t *ment = NULL; + mfdb_ent_t *curr_p, *prev_p; + void *tgt = NULL; + + D1(vswp, "%s: enter", __func__); + + if (devtype == VSW_VNETPORT) { + tgt = (vsw_port_t *)arg; + D2(vswp, "%s: removing port %d from mFDB for address" + " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, + addr); + } else { + D2(vswp, "%s: removing entry", __func__); + tgt = (void *)vswp; + } + + WRITE_ENTER(&vswp->mfdbrw); + if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, + (mod_hash_val_t *)&ment) != 0) { + D2(vswp, "%s: address 0x%llx not in table", __func__, addr); + RW_EXIT(&vswp->mfdbrw); + return (1); + } + + prev_p = curr_p = ment; + + while (curr_p != NULL) { + if (curr_p->d_addr == (void *)tgt) { + if (devtype == VSW_VNETPORT) { + D2(vswp, "%s: port %d found", __func__, + ((vsw_port_t *)tgt)->p_instance); + } else { + D2(vswp, "%s: instance found", __func__); + } + + if (prev_p == curr_p) { + /* + * head of list, if no other element is in + * list then destroy this entry, otherwise + * just replace it with updated value. + */ + ment = curr_p->nextp; + kmem_free(curr_p, sizeof (mfdb_ent_t)); + if (ment == NULL) { + (void) mod_hash_destroy(vswp->mfdb, + (mod_hash_val_t)addr); + } else { + (void) mod_hash_replace(vswp->mfdb, + (mod_hash_key_t)addr, + (mod_hash_val_t)ment); + } + } else { + /* + * Not head of list, no need to do + * replacement, just adjust list pointers. + */ + prev_p->nextp = curr_p->nextp; + kmem_free(curr_p, sizeof (mfdb_ent_t)); + } + break; + } + + prev_p = curr_p; + curr_p = curr_p->nextp; + } + + RW_EXIT(&vswp->mfdbrw); + + D1(vswp, "%s: exit", __func__); + + return (0); +} + +/* + * Port is being deleted, but has registered an interest in one + * or more multicast groups. Using the list of addresses maintained + * within the port structure find the appropriate entry in the hash + * table and remove this port from the list of interested ports. 
+ */ +static void +vsw_del_mcst_port(vsw_port_t *port) +{ + mcst_addr_t *mcst_p = NULL; + vsw_t *vswp = port->p_vswp; + + D1(vswp, "%s: enter", __func__); + + mutex_enter(&port->mca_lock); + while (port->mcap != NULL) { + (void) vsw_del_mcst(vswp, VSW_VNETPORT, + port->mcap->addr, port); + + mcst_p = port->mcap->nextp; + kmem_free(port->mcap, sizeof (mcst_addr_t)); + port->mcap = mcst_p; + } + mutex_exit(&port->mca_lock); + + D1(vswp, "%s: exit", __func__); +} + +/* + * This vsw instance is detaching, but has registered an interest in one + * or more multicast groups. Using the list of addresses maintained + * within the vsw structure find the appropriate entry in the hash + * table and remove this instance from the list of interested ports. + */ +static void +vsw_del_mcst_vsw(vsw_t *vswp) +{ + mcst_addr_t *next_p = NULL; + + D1(vswp, "%s: enter", __func__); + + mutex_enter(&vswp->mca_lock); + + while (vswp->mcap != NULL) { + DERR(vswp, "%s: deleting addr 0x%llx", + __func__, vswp->mcap->addr); + (void) vsw_del_mcst(vswp, VSW_LOCALDEV, + vswp->mcap->addr, NULL); + + next_p = vswp->mcap->nextp; + kmem_free(vswp->mcap, sizeof (mcst_addr_t)); + vswp->mcap = next_p; + } + + vswp->mcap = NULL; + mutex_exit(&vswp->mca_lock); + + D1(vswp, "%s: exit", __func__); +} + + +/* + * Remove the specified address from the list of address maintained + * in this port node. 
+ */ +static void +vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) +{ + vsw_t *vswp = NULL; + vsw_port_t *port = NULL; + mcst_addr_t *prev_p = NULL; + mcst_addr_t *curr_p = NULL; + + D1(NULL, "%s: enter : devtype %d : addr 0x%llx", + __func__, devtype, addr); + + if (devtype == VSW_VNETPORT) { + port = (vsw_port_t *)arg; + mutex_enter(&port->mca_lock); + prev_p = curr_p = port->mcap; + } else { + vswp = (vsw_t *)arg; + mutex_enter(&vswp->mca_lock); + prev_p = curr_p = vswp->mcap; + } + + while (curr_p != NULL) { + if (curr_p->addr == addr) { + D2(NULL, "%s: address found", __func__); + /* match found */ + if (prev_p == curr_p) { + /* list head */ + if (devtype == VSW_VNETPORT) + port->mcap = curr_p->nextp; + else + vswp->mcap = curr_p->nextp; + } else { + prev_p->nextp = curr_p->nextp; + } + kmem_free(curr_p, sizeof (mcst_addr_t)); + break; + } else { + prev_p = curr_p; + curr_p = curr_p->nextp; + } + } + + if (devtype == VSW_VNETPORT) + mutex_exit(&port->mca_lock); + else + mutex_exit(&vswp->mca_lock); + + D1(NULL, "%s: exit", __func__); +} + +/* + * Creates a descriptor ring (dring) and links it into the + * link of outbound drings for this channel. + * + * Returns NULL if creation failed. + */ +static dring_info_t * +vsw_create_dring(vsw_ldc_t *ldcp) +{ + vsw_private_desc_t *priv_addr = NULL; + vsw_t *vswp = ldcp->ldc_vswp; + ldc_mem_info_t minfo; + dring_info_t *dp, *tp; + int i; + + dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); + + mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); + + /* create public section of ring */ + if ((ldc_mem_dring_create(VSW_RING_NUM_EL, + VSW_PUB_SIZE, &dp->handle)) != 0) { + + DERR(vswp, "vsw_create_dring(%lld): ldc dring create " + "failed", ldcp->ldc_id); + goto create_fail_exit; + } + + ASSERT(dp->handle != NULL); + + /* + * Get the base address of the public section of the ring. 
+ */ + if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { + DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", + ldcp->ldc_id); + goto dring_fail_exit; + } else { + ASSERT(minfo.vaddr != 0); + dp->pub_addr = minfo.vaddr; + } + + dp->num_descriptors = VSW_RING_NUM_EL; + dp->descriptor_size = VSW_PUB_SIZE; + dp->options = VIO_TX_DRING; + dp->ncookies = 1; /* guaranteed by ldc */ + + /* + * create private portion of ring + */ + dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( + (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); + + if (vsw_setup_ring(ldcp, dp)) { + DERR(vswp, "%s: unable to setup ring", __func__); + goto dring_fail_exit; + } + + /* haven't used any descriptors yet */ + dp->end_idx = 0; + + /* bind dring to the channel */ + if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, + LDC_SHADOW_MAP, LDC_MEM_RW, + &dp->cookie[0], &dp->ncookies)) != 0) { + DERR(vswp, "vsw_create_dring: unable to bind to channel " + "%lld", ldcp->ldc_id); + goto dring_fail_exit; + } + + /* + * Only ever create rings for outgoing lane. Link it onto + * end of list. + */ + if (ldcp->lane_out.dringp == NULL) { + D2(vswp, "vsw_create_dring: adding first outbound ring"); + ldcp->lane_out.dringp = dp; + } else { + tp = ldcp->lane_out.dringp; + while (tp->next != NULL) + tp = tp->next; + + tp->next = dp; + } + + return (dp); + +dring_fail_exit: + (void) ldc_mem_dring_destroy(dp->handle); + +create_fail_exit: + if (dp->priv_addr != NULL) { + priv_addr = dp->priv_addr; + for (i = 0; i < VSW_RING_NUM_EL; i++) { + if (priv_addr->memhandle != NULL) + (void) ldc_mem_free_handle( + priv_addr->memhandle); + priv_addr++; + } + kmem_free(dp->priv_addr, + (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); + } + mutex_destroy(&dp->dlock); + + kmem_free(dp, sizeof (dring_info_t)); + return (NULL); +} + +/* + * Create a ring consisting of just a private portion and link + * it into the list of rings for the outbound lane. 
+ * + * These type of rings are used primarily for temporary data + * storage (i.e. as data buffers). + */ +void +vsw_create_privring(vsw_ldc_t *ldcp) +{ + dring_info_t *dp, *tp; + vsw_t *vswp = ldcp->ldc_vswp; + + D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); + + dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); + + mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); + + /* no public section */ + dp->pub_addr = NULL; + + dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * + VSW_RING_NUM_EL), KM_SLEEP); + + if (vsw_setup_ring(ldcp, dp)) { + DERR(vswp, "%s: setup of ring failed", __func__); + kmem_free(dp->priv_addr, + (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); + mutex_destroy(&dp->dlock); + kmem_free(dp, sizeof (dring_info_t)); + return; + } + + /* haven't used any descriptors yet */ + dp->end_idx = 0; + + /* + * Only ever create rings for outgoing lane. Link it onto + * end of list. + */ + if (ldcp->lane_out.dringp == NULL) { + D2(vswp, "%s: adding first outbound privring", __func__); + ldcp->lane_out.dringp = dp; + } else { + tp = ldcp->lane_out.dringp; + while (tp->next != NULL) + tp = tp->next; + + tp->next = dp; + } + + D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); +} + +/* + * Setup the descriptors in the dring. Returns 0 on success, 1 on + * failure. + */ +int +vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) +{ + vnet_public_desc_t *pub_addr = NULL; + vsw_private_desc_t *priv_addr = NULL; + vsw_t *vswp = ldcp->ldc_vswp; + uint64_t *tmpp; + uint64_t offset = 0; + uint32_t ncookies = 0; + static char *name = "vsw_setup_ring"; + int i, j, rv; + + /* note - public section may be null */ + priv_addr = dp->priv_addr; + pub_addr = dp->pub_addr; + + /* + * Allocate the region of memory which will be used to hold + * the data the descriptors will refer to. 
+ */ + dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); + dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); + + D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, + dp->data_sz, dp->data_addr); + + tmpp = (uint64_t *)dp->data_addr; + offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); + + /* + * Initialise some of the private and public (if they exist) + * descriptor fields. + */ + for (i = 0; i < VSW_RING_NUM_EL; i++) { + if ((ldc_mem_alloc_handle(ldcp->ldc_handle, + &priv_addr->memhandle)) != 0) { + DERR(vswp, "%s: alloc mem handle failed", name); + goto setup_ring_cleanup; + } + + priv_addr->datap = (void *)tmpp; + + rv = ldc_mem_bind_handle(priv_addr->memhandle, + (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, + LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, + &(priv_addr->memcookie[0]), &ncookies); + if (rv != 0) { + DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " + "(rv %d)", name, ldcp->ldc_id, rv); + goto setup_ring_cleanup; + } + priv_addr->bound = 1; + + D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", + name, i, priv_addr->memcookie[0].addr, + priv_addr->memcookie[0].size); + + if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { + DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " + "invalid num of cookies (%d) for size 0x%llx", + name, ldcp->ldc_id, ncookies, + VSW_RING_EL_DATA_SZ); + + goto setup_ring_cleanup; + } else { + for (j = 1; j < ncookies; j++) { + rv = ldc_mem_nextcookie(priv_addr->memhandle, + &(priv_addr->memcookie[j])); + if (rv != 0) { + DERR(vswp, "%s: ldc_mem_nextcookie " + "failed rv (%d)", name, rv); + goto setup_ring_cleanup; + } + D3(vswp, "%s: memcookie %d : addr 0x%llx : " + "size 0x%llx", name, j, + priv_addr->memcookie[j].addr, + priv_addr->memcookie[j].size); + } + + } + priv_addr->ncookies = ncookies; + priv_addr->dstate = VIO_DESC_FREE; + + if (pub_addr != NULL) { + + /* link pub and private sides */ + priv_addr->descp = pub_addr; + + pub_addr->hdr.dstate = VIO_DESC_FREE; + pub_addr++; + } + + /* + * move to next 
element in the dring and the next + * position in the data buffer. + */ + priv_addr++; + tmpp += offset; + } + + return (0); + +setup_ring_cleanup: + priv_addr = dp->priv_addr; + + for (i = 0; i < VSW_RING_NUM_EL; i++) { + (void) ldc_mem_unbind_handle(priv_addr->memhandle); + (void) ldc_mem_free_handle(priv_addr->memhandle); + + priv_addr++; + } + kmem_free(dp->data_addr, dp->data_sz); + + return (1); +} + +/* + * Searches the private section of a ring for a free descriptor, + * starting at the location of the last free descriptor found + * previously. + * + * Returns 0 if free descriptor is available, 1 otherwise. + * + * FUTURE: might need to return contiguous range of descriptors + * as dring info msg assumes all will be contiguous. + */ +static int +vsw_dring_find_free_desc(dring_info_t *dringp, + vsw_private_desc_t **priv_p, int *idx) +{ + vsw_private_desc_t *addr; + uint64_t i; + uint64_t j = 0; + uint64_t start = dringp->end_idx; + int num = VSW_RING_NUM_EL; + int ret = 1; + + D1(NULL, "%s enter\n", __func__); + + addr = dringp->priv_addr; + + D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", + __func__, dringp, start); + + for (i = start; j < num; i = (i + 1) % num, j++) { + addr = (vsw_private_desc_t *)dringp->priv_addr + i; + D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n", + __func__, i, addr->dstate); + if (addr->dstate == VIO_DESC_FREE) { + D2(NULL, "%s: descriptor %lld is available", + __func__, i); + *priv_p = addr; + *idx = i; + dringp->end_idx = (i + 1) % num; + ret = 0; + break; + } + } + + /* ring full */ + if (ret == 1) { + D2(NULL, "%s: no desp free: started at %d", __func__, start); + } + + D1(NULL, "%s: exit\n", __func__); + + return (ret); +} + +/* + * Copy relevant fields from the private descriptor into the + * associated public side. 
+ */ +static void +vsw_dring_priv2pub(vsw_private_desc_t *priv) +{ + vnet_public_desc_t *pub; + int i; + + D1(NULL, "vsw_dring_priv2pub enter\n"); + + pub = priv->descp; + + pub->ncookies = priv->ncookies; + pub->nbytes = priv->datalen; + + for (i = 0; i < pub->ncookies; i++) { + bcopy(&priv->memcookie[i], &pub->memcookie[i], + sizeof (ldc_mem_cookie_t)); + } + + pub->hdr.ack = 1; + pub->hdr.dstate = VIO_DESC_READY; + + D1(NULL, "vsw_dring_priv2pub exit"); +} + +/* + * Map from a dring identifier to the ring itself. Returns + * pointer to ring or NULL if no match found. + */ +static dring_info_t * +vsw_ident2dring(lane_t *lane, uint64_t ident) +{ + dring_info_t *dp = NULL; + + if ((dp = lane->dringp) == NULL) { + return (NULL); + } else { + if (dp->ident == ident) + return (dp); + + while (dp != NULL) { + if (dp->ident == ident) + break; + dp = dp->next; + } + } + + return (dp); +} + +/* + * Set the default lane attributes. These are copied into + * the attr msg we send to our peer. If they are not acceptable + * then (currently) the handshake ends. + */ +static void +vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) +{ + bzero(lp, sizeof (lane_t)); + + READ_ENTER(&vswp->if_lockrw); + ether_copy(&(vswp->if_addr), &(lp->addr)); + RW_EXIT(&vswp->if_lockrw); + + lp->mtu = VSW_MTU; + lp->addr_type = ADDR_TYPE_MAC; + lp->xfer_mode = VIO_DRING_MODE; + lp->ack_freq = 0; /* for shared mode */ + lp->seq_num = VNET_ISS; +} + +/* + * Verify that the attributes are acceptable. + * + * FUTURE: If some attributes are not acceptable, change them + * our desired values. 
+ */ +static int +vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) +{ + int ret = 0; + + D1(NULL, "vsw_check_attr enter\n"); + + /* + * Note we currently only support in-band descriptors + * and descriptor rings, not packet based transfer (VIO_PKT_MODE) + */ + if ((pkt->xfer_mode != VIO_DESC_MODE) && + (pkt->xfer_mode != VIO_DRING_MODE)) { + D2(NULL, "vsw_check_attr: unknown mode %x\n", + pkt->xfer_mode); + ret = 1; + } + + /* Only support MAC addresses at moment. */ + if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { + D2(NULL, "vsw_check_attr: invalid addr_type %x, " + "or address 0x%llx\n", pkt->addr_type, + pkt->addr); + ret = 1; + } + + /* + * MAC address supplied by device should match that stored + * in the vsw-port OBP node. Need to decide what to do if they + * don't match, for the moment just warn but don't fail. + */ + if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { + DERR(NULL, "vsw_check_attr: device supplied address " + "0x%llx doesn't match node address 0x%llx\n", + pkt->addr, port->p_macaddr); + } + + /* + * Ack freq only makes sense in pkt mode, in shared + * mode the ring descriptors say whether or not to + * send back an ACK. + */ + if ((pkt->xfer_mode == VIO_DRING_MODE) && + (pkt->ack_freq > 0)) { + D2(NULL, "vsw_check_attr: non zero ack freq " + " in SHM mode\n"); + ret = 1; + } + + /* + * Note: for the moment we only support ETHER + * frames. This may change in the future. + */ + if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) { + D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", + pkt->mtu); + ret = 1; + } + + D1(NULL, "vsw_check_attr exit\n"); + + return (ret); +} + +/* + * Returns 1 if there is a problem, 0 otherwise. 
+ */ +static int +vsw_check_dring_info(vio_dring_reg_msg_t *pkt) +{ + _NOTE(ARGUNUSED(pkt)) + + int ret = 0; + + D1(NULL, "vsw_check_dring_info enter\n"); + + if ((pkt->num_descriptors == 0) || + (pkt->descriptor_size == 0) || + (pkt->ncookies != 1)) { + DERR(NULL, "vsw_check_dring_info: invalid dring msg"); + ret = 1; + } + + D1(NULL, "vsw_check_dring_info exit\n"); + + return (ret); +} + +/* + * Returns 1 if two memory cookies match. Otherwise returns 0. + */ +static int +vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) +{ + if ((m1->addr != m2->addr) || + (m2->size != m2->size)) { + return (0); + } else { + return (1); + } +} + +/* + * Returns 1 if ring described in reg message matches that + * described by dring_info structure. Otherwise returns 0. + */ +static int +vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) +{ + if ((msg->descriptor_size != dp->descriptor_size) || + (msg->num_descriptors != dp->num_descriptors) || + (msg->ncookies != dp->ncookies) || + !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { + return (0); + } else { + return (1); + } + +} + +static caddr_t +vsw_print_ethaddr(uint8_t *a, char *ebuf) +{ + (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", + a[0], a[1], a[2], a[3], a[4], a[5]); + return (ebuf); +} + +/* + * Reset and free all the resources associated with + * the channel. 
+ */ +static void +vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) +{ + dring_info_t *dp, *dpp; + lane_t *lp = NULL; + int rv = 0; + + ASSERT(ldcp != NULL); + + D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); + + if (dir == INBOUND) { + D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" + " of channel %lld", __func__, ldcp->ldc_id); + lp = &ldcp->lane_in; + } else { + D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" + " of channel %lld", __func__, ldcp->ldc_id); + lp = &ldcp->lane_out; + } + + lp->lstate = VSW_LANE_INACTIV; + lp->seq_num = VNET_ISS; + if (lp->dringp) { + if (dir == INBOUND) { + dp = lp->dringp; + while (dp != NULL) { + dpp = dp->next; + if (dp->handle != NULL) + (void) ldc_mem_dring_unmap(dp->handle); + kmem_free(dp, sizeof (dring_info_t)); + dp = dpp; + } + } else { + /* + * unbind, destroy exported dring, free dring struct + */ + dp = lp->dringp; + rv = vsw_free_ring(dp); + } + if (rv == 0) { + lp->dringp = NULL; + } + } + + D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); +} + +/* + * Free ring and all associated resources. + */ +static int +vsw_free_ring(dring_info_t *dp) +{ + vsw_private_desc_t *paddr = NULL; + dring_info_t *dpp; + int i, rv = 1; + + while (dp != NULL) { + mutex_enter(&dp->dlock); + dpp = dp->next; + if (dp->priv_addr != NULL) { + /* + * First unbind and free the memory handles + * stored in each descriptor within the ring. 
+ */ + for (i = 0; i < VSW_RING_NUM_EL; i++) { + paddr = (vsw_private_desc_t *) + dp->priv_addr + i; + if (paddr->memhandle != NULL) { + if (paddr->bound == 1) { + rv = ldc_mem_unbind_handle( + paddr->memhandle); + + if (rv != 0) { + DERR(NULL, "error " + "unbinding handle for " + "ring 0x%llx at pos %d", + dp, i); + mutex_exit(&dp->dlock); + return (rv); + } + paddr->bound = 0; + } + + rv = ldc_mem_free_handle( + paddr->memhandle); + if (rv != 0) { + DERR(NULL, "error freeing " + "handle for ring " + "0x%llx at pos %d", + dp, i); + mutex_exit(&dp->dlock); + return (rv); + } + paddr->memhandle = NULL; + } + } + kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) + * VSW_RING_NUM_EL)); + } + + /* + * Now unbind and destroy the ring itself. + */ + if (dp->handle != NULL) { + (void) ldc_mem_dring_unbind(dp->handle); + (void) ldc_mem_dring_destroy(dp->handle); + } + + if (dp->data_addr != NULL) { + kmem_free(dp->data_addr, dp->data_sz); + } + + mutex_exit(&dp->dlock); + mutex_destroy(&dp->dlock); + kmem_free(dp, sizeof (dring_info_t)); + + dp = dpp; + } + return (0); +} + +/* + * Debugging routines + */ +static void +display_state(void) +{ + vsw_t *vswp; + vsw_port_list_t *plist; + vsw_port_t *port; + vsw_ldc_list_t *ldcl; + vsw_ldc_t *ldcp; + + cmn_err(CE_NOTE, "***** system state *****"); + + for (vswp = vsw_head; vswp; vswp = vswp->next) { + plist = &vswp->plist; + READ_ENTER(&plist->lockrw); + cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", + vswp->instance, plist->num_ports); + + for (port = plist->head; port != NULL; port = port->p_next) { + ldcl = &port->p_ldclist; + cmn_err(CE_CONT, "port %d : %d ldcs attached\n", + port->p_instance, ldcl->num_ldcs); + READ_ENTER(&ldcl->lockrw); + ldcp = ldcl->head; + for (; ldcp != NULL; ldcp = ldcp->ldc_next) { + cmn_err(CE_CONT, "chan %lu : dev %d : " + "status %d : phase %u\n", + ldcp->ldc_id, ldcp->dev_class, + ldcp->ldc_status, ldcp->hphase); + cmn_err(CE_CONT, "chan %lu : lsession %lu : " + "psession 
%lu\n", + ldcp->ldc_id, + ldcp->local_session, + ldcp->peer_session); + + cmn_err(CE_CONT, "Inbound lane:\n"); + display_lane(&ldcp->lane_in); + cmn_err(CE_CONT, "Outbound lane:\n"); + display_lane(&ldcp->lane_out); + } + RW_EXIT(&ldcl->lockrw); + } + RW_EXIT(&plist->lockrw); + } + cmn_err(CE_NOTE, "***** system state *****"); +} + +static void +display_lane(lane_t *lp) +{ + dring_info_t *drp; + + cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", + lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); + cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", + lp->addr_type, lp->addr, lp->xfer_mode); + cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); + + cmn_err(CE_CONT, "Dring info:\n"); + for (drp = lp->dringp; drp != NULL; drp = drp->next) { + cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", + drp->num_descriptors, drp->descriptor_size); + cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); + cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", + (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); + cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", + drp->ident, drp->end_idx); + display_ring(drp); + } +} + +static void +display_ring(dring_info_t *dringp) +{ + uint64_t i; + uint64_t priv_count = 0; + uint64_t pub_count = 0; + vnet_public_desc_t *pub_addr = NULL; + vsw_private_desc_t *priv_addr = NULL; + + for (i = 0; i < VSW_RING_NUM_EL; i++) { + if (dringp->pub_addr != NULL) { + pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; + + if (pub_addr->hdr.dstate == VIO_DESC_FREE) + pub_count++; + } + + if (dringp->priv_addr != NULL) { + priv_addr = + (vsw_private_desc_t *)dringp->priv_addr + i; + + if (priv_addr->dstate == VIO_DESC_FREE) + priv_count++; + } + } + cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", + i, priv_count, pub_count); +} + +static void +dump_flags(uint64_t state) +{ + int i; + + typedef struct flag_name { + int flag_val; + char *flag_name; + } flag_name_t; + + flag_name_t flags[] = { + VSW_VER_INFO_SENT, 
"VSW_VER_INFO_SENT", + VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", + VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", + VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", + VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", + VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", + VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", + VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", + VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", + VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", + VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", + VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", + VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", + VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", + VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", + VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", + VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", + VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", + VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", + VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", + VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", + VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", + VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", + VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", + VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", + VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", + VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", + VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", + VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", + VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", + VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; + + DERR(NULL, "DUMP_FLAGS: %llx\n", state); + for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { + if (state & flags[i].flag_val) + DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); + } +} diff --git a/usr/src/uts/sun4v/ldc/Makefile b/usr/src/uts/sun4v/ldc/Makefile new file mode 100644 index 0000000000..ef3961d65c --- /dev/null +++ b/usr/src/uts/sun4v/ldc/Makefile @@ -0,0 +1,96 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# uts/sun4v/ldc/Makefile +# +# This makefile drives the production of the LDC transport kernel module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ldc +OBJECTS = $(LDC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(LDC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += -v + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/ml/hcall.s b/usr/src/uts/sun4v/ml/hcall.s index d445127644..360626f2eb 100644 --- a/usr/src/uts/sun4v/ml/hcall.s +++ b/usr/src/uts/sun4v/ml/hcall.s @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -37,6 +38,30 @@ #if defined(lint) || defined(__lint) /*ARGSUSED*/ +uint64_t +hv_mach_exit(uint64_t exit_code) +{ return (0); } + +uint64_t +hv_mach_sir(void) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_cpu_start(uint64_t cpuid, uint64_t pc, uint64_t rtba, uint64_t arg) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_cpu_stop(uint64_t cpuid) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_cpu_set_rtba(uint64_t *rtba) +{ return (0); } + +/*ARGSUSED*/ int64_t hv_cnputchar(uint8_t ch) { return (0); } @@ -61,6 +86,11 @@ uint64_t hv_mmu_map_perm_addr(void *vaddr, int ctx, uint64_t tte, int flags) { return (0); } +/*ARGSUSED */ +uint64_t +hv_mmu_fault_area_conf(void *raddr) +{ return (0); } + /*ARGSUSED*/ uint64_t hv_mmu_unmap_perm_addr(void *vaddr, int ctx, int flags) @@ -171,7 +201,7 @@ hv_ttrace_freeze(uint64_t freeze, uint64_t *prev_freeze) uint64_t hv_mach_desc(uint64_t buffer_ra, uint64_t *buffer_sizep) { return (0); } - + /*ARGSUSED*/ uint64_t hv_ra2pa(uint64_t ra) @@ -182,31 +212,190 @@ uint64_t hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) { return (0); } +/*ARGSUSED*/ +uint64_t +hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base, uint64_t nentries) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base, uint64_t *nentries) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_tx_get_state(uint64_t channel, + uint64_t *headp, uint64_t *tailp, uint64_t *state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_tx_set_qtail(uint64_t channel, uint64_t tail) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_rx_qconf(uint64_t channel, uint64_t 
ra_base, uint64_t nentries) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base, uint64_t *nentries) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_rx_get_state(uint64_t channel, + uint64_t *headp, uint64_t *tailp, uint64_t *state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_send_msg(uint64_t channel, uint64_t msg_ra) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra, uint64_t tbl_entries) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_ldc_copy(uint64_t channel, uint64_t request, uint64_t cookie, + uint64_t raddr, uint64_t length, uint64_t *lengthp) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino, uint64_t *cookie) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino, uint64_t cookie) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino, int *intr_valid_state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino, int intr_valid_state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino, int *intr_state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino, int intr_state) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino, uint32_t *cpuid) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino, uint32_t cpuid) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_api_get_version(uint64_t api_group, uint64_t *majorp, uint64_t *minorp) +{ return (0); } + +/*ARGSUSED*/ +uint64_t +hv_api_set_version(uint64_t api_group, uint64_t major, uint64_t minor, + uint64_t *supported_minor) +{ return (0); } + #else /* lint || 
__lint */ /* - * %o0 - character + * int hv_mach_exit(uint64_t exit_code) + */ + ENTRY(hv_mach_exit) + mov HV_MACH_EXIT, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_mach_exit) + + /* + * uint64_t hv_mach_sir(void) + */ + ENTRY(hv_mach_sir) + mov HV_MACH_SIR, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_mach_sir) + + /* + * hv_cpu_start(uint64_t cpuid, uint64_t pc, ui64_t rtba, + * uint64_t arg) + */ + ENTRY(hv_cpu_start) + mov HV_CPU_START, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_cpu_start) + + /* + * hv_cpu_stop(uint64_t cpuid) + */ + ENTRY(hv_cpu_stop) + mov HV_CPU_STOP, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_cpu_stop) + + /* + * hv_cpu_set_rtba(uint64_t *rtba) + */ + ENTRY(hv_cpu_set_rtba) + mov %o0, %o2 + ldx [%o2], %o0 + mov HV_CPU_SET_RTBA, %o5 + ta FAST_TRAP + stx %o1, [%o2] + retl + nop + SET_SIZE(hv_cpu_set_rtba) + + /* + * int64_t hv_cnputchar(uint8_t ch) */ ENTRY(hv_cnputchar) - mov CONS_WRITE, %o5 + mov CONS_PUTCHAR, %o5 ta FAST_TRAP - tst %o0 retl - movnz %xcc, -1, %o0 + nop SET_SIZE(hv_cnputchar) /* - * %o0 pointer to character buffer - * return values: - * 0 success - * hv_errno failure + * int64_t hv_cngetchar(uint8_t *ch) */ ENTRY(hv_cngetchar) mov %o0, %o2 - mov CONS_READ, %o5 + mov CONS_GETCHAR, %o5 ta FAST_TRAP brnz,a %o0, 1f ! 
failure, just return error - mov 1, %o0 + nop cmp %o1, H_BREAK be 1f @@ -220,7 +409,7 @@ hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) mov 0, %o0 1: retl - nop + nop SET_SIZE(hv_cngetchar) ENTRY(hv_tod_get) @@ -253,6 +442,19 @@ hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) SET_SIZE(hv_mmu_map_perm_addr) /* + * hv_mmu_fault_area_conf(void *raddr) + */ + ENTRY(hv_mmu_fault_area_conf) + mov %o0, %o2 + ldx [%o2], %o0 + mov MMU_SET_INFOPTR, %o5 + ta FAST_TRAP + stx %o1, [%o2] + retl + nop + SET_SIZE(hv_mmu_fault_area_conf) + + /* * Unmap permanent address * arg0 vaddr (%o0) * arg1 context (%o1) @@ -308,7 +510,7 @@ hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) * arg2 Size (%o2) */ ENTRY(hv_cpu_qconf) - mov CPU_QCONF, %o5 + mov HV_CPU_QCONF, %o5 ta FAST_TRAP retl nop @@ -537,7 +739,7 @@ hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) * arg0 enable/ freeze (%o0) * ret0 status (%o0) * ret1 previous freeze state (%o1) - */ + */ ENTRY(hv_ttrace_freeze) mov %o1, %o2 mov TTRACE_FREEZE, %o5 @@ -597,4 +799,320 @@ hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3) nop SET_SIZE(hv_hpriv) + /* + * hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base, + * uint64_t nentries); + */ + ENTRY(hv_ldc_tx_qconf) + mov LDC_TX_QCONF, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_ldc_tx_qconf) + + + /* + * hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base, + * uint64_t *nentries); + */ + ENTRY(hv_ldc_tx_qinfo) + mov %o1, %g1 + mov %o2, %g2 + mov LDC_TX_QINFO, %o5 + ta FAST_TRAP + stx %o1, [%g1] + retl + stx %o2, [%g2] + SET_SIZE(hv_ldc_tx_qinfo) + + + /* + * hv_ldc_tx_get_state(uint64_t channel, + * uint64_t *headp, uint64_t *tailp, uint64_t *state); + */ + ENTRY(hv_ldc_tx_get_state) + mov LDC_TX_GET_STATE, %o5 + mov %o1, %g1 + mov %o2, %g2 + mov %o3, %g3 + ta FAST_TRAP + stx %o1, [%g1] + stx %o2, [%g2] + retl + stx %o3, [%g3] + SET_SIZE(hv_ldc_tx_get_state) + + + /* + * hv_ldc_tx_set_qtail(uint64_t 
channel, uint64_t tail)
+	 */
+	ENTRY(hv_ldc_tx_set_qtail)
+	mov	LDC_TX_SET_QTAIL, %o5
+	ta	FAST_TRAP
+	retl
+	nop			! fill delay slot; don't execute next routine
+	SET_SIZE(hv_ldc_tx_set_qtail)
+
+
+	/*
+	 * hv_ldc_rx_qconf(uint64_t channel, uint64_t ra_base,
+	 * uint64_t nentries);
+	 */
+	ENTRY(hv_ldc_rx_qconf)
+	mov	LDC_RX_QCONF, %o5
+	ta	FAST_TRAP
+	retl
+	nop
+	SET_SIZE(hv_ldc_rx_qconf)
+
+
+	/*
+	 * hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base,
+	 * uint64_t *nentries);
+	 */
+	ENTRY(hv_ldc_rx_qinfo)
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov	LDC_RX_QINFO, %o5
+	ta	FAST_TRAP
+	stx	%o1, [%g1]
+	retl
+	stx	%o2, [%g2]
+	SET_SIZE(hv_ldc_rx_qinfo)
+
+
+	/*
+	 * hv_ldc_rx_get_state(uint64_t channel,
+	 * uint64_t *headp, uint64_t *tailp, uint64_t *state);
+	 */
+	ENTRY(hv_ldc_rx_get_state)
+	mov	LDC_RX_GET_STATE, %o5
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov	%o3, %g3
+	ta	FAST_TRAP
+	stx	%o1, [%g1]
+	stx	%o2, [%g2]
+	retl
+	stx	%o3, [%g3]
+	SET_SIZE(hv_ldc_rx_get_state)
+
+
+	/*
+	 * hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head)
+	 */
+	ENTRY(hv_ldc_rx_set_qhead)
+	mov	LDC_RX_SET_QHEAD, %o5
+	ta	FAST_TRAP
+	retl
+	nop			! fill delay slot; don't execute next routine
+	SET_SIZE(hv_ldc_rx_set_qhead)
+
+	/*
+	 * hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra,
+	 * uint64_t tbl_entries)
+	 */
+	ENTRY(hv_ldc_set_map_table)
+	mov	LDC_SET_MAP_TABLE, %o5
+	ta	FAST_TRAP
+	retl
+	nop
+	SET_SIZE(hv_ldc_set_map_table)
+
+
+	/*
+	 * hv_ldc_get_map_table(uint64_t channel, uint64_t *tbl_ra,
+	 * uint64_t *tbl_entries)
+	 */
+	ENTRY(hv_ldc_get_map_table)
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov	LDC_GET_MAP_TABLE, %o5
+	ta	FAST_TRAP
+	stx	%o1, [%g1]
+	retl
+	stx	%o2, [%g2]
+	SET_SIZE(hv_ldc_get_map_table)
+
+
+	/*
+	 * hv_ldc_copy(uint64_t channel, uint64_t request, uint64_t cookie,
+	 * uint64_t raddr, uint64_t length, uint64_t *lengthp);
+	 */
+	ENTRY(hv_ldc_copy)
+	mov	%o5, %g1
+	mov	LDC_COPY, %o5
+	ta	FAST_TRAP
+	retl
+	stx	%o1, [%g1]
+	SET_SIZE(hv_ldc_copy)
+
+
+	/*
+	 * hv_ldc_mapin(uint64_t channel, uint64_t cookie, uint64_t *raddr,
+	 * uint64_t *perm)
+	 */
+	ENTRY(hv_ldc_mapin)
+	mov	%o2, %g1
+
mov %o3, %g2 + mov LDC_MAPIN, %o5 + ta FAST_TRAP + stx %o1, [%g1] + retl + stx %o2, [%g2] + SET_SIZE(hv_ldc_mapin) + + + /* + * hv_ldc_unmap(uint64_t raddr) + */ + ENTRY(hv_ldc_unmap) + mov LDC_UNMAP, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_ldc_unmap) + + + /* + * hv_ldc_revoke(uint64_t raddr) + */ + ENTRY(hv_ldc_revoke) + mov LDC_REVOKE, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hv_ldc_revoke) + + + /* + * hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino, + * uint64_t *cookie); + */ + ENTRY(hvldc_intr_getcookie) + mov %o2, %g1 + mov VINTR_GET_COOKIE, %o5 + ta FAST_TRAP + retl + stx %o1, [%g1] + SET_SIZE(hvldc_intr_getcookie) + + /* + * hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino, + * uint64_t cookie); + */ + ENTRY(hvldc_intr_setcookie) + mov VINTR_SET_COOKIE, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hvldc_intr_setcookie) + + + /* + * hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino, + * int *intr_valid_state); + */ + ENTRY(hvldc_intr_getvalid) + mov %o2, %g1 + mov VINTR_GET_VALID, %o5 + ta FAST_TRAP + retl + stuw %o1, [%g1] + SET_SIZE(hvldc_intr_getvalid) + + /* + * hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino, + * int intr_valid_state); + */ + ENTRY(hvldc_intr_setvalid) + mov VINTR_SET_VALID, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hvldc_intr_setvalid) + + /* + * hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino, + * int *intr_state); + */ + ENTRY(hvldc_intr_getstate) + mov %o2, %g1 + mov VINTR_GET_STATE, %o5 + ta FAST_TRAP + retl + stuw %o1, [%g1] + SET_SIZE(hvldc_intr_getstate) + + /* + * hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino, + * int intr_state); + */ + ENTRY(hvldc_intr_setstate) + mov VINTR_SET_STATE, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hvldc_intr_setstate) + + /* + * hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino, + * uint32_t *cpuid); + */ + ENTRY(hvldc_intr_gettarget) + mov %o2, %g1 + mov VINTR_GET_TARGET, %o5 + ta FAST_TRAP + retl + stuw %o1, [%g1] + 
SET_SIZE(hvldc_intr_gettarget) + + /* + * hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino, + * uint32_t cpuid); + */ + ENTRY(hvldc_intr_settarget) + mov VINTR_SET_TARGET, %o5 + ta FAST_TRAP + retl + nop + SET_SIZE(hvldc_intr_settarget) + + /* + * hv_api_get_version(uint64_t api_group, uint64_t *majorp, + * uint64_t *minorp) + * + * API_GET_VERSION + * arg0 API group + * ret0 status + * ret1 major number + * ret2 minor number + */ + ENTRY(hv_api_get_version) + mov %o1, %o3 + mov %o2, %o4 + mov API_GET_VERSION, %o5 + ta CORE_TRAP + stx %o1, [%o3] + retl + stx %o2, [%o4] + SET_SIZE(hv_api_get_version) + + /* + * hv_api_set_version(uint64_t api_group, uint64_t major, + * uint64_t minor, uint64_t *supported_minor) + * + * API_SET_VERSION + * arg0 API group + * arg1 major number + * arg2 requested minor number + * ret0 status + * ret1 actual minor number + */ + ENTRY(hv_api_set_version) + mov %o3, %o4 + mov API_SET_VERSION, %o5 + ta CORE_TRAP + retl + stx %o1, [%o4] + SET_SIZE(hv_api_set_version) + #endif /* lint || __lint */ diff --git a/usr/src/uts/sun4v/ml/mach_offsets.in b/usr/src/uts/sun4v/ml/mach_offsets.in index 8f032ed908..1770b73bb2 100644 --- a/usr/src/uts/sun4v/ml/mach_offsets.in +++ b/usr/src/uts/sun4v/ml/mach_offsets.in @@ -2,9 +2,8 @@ \ CDDL HEADER START \ \ The contents of this file are subject to the terms of the -\ Common Development and Distribution License, Version 1.0 only -\ (the "License"). You may not use this file except in compliance -\ with the License. +\ Common Development and Distribution License (the "License"). +\ You may not use this file except in compliance with the License. \ \ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE \ or http://www.opensolaris.org/os/licensing. @@ -19,7 +18,7 @@ \ \ CDDL HEADER END \ -\ Copyright 2004 Sun Microsystems, Inc. All rights reserved. +\ Copyright 2006 Sun Microsystems, Inc. All rights reserved. \ Use is subject to license terms. 
\ \ offsets.in: input file to produce assym.h using the stabs program @@ -79,6 +78,7 @@ #include <vm/hat_sfmmu.h> #include <sys/traptrace.h> +#include <sys/lpad.h> machcpu intrstat MCPU_INTRSTAT @@ -280,3 +280,16 @@ ptl1_gregs ptl1_g6 ptl1_g7 +lpad_data + magic LPAD_MAGIC + inuse LPAD_INUSE + mmfsa_ra LPAD_MMFSA_RA + pc LPAD_PC + arg LPAD_ARG + nmap LPAD_NMAP + map LPAD_MAP + +lpad_map LPAD_MAP_SIZE + flags LPAD_MAP_FLAGS + va LPAD_MAP_VA + tte LPAD_MAP_TTE diff --git a/usr/src/uts/sun4v/ml/mach_proc_init.s b/usr/src/uts/sun4v/ml/mach_proc_init.s new file mode 100644 index 0000000000..20d4d3c3cc --- /dev/null +++ b/usr/src/uts/sun4v/ml/mach_proc_init.s @@ -0,0 +1,211 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v processor initialization + * + * This is the kernel entry point for CPUs that enter Solaris + * directly from the hypervisor. i.e. without going through OBP. 
+ */ + +#if !defined(lint) +#include "assym.h" +#endif /* !lint */ + +#include <sys/asm_linkage.h> +#include <sys/hypervisor_api.h> +#include <sys/machasi.h> +#include <sys/machpcb.h> +#include <sys/machlock.h> +#include <sys/mmu.h> +#include <sys/lpad.h> + +#if defined(lint) + +/* ARGSUSED */ +void +mach_cpu_startup(uint64_t rabase, uint64_t memsz) +{} + +#else /* lint */ + + /* + * %o0 - hcall specified arg (cpuid) + * %i0 - real memory base + * %i1 - memory size + */ + ENTRY_NP(mach_cpu_startup) + /* + * Calculate the data pointer. The landing pad + * data immediately follows the landing pad text. + */ + rd %pc, %l0 + add %l0, LPAD_TEXT_SIZE, %l1 ! %l1 has start of data + + /* + * Setup the initial state of the CPU. + */ + wrpr %g0, 0, %tl + wrpr %g0, 0, %gl + wrpr %g0, MAXWIN - 2, %cansave + wrpr %g0, MAXWIN - 2, %cleanwin + wrpr %g0, 0, %canrestore + wrpr %g0, 0, %otherwin + wrpr %g0, 0, %cwp + wrpr %g0, 0, %wstate + wr %g0, %y + wrpr %g0, PIL_MAX, %pil + + set trap_table, %g1 + wrpr %g1, %tba + + ! initialize cpuid into scratchpad register + mov SCRATCHPAD_CPUID, %g1 + stxa %o0, [%g1]ASI_SCRATCHPAD + + ! sanity check the data section + setx LPAD_MAGIC_VAL, %g2, %g1 + ldx [%l1 + LPAD_MAGIC], %g2 + cmp %g1, %g2 + bne startup_error + nop + + /* + * Loop through the array of TTE's, installing the + * VA to RA mapping for each one. + */ + ldx [%l1 + LPAD_NMAP], %l2 ! %l2 = number of mappings + add %l1, LPAD_MAP, %l3 ! %l3 = the current mapping + + /* + * Sanity check the number of mappings. + */ + mulx %l2, LPAD_MAP_SIZE, %g1 + add %l3, %g1, %g1 ! %g1 = end of the array + add %l1, LPAD_DATA_SIZE, %g2 ! %g2 = end of data section + sub %g2, %g1, %g2 + brlz %g2, startup_error + nop + +0: + cmp %l2, %g0 + be 3f + nop + + ldx [%l3 + LPAD_MAP_FLAGS], %l4 ! %l4 = flags + + /* + * Generate args for the HV call + */ + ldx [%l3 + LPAD_MAP_VA], %o0 ! %o0 = virtual address + mov KCONTEXT, %o1 ! %o1 = context + ldx [%l3 + LPAD_MAP_TTE], %o2 ! 
%o2 = TTE + and %l4, FLAG_MMUFLAGS_MASK, %o3 ! %o3 = MMU flags + + ! check if this is a locked TTE + and %l4, FLAG_LOCK_MASK, %l4 + cmp %l4, %g0 + bne 1f + nop + + ! install an unlocked entry + ta MMU_MAP_ADDR + ba 2f + nop +1: + ! install a locked entry + mov MAP_PERM_ADDR, %o5 + ta FAST_TRAP + +2: + ! check for errors from the hcall + cmp %o0, %g0 + bne startup_error + nop + + sub %l2, 1, %l2 ! decrement counter + add %l3, LPAD_MAP_SIZE, %l3 ! increment pointer + + ba 0b + nop + +3: + /* + * Set the MMU fault status area + */ + ldx [%l1 + LPAD_MMFSA_RA], %o0 + + mov MMU_SET_INFOPTR, %o5 + ta FAST_TRAP + + ! check for errors from the hcall + cmp %o0, %g0 + bne startup_error + nop + + /* + * Load remaining arguments before enabling the + * MMU so that the loads can be done using real + * addresses. + */ + ldx [%l1 + LPAD_PC], %l3 ! %l3 = specified entry point + ldx [%l1 + LPAD_ARG], %l4 ! %l4 = specified argument + ldx [%l1 + LPAD_INUSE], %l5 ! %l5 = va of inuse mailbox + + /* + * Enable the MMU. On success, it returns to the + * global version of the landing pad text, rather + * than the text copied into the lpad buffer. + */ + mov 1, %o0 ! %o0 = enable flag (1 = enable) + set startup_complete, %o1 ! VA of return address + mov MMU_ENABLE, %o5 + ta FAST_TRAP + + /* + * On errors, just enter a spin loop until the + * CPU that initiated the start recovers the CPU. + */ +startup_error: + ba startup_error + nop + + /* + * Jump to the generic CPU initialization code. + */ +startup_complete: + mov %l4, %o0 + jmpl %l3, %g0 + stx %g0, [%l5] ! 
clear the inuse mailbox + + SET_SIZE(mach_cpu_startup) + + .global mach_cpu_startup_end +mach_cpu_startup_end: + +#endif /* lint */ diff --git a/usr/src/uts/sun4v/ml/mach_subr_asm.s b/usr/src/uts/sun4v/ml/mach_subr_asm.s index f0a9255abf..28d3e2a0d8 100644 --- a/usr/src/uts/sun4v/ml/mach_subr_asm.s +++ b/usr/src/uts/sun4v/ml/mach_subr_asm.s @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -74,6 +73,80 @@ get_mmfsa_scratchpad() SET_SIZE(get_mmfsa_scratchpad) #endif /* lint */ + + +#if defined(lint) +/* ARGSUSED */ +void +cpu_intrq_unregister_powerdown(uint64_t doneflag_va) +{} + +#else /* lint */ + +/* + * Called from a x-trap at tl1 must use %g1 as arg + * and save/restore %o0-%o5 after hypervisor calls + */ + + ENTRY(cpu_intrq_unregister_powerdown) + + CPU_ADDR(%g2, %g3) + add %g2, CPU_MCPU, %g2 + /* + * Save %o regs + */ + mov %o0, %g3 + mov %o1, %g4 + mov %o2, %g5 + mov %o5, %g6 + + ldx [%g2 + MCPU_CPU_Q_BASE], %o1 + mov INTR_CPU_Q, %o0 + call hv_cpu_qconf + mov %g0, %o2 + + ldx [%g2 + MCPU_DEV_Q_BASE], %o1 + mov INTR_DEV_Q, %o0 + call hv_cpu_qconf + mov %g0, %o2 + + ldx [%g2 + MCPU_RQ_BASE], %o1 + mov CPU_RQ, %o0 + call hv_cpu_qconf + mov %g0, %o2 + + ldx [%g2 + MCPU_NRQ_BASE], %o1 + mov CPU_NRQ, %o0 + call hv_cpu_qconf + mov %g0, %o2 + + /* + * set done flag to 0 + */ + stub %g0, [%g1] + + /* + * Restore %o regs + */ + mov %g3, %o0 + mov %g4, %o1 + mov %g5, %o2 + mov %g6, %o5 + + /* + * This CPU is on its way out. Spin here + * until the DR unconfigure code stops it. + * Returning would put it back in the OS + * where it might grab resources like locks, + * causing some nastiness to occur. + */ +0: + ba,a 0b + + SET_SIZE(cpu_intrq_unregister_powerdown) +#endif /* lint */ + + #if defined(lint) /* ARGSUSED */ int diff --git a/usr/src/uts/sun4v/ml/trap_table.s b/usr/src/uts/sun4v/ml/trap_table.s index 391bb34f2e..24fb20058c 100644 --- a/usr/src/uts/sun4v/ml/trap_table.s +++ b/usr/src/uts/sun4v/ml/trap_table.s @@ -151,11 +151,7 @@ .align 32 #define NOTP4 NOTP; NOTP; NOTP; NOTP -/* - * RED is for traps that use the red mode handler. - * We should never see these either. - */ -#define RED NOT + /* * BAD is used for trap vectors we don't have a kernel * handler for. @@ -824,6 +820,25 @@ tt_pil/**/level: ;\ .align 32 /* + * We take over the rtba after we set our trap table and + * fault status area. 
The watchdog reset trap is now handled by the OS. + */ +#define WATCHDOG_RESET \ + mov PTL1_BAD_WATCHDOG, %g1 ;\ + ba,a,pt %xcc, .watchdog_trap ;\ + .align 32 + +/* + * RED is for traps that use the red mode handler. + * We should never see these either. + */ +#define RED \ + mov PTL1_BAD_RED, %g1 ;\ + ba,a,pt %xcc, .watchdog_trap ;\ + .align 32 + + +/* * MMU Trap Handlers. */ @@ -1124,7 +1139,7 @@ trap_table0: /* hardware traps */ NOT; /* 000 reserved */ RED; /* 001 power on reset */ - RED; /* 002 watchdog reset */ + WATCHDOG_RESET; /* 002 watchdog reset */ RED; /* 003 externally initiated reset */ RED; /* 004 software initiated reset */ RED; /* 005 red mode exception */ @@ -2683,6 +2698,20 @@ trace_dataprot: #endif /* TRAPTRACE */ /* + * Handle watchdog reset trap. Enable the MMU using the MMU_ENABLE + * HV service, which requires the return target to be specified as a VA + * since we are enabling the MMU. We set the target to ptl1_panic. + */ + + .type .watchdog_trap, #function +.watchdog_trap: + mov 1, %o0 + setx ptl1_panic, %g2, %o1 + mov MMU_ENABLE, %o5 + ta FAST_TRAP + done + SET_SIZE(.watchdog_trap) +/* * synthesize for trap(): SFAR in %g2, SFSR in %g3 */ .type .dmmu_exc_lddf_not_aligned, #function diff --git a/usr/src/uts/sun4v/os/fillsysinfo.c b/usr/src/uts/sun4v/os/fillsysinfo.c index 7cfb68fe7f..173019f902 100644 --- a/usr/src/uts/sun4v/os/fillsysinfo.c +++ b/usr/src/uts/sun4v/os/fillsysinfo.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -44,373 +43,750 @@ #include <sys/cmp.h> #include <sys/async.h> #include <vm/page.h> - -/* - * The OpenBoot Standalone Interface supplies the kernel with - * implementation dependent parameters through the devinfo/property mechanism - */ -typedef enum { XDRBOOL, XDRINT, XDRSTRING } xdrs; - -/* - * structure describing properties that we are interested in querying the - * OBP for. - */ -struct getprop_info { - char *name; - xdrs type; - uint_t *var; -}; - -/* - * structure used to convert between a string returned by the OBP & a type - * used within the kernel. We prefer to paramaterize rather than type. - */ -struct convert_info { - char *name; - uint_t var; - char *realname; -}; - -/* - * structure describing nodes that we are interested in querying the OBP for - * properties. - */ -struct node_info { - char *name; - int size; - struct getprop_info *prop; - struct getprop_info *prop_end; - unsigned int *value; -}; - -/* - * macro definitions for routines that form the OBP interface - */ -#define NEXT prom_nextnode -#define CHILD prom_childnode -#define GETPROP prom_getprop -#define GETPROPLEN prom_getproplen - -/* 0=quiet; 1=verbose; 2=debug */ -int debug_fillsysinfo = 0; -#define VPRINTF if (debug_fillsysinfo) prom_printf +#include <vm/hat_sfmmu.h> +#include <sys/sysmacros.h> +#include <sys/mach_descrip.h> +#include <sys/mdesc.h> +#include <sys/archsystm.h> +#include <sys/error.h> +#include <sys/mmu.h> +#include <sys/bitmap.h> int ncpunode; struct cpu_node cpunodes[NCPU]; -void fill_cpu(pnode_t); -void plat_fill_mc(pnode_t); -#pragma weak plat_fill_mc +uint64_t cpu_q_entries; +uint64_t dev_q_entries; +uint64_t cpu_rq_entries; +uint64_t cpu_nrq_entries; + +void fill_cpu(md_t *, mde_cookie_t); + +static uint64_t get_mmu_ctx_bits(md_t *, mde_cookie_t); +static 
uint64_t get_cpu_pagesizes(md_t *, mde_cookie_t); +static char *construct_isalist(md_t *, mde_cookie_t, char **); +static void set_at_flags(char *, int, char **); +static void init_md_broken(md_t *); +static int get_l2_cache_info(md_t *, mde_cookie_t, uint64_t *, uint64_t *, + uint64_t *); +static id_t get_exec_unit_mapping(md_t *, mde_cookie_t, mde_cookie_t *); +static int find_exec_unit_id(mde_cookie_t, mde_cookie_t *); +static void get_q_sizes(md_t *, mde_cookie_t); +static void get_va_bits(md_t *, mde_cookie_t); +static size_t get_ra_limit(md_t *); uint64_t system_clock_freq; int niobus = 0; uint_t niommu_tsbs = 0; -/* - * Hardware watchdog support. - */ -#define CHOSEN_EEPROM "eeprom" -static pnode_t chosen_eeprom; - -/* - * If this variable is non-zero, cpr should return "not supported" when - * it is queried even though it would normally be supported on this platform. - */ -int cpr_supported_override; +void +map_wellknown_devices() +{ +} -/* - * Some platforms may need to support CPR even in the absence of the - * energystar-v* property (Enchilada server, for example). If this - * variable is non-zero, cpr should proceed even in the absence - * of the energystar-v* property. - */ -int cpr_platform_enable = 0; +#define S_VAC_SIZE MMU_PAGESIZE +#define S_VAC_SHIFT MMU_PAGESHIFT /* - * Some nodes have functions that need to be called when they're seen. + * For backward compatibility we need to verify that we can handle + * running on platforms which shipped with missing MD properties. 
*/ -static void have_pci(pnode_t); - -static struct wkdevice { - char *wk_namep; - void (*wk_func)(pnode_t); - caddr_t *wk_vaddrp; - ushort_t wk_flags; -#define V_OPTIONAL 0x0000 -#define V_MUSTHAVE 0x0001 -#define V_MAPPED 0x0002 -#define V_MULTI 0x0003 /* optional, may be more than one */ -} wkdevice[] = { - { "pci", have_pci, NULL, V_MULTI }, - { 0, }, -}; - -static void map_wellknown(pnode_t); +#define ONTARIO_PLATNAME1 "SUNW,Sun-Fire-T200" +#define ONTARIO_PLATNAME2 "SUNW,Sun-Fire-T2000" +#define ERIE_PLATNAME1 "SUNW,Sun-Fire-T100" +#define ERIE_PLATNAME2 "SUNW,Sun-Fire-T1000" void -map_wellknown_devices() +fill_cpu(md_t *mdp, mde_cookie_t cpuc) { - struct wkdevice *wkp; - phandle_t ieeprom; - pnode_t root; - uint_t stick_freq; + struct cpu_node *cpunode; + uint64_t cpuid; + uint64_t clk_freq; + char *namebuf; + char *namebufp; + int namelen; + uint64_t associativity = 0, linesize = 0, size = 0; + int status; + + if (md_get_prop_val(mdp, cpuc, "id", &cpuid)) { + return; + } + if (cpuid >= NCPU) { + cmn_err(CE_CONT, "fill_cpu: out of range cpuid %ld - " + "cpu excluded from configuration", cpuid); + + mutex_enter(&cpu_lock); + + /* + * Since the CPU cannot be used, make sure it + * is in a safe place. If the firmware does not + * support CPU stop, this is known to be true. + * If it fails to stop for any other reason, the + * system is in an inconsistent state and cannot + * be allowed to continue. 
+ */ + status = stopcpu_bycpuid(cpuid); + + if ((status != 0) && (status != ENOTSUP)) { + cmn_err(CE_PANIC, "failed to stop cpu %lu (%d)", + cpuid, status); + } + + mutex_exit(&cpu_lock); + return; + } + + cpunode = &cpunodes[cpuid]; + cpunode->cpuid = (int)cpuid; + cpunode->device_id = cpuid; + + if (sizeof (cpunode->fru_fmri) > strlen(CPU_FRU_FMRI)) + (void) strcpy(cpunode->fru_fmri, CPU_FRU_FMRI); + + if (md_get_prop_data(mdp, cpuc, + "compatible", (uint8_t **)&namebuf, &namelen)) { + cmn_err(CE_PANIC, "fill_cpu: Cannot read compatible " + "property"); + } + namebufp = namebuf; + if (strncmp(namebufp, "SUNW,", 5) == 0) + namebufp += 5; + if (strlen(namebufp) > sizeof (cpunode->name)) + cmn_err(CE_PANIC, "Compatible property too big to " + "fit into the cpunode name buffer"); + (void) strcpy(cpunode->name, namebufp); + + if (md_get_prop_val(mdp, cpuc, + "clock-frequency", &clk_freq)) { + clk_freq = 0; + } + cpunode->clock_freq = clk_freq; + + ASSERT(cpunode->clock_freq != 0); /* - * if there is a chosen eeprom, note it (for have_eeprom()) + * Compute scaling factor based on rate of %tick. This is used + * to convert from ticks derived from %tick to nanoseconds. See + * comment in sun4u/sys/clock.h for details. */ - if (GETPROPLEN(prom_chosennode(), CHOSEN_EEPROM) == - sizeof (phandle_t) && - GETPROP(prom_chosennode(), CHOSEN_EEPROM, (caddr_t)&ieeprom) != -1) - chosen_eeprom = (pnode_t)prom_decode_int(ieeprom); + cpunode->tick_nsec_scale = (uint_t)(((uint64_t)NANOSEC << + (32 - TICK_NSEC_SHIFT)) / cpunode->clock_freq); - root = prom_nextnode((pnode_t)0); /* - * Get System clock frequency from root node if it exists. + * The nodeid is not used in sun4v at all. Setting it + * to positive value to make starting of slave CPUs + * code happy. 
*/ - if (GETPROP(root, "stick-frequency", (caddr_t)&stick_freq) != -1) - system_clock_freq = stick_freq; - - map_wellknown(NEXT((pnode_t)0)); + cpunode->nodeid = cpuid + 1; /* - * See if it worked + * Obtain the L2 cache information from MD. + * If "Cache" node exists, then set L2 cache properties + * as read from MD. + * If node does not exists, then set the L2 cache properties + * in individual CPU module. */ - for (wkp = wkdevice; wkp->wk_namep; ++wkp) { - if (wkp->wk_flags == V_MUSTHAVE) { - cmn_err(CE_PANIC, "map_wellknown_devices: required " - "device %s not mapped", wkp->wk_namep); - } + if ((!get_l2_cache_info(mdp, cpuc, + &associativity, &size, &linesize)) || + associativity == 0 || size == 0 || linesize == 0) { + cpu_fiximp(cpunode); + } else { + /* + * Do not expect L2 cache properties to be bigger + * than 32-bit quantity. + */ + cpunode->ecache_associativity = (int)associativity; + cpunode->ecache_size = (int)size; + cpunode->ecache_linesize = (int)linesize; } + + cpunode->ecache_setsize = + cpunode->ecache_size / cpunode->ecache_associativity; + + /* + * Start off by assigning the cpu id as the default + * mapping index. 
+ */ + + cpunode->exec_unit_mapping = NO_EU_MAPPING_FOUND; + + if (ecache_setsize == 0) + ecache_setsize = cpunode->ecache_setsize; + if (ecache_alignsize == 0) + ecache_alignsize = cpunode->ecache_linesize; + + ncpunode++; } -/* - * map_wellknown - map known devices & registers - */ -static void -map_wellknown(pnode_t curnode) +void +empty_cpu(int cpuid) { - extern int status_okay(int, char *, int); - char tmp_name[MAXSYSNAME]; - static void fill_address(pnode_t, char *); - int sok; + bzero(&cpunodes[cpuid], sizeof (struct cpu_node)); + ncpunode--; +} -#ifdef VPRINTF - VPRINTF("map_wellknown(%x)\n", curnode); -#endif /* VPRINTF */ +void +setup_exec_unit_mappings(md_t *mdp) +{ + uint64_t num, num_eunits; + mde_cookie_t cpus_node; + mde_cookie_t *node, *eunit; + int idx, i, j; + processorid_t cpuid; + char *eunit_name = broken_md_flag ? "exec_unit" : "exec-unit"; - for (curnode = CHILD(curnode); curnode; curnode = NEXT(curnode)) { - /* - * prune subtree if status property indicating not okay - */ - sok = status_okay((int)curnode, (char *)NULL, 0); - if (!sok) { - char devtype_buf[OBP_MAXPROPNAME]; - int size; - -#ifdef VPRINTF - VPRINTF("map_wellknown: !okay status property\n"); -#endif /* VPRINTF */ - /* - * a status property indicating bad memory will be - * associated with a node which has a "device_type" - * property with a value of "memory-controller" - */ - if ((size = GETPROPLEN(curnode, - OBP_DEVICETYPE)) == -1) - continue; - if (size > OBP_MAXPROPNAME) { - cmn_err(CE_CONT, "node %x '%s' prop too " - "big\n", curnode, OBP_DEVICETYPE); - continue; - } - if (GETPROP(curnode, OBP_DEVICETYPE, - devtype_buf) == -1) { - cmn_err(CE_CONT, "node %x '%s' get failed\n", - curnode, OBP_DEVICETYPE); - continue; + /* + * Find the cpu integer exec units - and + * setup the mappings appropriately. 
+ */ + num = md_alloc_scan_dag(mdp, md_root_node(mdp), "cpus", "fwd", &node); + if (num < 1) + cmn_err(CE_PANIC, "No cpus node in machine desccription"); + if (num > 1) + cmn_err(CE_PANIC, "More than 1 cpus node in machine" + " description"); + + cpus_node = node[0]; + md_free_scan_dag(mdp, &node); + + num_eunits = md_alloc_scan_dag(mdp, cpus_node, eunit_name, + "fwd", &eunit); + if (num_eunits > 0) { + char *match_type = broken_md_flag ? "int" : "integer"; + + /* Spin through and find all the integer exec units */ + for (i = 0; i < num_eunits; i++) { + char *p; + char *val; + int vallen; + uint64_t lcpuid; + + /* ignore nodes with no type */ + if (md_get_prop_data(mdp, eunit[i], "type", + (uint8_t **)&val, &vallen)) continue; + + for (p = val; *p != '\0'; p += strlen(p) + 1) { + if (strcmp(p, match_type) == 0) + goto found; } - if (strcmp(devtype_buf, "memory-controller") != 0) - continue; + + continue; +found: + idx = NCPU + i; /* - * ...else fall thru and process the node... + * find the cpus attached to this EU and + * update their mapping indices */ + num = md_alloc_scan_dag(mdp, eunit[i], "cpu", + "back", &node); + + if (num < 1) + cmn_err(CE_PANIC, "exec-unit node in MD" + " not attached to a cpu node"); + + for (j = 0; j < num; j++) { + if (md_get_prop_val(mdp, node[j], "id", + &lcpuid)) + continue; + if (lcpuid >= NCPU) + continue; + cpuid = (processorid_t)lcpuid; + cpunodes[cpuid].exec_unit_mapping = idx; + } + md_free_scan_dag(mdp, &node); } - bzero(tmp_name, MAXSYSNAME); - if (GETPROP(curnode, OBP_NAME, (caddr_t)tmp_name) != -1) - fill_address(curnode, tmp_name); - if (GETPROP(curnode, OBP_DEVICETYPE, tmp_name) != -1 && - strcmp(tmp_name, "cpu") == 0) { - fill_cpu(curnode); - } - if (sok && (strcmp(tmp_name, "memory-controller") == 0) && - (&plat_fill_mc != NULL)) - plat_fill_mc(curnode); - map_wellknown(curnode); + + md_free_scan_dag(mdp, &eunit); } } -static void -fill_address(pnode_t curnode, char *namep) +/* + * All the common setup of sun4v CPU 
modules is done by this routine. + */ +void +cpu_setup_common(char **cpu_module_isa_set) { - struct wkdevice *wkp; - int size; - uint32_t vaddr; - - for (wkp = wkdevice; wkp->wk_namep; ++wkp) { - if (strcmp(wkp->wk_namep, namep) != 0) - continue; - if (wkp->wk_flags == V_MAPPED) - return; - if (wkp->wk_vaddrp != NULL) { - if ((size = GETPROPLEN(curnode, OBP_ADDRESS)) == -1) { - cmn_err(CE_CONT, "device %s size %d\n", - namep, size); - continue; - } - if (size != sizeof (vaddr)) { - cmn_err(CE_CONT, "device %s address prop too " - "big\n", namep); - continue; - } - if (GETPROP(curnode, OBP_ADDRESS, - (caddr_t)&vaddr) == -1) { - cmn_err(CE_CONT, "device %s not mapped\n", - namep); - continue; - } + extern int disable_delay_tlb_flush, delay_tlb_flush; + extern int mmu_exported_pagesize_mask; + extern int vac_size, vac_shift; + extern uint_t vac_mask; + int nocpus, i; + size_t ra_limit; + mde_cookie_t *cpulist; + md_t *mdp; + + if ((mdp = md_get_handle()) == NULL) + cmn_err(CE_PANIC, "Unable to initialize machine description"); + + init_md_broken(mdp); + + nocpus = md_alloc_scan_dag(mdp, + md_root_node(mdp), "cpu", "fwd", &cpulist); + if (nocpus < 1) { + cmn_err(CE_PANIC, "cpu_common_setup: cpulist allocation " + "failed or incorrect number of CPUs in MD"); + } - /* make into a native pointer */ - *wkp->wk_vaddrp = (caddr_t)(uintptr_t)vaddr; -#ifdef VPRINTF - VPRINTF("fill_address: %s mapped to %p\n", namep, - *wkp->wk_vaddrp); -#endif /* VPRINTF */ + if (use_page_coloring) { + do_pg_coloring = 1; + if (use_virtual_coloring) { + /* + * XXX Sun4v cpus don't have virtual caches + */ + do_virtual_coloring = 1; } - if (wkp->wk_func != NULL) - (*wkp->wk_func)(curnode); + } + + /* + * Get the valid contexts, mmu page sizes mask, Q sizes and isalist/r + * from the MD for the first available CPU in cpulist. 
+ */ + + if (nctxs == 0) + nctxs = (uint_t)(1 << get_mmu_ctx_bits(mdp, cpulist[0])); + + if (nctxs > MAX_NCTXS) + nctxs = MAX_NCTXS; + + /* Do not expect the MMU page sizes mask to be more than 32-bit. */ + mmu_exported_pagesize_mask = (int)get_cpu_pagesizes(mdp, cpulist[0]); + + for (i = 0; i < nocpus; i++) + fill_cpu(mdp, cpulist[i]); + + setup_exec_unit_mappings(mdp); + + vac_size = S_VAC_SIZE; + vac_mask = MMU_PAGEMASK & (vac_size - 1); + vac_shift = S_VAC_SHIFT; + shm_alignment = vac_size; + vac = 0; + + /* + * If MD is broken then append the passed ISA set, + * otherwise trust the MD. + */ + + if (broken_md_flag) + isa_list = construct_isalist(mdp, cpulist[0], + cpu_module_isa_set); + else + isa_list = construct_isalist(mdp, cpulist[0], NULL); + + get_q_sizes(mdp, cpulist[0]); + + get_va_bits(mdp, cpulist[0]); + + /* + * ra_limit is the highest real address in the machine. + */ + ra_limit = get_ra_limit(mdp); + + md_free_scan_dag(mdp, &cpulist); + + (void) md_fini_handle(mdp); + + /* + * Block stores invalidate all pages of the d$ so pagecopy + * et. al. do not need virtual translations with virtual + * coloring taken into consideration. + */ + pp_consistent_coloring = 0; + + /* + * The kpm mapping window. + * kpm_size: + * The size of a single kpm range. + * The overall size will be: kpm_size * vac_colors. + * kpm_vbase: + * The virtual start address of the kpm range within the kernel + * virtual address space. kpm_vbase has to be kpm_size aligned. + */ + + /* + * Make kpm_vbase, kpm_size aligned to kpm_size_shift. + * To do this find the nearest power of 2 size that the + * actual ra_limit fits within. + * If it is an even power of two use that, otherwise use the + * next power of two larger than ra_limit. + */ + + ASSERT(ra_limit != 0); + + kpm_size_shift = (ra_limit & (ra_limit - 1)) != 0 ? 
+ highbit(ra_limit) : highbit(ra_limit) - 1; + + /* + * No virtual caches on sun4v so size matches size shift + */ + kpm_size = 1ul << kpm_size_shift; + + if (va_bits < VA_ADDRESS_SPACE_BITS) { /* - * If this one is optional and there may be more than - * one, don't set V_MAPPED, which would cause us to skip it - * next time around + * In case of VA hole + * kpm_base = hole_end + 1TB + * Starting 1TB beyond where VA hole ends because on Niagara + * processor software must not use pages within 4GB of the + * VA hole as instruction pages to avoid problems with + * prefetching into the VA hole. */ - if (wkp->wk_flags != V_MULTI) - wkp->wk_flags = V_MAPPED; + kpm_vbase = (caddr_t)((0ull - (1ull << (va_bits - 1))) + + (1ull << 40)); + } else { /* Number of VA bits 64 ... no VA hole */ + kpm_vbase = (caddr_t)0x8000000000000000ull; /* 8 EB */ } + + /* + * The traptrace code uses either %tick or %stick for + * timestamping. The sun4v require use of %stick. + */ + traptrace_use_stick = 1; + + /* + * sun4v provides demap_all + */ + if (!disable_delay_tlb_flush) + delay_tlb_flush = 1; } -void -fill_cpu(pnode_t node) +/* + * Get the nctxs from MD. If absent panic. 
+ */ +static uint64_t +get_mmu_ctx_bits(md_t *mdp, mde_cookie_t cpu_node_cookie) { - struct cpu_node *cpunode; - processorid_t cpuid; - uint_t clk_freq; - char namebuf[OBP_MAXPROPNAME], unum[UNUM_NAMLEN]; - char *namebufp; + uint64_t ctx_bits; - if (GETPROP(node, "cpuid", (caddr_t)&cpuid) == -1) { - if (GETPROP(node, "reg", (caddr_t)&cpuid) == -1) - cmn_err(CE_PANIC, "reg prop not found in cpu node"); - cpuid = PROM_CFGHDL_TO_CPUID(cpuid); - } + if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-#context-bits", + &ctx_bits)) + ctx_bits = 0; - if (cpuid < 0 || cpuid >= NCPU) { - cmn_err(CE_CONT, "cpu (dnode %x): out of range cpuid %d - " - "cpu excluded from configuration\n", node, cpuid); - return; - } + if (ctx_bits < MIN_NCTXS_BITS || ctx_bits > MAX_NCTXS_BITS) + cmn_err(CE_PANIC, "Incorrect %ld number of contexts bits " + "returned by MD", ctx_bits); - cpunode = &cpunodes[cpuid]; - cpunode->cpuid = cpuid; - cpunode->device_id = cpuid; + return (ctx_bits); +} - unum[0] = '\0'; - (void) snprintf(cpunode->fru_fmri, sizeof (cpunode->fru_fmri), - "%s%s", CPU_FRU_FMRI, unum); - (void) GETPROP(node, "compatible", namebuf); - namebufp = namebuf; - if (strncmp(namebufp, "SUNW,", 5) == 0) - namebufp += 5; - (void) strcpy(cpunode->name, namebufp); +/* + * Initalize supported page sizes information. + * Set to 0, if the page sizes mask information is absent in MD. + */ +static uint64_t +get_cpu_pagesizes(md_t *mdp, mde_cookie_t cpu_node_cookie) +{ + uint64_t mmu_page_size_list; - if (GETPROP(node, "clock-frequency", (caddr_t)&clk_freq) == -1) { - /* - * If we didn't find it in the CPU node, look in the root node. 
- */ - pnode_t root = prom_nextnode((pnode_t)0); - if (GETPROP(root, "clock-frequency", (caddr_t)&clk_freq) == -1) - clk_freq = 0; - } - cpunode->clock_freq = clk_freq; + if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-page-size-list", + &mmu_page_size_list)) + mmu_page_size_list = 0; + + if (mmu_page_size_list == 0 || mmu_page_size_list > MAX_PAGESIZE_MASK) + cmn_err(CE_PANIC, "Incorrect 0x%lx pagesize mask returned" + "by MD", mmu_page_size_list); + + return (mmu_page_size_list); +} + +/* + * This routine gets the isalist information from MD and appends + * the CPU module ISA set if required. + */ +static char * +construct_isalist(md_t *mdp, mde_cookie_t cpu_node_cookie, + char **cpu_module_isa_set) +{ + extern int at_flags; + char *md_isalist; + int md_isalen; + char *isabuf; + int isalen; + char **isa_set; + char *p, *q; + int cpu_module_isalen = 0, found = 0; + + (void) md_get_prop_data(mdp, cpu_node_cookie, + "isalist", (uint8_t **)&isabuf, &isalen); - ASSERT(cpunode->clock_freq != 0); /* - * Compute scaling factor based on rate of %tick. This is used - * to convert from ticks derived from %tick to nanoseconds. See - * comment in sun4u/sys/clock.h for details. + * We support binaries for all the cpus that have shipped so far. + * The kernel emulates instructions that are not supported by hardware. */ - cpunode->tick_nsec_scale = (uint_t)(((uint64_t)NANOSEC << - (32 - TICK_NSEC_SHIFT)) / cpunode->clock_freq); + at_flags = EF_SPARC_SUN_US3 | EF_SPARC_32PLUS | EF_SPARC_SUN_US1; + /* + * Construct the space separated isa_list. + */ + if (cpu_module_isa_set != NULL) { + for (isa_set = cpu_module_isa_set; *isa_set != NULL; + isa_set++) { + cpu_module_isalen += strlen(*isa_set); + cpu_module_isalen++; /* for space character */ + } + } - cpunode->nodeid = node; + /* + * Allocate the buffer of MD isa buffer length + CPU module + * isa buffer length. 
+ */ + md_isalen = isalen + cpu_module_isalen + 2; + md_isalist = (char *)prom_alloc((caddr_t)0, md_isalen, 0); + if (md_isalist == NULL) + cmn_err(CE_PANIC, "construct_isalist: Allocation failed for " + "md_isalist"); + + md_isalist[0] = '\0'; /* create an empty string to start */ + for (p = isabuf, q = p + isalen; p < q; p += strlen(p) + 1) { + (void) strlcat(md_isalist, p, md_isalen); + (void) strcat(md_isalist, " "); + } /* - * Call cpu module specific code to fill in the cpu properities + * Check if the isa_set is present in isalist returned by MD. + * If yes, then no need to append it, if no then append it to + * isalist returned by MD. */ - cpu_fiximp(cpunode); + if (cpu_module_isa_set != NULL) { + for (isa_set = cpu_module_isa_set; *isa_set != NULL; + isa_set++) { + found = 0; + for (p = isabuf, q = p + isalen; p < q; + p += strlen(p) + 1) { + if (strcmp(p, *isa_set) == 0) { + found = 1; + break; + } + } + if (!found) { + (void) strlcat(md_isalist, *isa_set, md_isalen); + (void) strcat(md_isalist, " "); + } + } + } + + /* Get rid of any trailing white spaces */ + md_isalist[strlen(md_isalist) - 1] = '\0'; + + return (md_isalist); } -#define IOMMU_PER_SCHIZO 2 +uint64_t +get_ra_limit(md_t *mdp) +{ + mde_cookie_t *mem_list; + mde_cookie_t *mblock_list; + int i; + int memnodes; + int nmblock; + uint64_t base; + uint64_t size; + uint64_t ra_limit = 0, new_limit = 0; + + memnodes = md_alloc_scan_dag(mdp, + md_root_node(mdp), "memory", "fwd", &mem_list); + + ASSERT(memnodes == 1); + + nmblock = md_alloc_scan_dag(mdp, + mem_list[0], "mblock", "fwd", &mblock_list); + if (nmblock < 1) + cmn_err(CE_PANIC, "cannot find mblock nodes in MD"); + + for (i = 0; i < nmblock; i++) { + if (md_get_prop_val(mdp, mblock_list[i], "base", &base)) + cmn_err(CE_PANIC, "base property missing from MD" + " mblock node"); + if (md_get_prop_val(mdp, mblock_list[i], "size", &size)) + cmn_err(CE_PANIC, "size property missing from MD" + " mblock node"); + + ASSERT(size != 0); + + new_limit 
= base + size; + + if (base > new_limit) + cmn_err(CE_PANIC, "mblock in MD wrapped around"); + + if (new_limit > ra_limit) + ra_limit = new_limit; + } + + ASSERT(ra_limit != 0); + + if (ra_limit > MAX_REAL_ADDRESS) { + cmn_err(CE_WARN, "Highest real address in MD too large" + " clipping to %llx\n", MAX_REAL_ADDRESS); + ra_limit = MAX_REAL_ADDRESS; + } + + md_free_scan_dag(mdp, &mblock_list); + + md_free_scan_dag(mdp, &mem_list); + + return (ra_limit); +} /* - * The first psycho must always programmed up for the system clock and error - * handling purposes. + * This routine sets the globals for CPU and DEV mondo queue entries and + * resumable and non-resumable error queue entries. */ +static uint64_t +get_single_q_size(md_t *mdp, mde_cookie_t cpu_node_cookie, + char *qnamep, uint64_t default_entries) +{ + uint64_t entries; + + if (md_get_prop_val(mdp, cpu_node_cookie, qnamep, &entries)) { + if (!broken_md_flag) + cmn_err(CE_PANIC, "Missing %s property in MD cpu node", + qnamep); + entries = default_entries; + } else { + entries = 1 << entries; + } + return (entries); +} + + static void -have_pci(pnode_t node) +get_q_sizes(md_t *mdp, mde_cookie_t cpu_node_cookie) { - int size; - uint_t portid; - char compatible[OBP_MAXDRVNAME]; + cpu_q_entries = get_single_q_size(mdp, cpu_node_cookie, + "q-cpu-mondo-#bits", DEFAULT_CPU_Q_ENTRIES); + + dev_q_entries = get_single_q_size(mdp, cpu_node_cookie, + "q-dev-mondo-#bits", DEFAULT_DEV_Q_ENTRIES); + + cpu_rq_entries = get_single_q_size(mdp, cpu_node_cookie, + "q-resumable-#bits", CPU_RQ_ENTRIES); + + cpu_nrq_entries = get_single_q_size(mdp, cpu_node_cookie, + "q-nonresumable-#bits", CPU_NRQ_ENTRIES); +} + + +static void +get_va_bits(md_t *mdp, mde_cookie_t cpu_node_cookie) +{ + uint64_t value = VA_ADDRESS_SPACE_BITS; + + if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-#va-bits", &value)) + cmn_err(CE_PANIC, "mmu-#va-bits property not found in MD"); - size = GETPROPLEN(node, "portid"); - if (size == -1) size = GETPROPLEN(node, 
"upa-portid"); - if (size == -1) - return; - if (size > sizeof (portid)) - cmn_err(CE_PANIC, "portid size wrong"); - if (GETPROP(node, "portid", (caddr_t)&portid) == -1) - if (GETPROP(node, "upa-portid", (caddr_t)&portid) == -1) - cmn_err(CE_PANIC, "portid not found"); + if (value == 0 || value > VA_ADDRESS_SPACE_BITS) + cmn_err(CE_PANIC, "Incorrect number of va bits in MD"); - niobus++; + /* Do not expect number of VA bits to be more than 32-bit quantity */ + va_bits = (int)value; /* - * Need two physical TSBs for Schizo-compatible nodes, - * one otherwise. + * Correct the value for VA bits on UltraSPARC-T1 based systems + * in case of broken MD. */ - compatible[0] = '\0'; - (void) prom_getprop(node, OBP_COMPATIBLE, compatible); - if (strcmp(compatible, "pci108e,8001") == 0) - niommu_tsbs += IOMMU_PER_SCHIZO; - else - niommu_tsbs++; + if (broken_md_flag) + va_bits = DEFAULT_VA_ADDRESS_SPACE_BITS; } +/* + * This routine returns the L2 cache information such as -- associativity, + * size and linesize. + */ +static int +get_l2_cache_info(md_t *mdp, mde_cookie_t cpu_node_cookie, + uint64_t *associativity, uint64_t *size, uint64_t *linesize) +{ + mde_cookie_t *cachelist; + int ncaches, i; + uint64_t max_level; + + ncaches = md_alloc_scan_dag(mdp, cpu_node_cookie, "cache", + "fwd", &cachelist); + /* + * The "cache" node is optional in MD, therefore ncaches can be 0. 
+ */ + if (ncaches < 1) { + return (0); + } + + max_level = 0; + for (i = 0; i < ncaches; i++) { + uint64_t cache_level; + uint64_t local_assoc; + uint64_t local_size; + uint64_t local_lsize; + + if (md_get_prop_val(mdp, cachelist[i], "level", &cache_level)) + continue; + + if (cache_level <= max_level) continue; + + /* If properties are missing from this cache ignore it */ + + if ((md_get_prop_val(mdp, cachelist[i], + "associativity", &local_assoc))) { + continue; + } + + if ((md_get_prop_val(mdp, cachelist[i], + "size", &local_size))) { + continue; + } + + if ((md_get_prop_val(mdp, cachelist[i], + "line-size", &local_lsize))) { + continue; + } + + max_level = cache_level; + *associativity = local_assoc; + *size = local_size; + *linesize = local_lsize; + } -int -get_cpu_pagesizes(void) + md_free_scan_dag(mdp, &cachelist); + + return ((max_level > 0) ? 1 : 0); +} + +/* + * The broken_md_flag is set to 1, if the MD doesn't have + * the domaining-enabled property in the platform node and the platforms + * are Ontario and Erie. This flag is used to workaround some of the + * incorrect MD properties. + */ +static void +init_md_broken(md_t *mdp) { + int nrnode; + mde_cookie_t *platlist, rootnode; + char *vbuf; + uint64_t val = 0; + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + nrnode = md_alloc_scan_dag(mdp, md_root_node(mdp), "platform", "fwd", + &platlist); + + ASSERT(nrnode == 1); + + if (md_get_prop_str(mdp, platlist[0], "name", &vbuf) != 0) + panic("platform name not found in machine description"); + /* - * XXXQ Get supported page sizes information from the PD - * and return a bit mask indicating which page sizes are - * supported. - * - * Return 0 when no information is available. + * If domaining-enable prop doesn't exist and the platform name is + * Ontario or Erie the md is broken. 
*/ - return (0); /* XXXQ for now return 0 as no PD */ + if (md_get_prop_val(mdp, platlist[0], "domaining-enabled", &val) != 0 && + ((strcmp(vbuf, ONTARIO_PLATNAME1) == 0) || + (strcmp(vbuf, ONTARIO_PLATNAME2) == 0) || + (strcmp(vbuf, ERIE_PLATNAME1) == 0) || + (strcmp(vbuf, ERIE_PLATNAME2) == 0))) + broken_md_flag = 1; + + md_free_scan_dag(mdp, &platlist); } diff --git a/usr/src/uts/sun4v/os/hsvc.c b/usr/src/uts/sun4v/os/hsvc.c index 4b88b60222..e06012e920 100644 --- a/usr/src/uts/sun4v/os/hsvc.c +++ b/usr/src/uts/sun4v/os/hsvc.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -653,12 +654,15 @@ hsvc_init(void) * uses hypervisor services belonging to the HSVC_GROUP_CORE API * group only for itself. * - * Note that the HSVC_GROUP_DIAG is negotiated on behalf of - * any driver/module using DIAG services. + * Rest of the API groups are currently negotiated on behalf + * of the pcitool, glvc support. In future, when these drivers + * are modified to do the negotiation themselves, corresponding + * entry should be removed from the table below. */ static hsvc_info_t hsvcinfo_unix[] = { {HSVC_REV_1, NULL, HSVC_GROUP_SUN4V, 1, 0, NULL}, - {HSVC_REV_1, NULL, HSVC_GROUP_CORE, 1, 0, NULL}, + {HSVC_REV_1, NULL, HSVC_GROUP_CORE, 1, 1, NULL}, + {HSVC_REV_1, NULL, HSVC_GROUP_VSC, 1, 0, NULL}, {HSVC_REV_1, NULL, HSVC_GROUP_DIAG, 1, 0, NULL} }; diff --git a/usr/src/uts/sun4v/os/intrq.c b/usr/src/uts/sun4v/os/intrq.c index 0ddf35c033..ae905ed312 100644 --- a/usr/src/uts/sun4v/os/intrq.c +++ b/usr/src/uts/sun4v/os/intrq.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -34,28 +34,6 @@ #include <sys/error.h> #include <sys/hypervisor_api.h> -/* - * XXX needs to be set by some algorithm that derives this - * from the partition description - */ -int cpu_q_entries = 128; -int dev_q_entries = 128; - -/* - * Once the partition description if finallized - * cpu_q_entries and dev_q_entries will be set - * and be garaunteed to be two's power multiples. - */ -#define INTR_CPU_Q 0x3c -#define INTR_DEV_Q 0x3d -#define INTR_REPORT_SIZE 64 -#define INTR_CPU_Q_SIZE (cpu_q_entries * INTR_REPORT_SIZE) -#define INTR_DEV_Q_SIZE (dev_q_entries * INTR_REPORT_SIZE) - -/* - * XXX - This needs to be rewritten with prom calls to - * let OBP know the queues are allocated - */ void cpu_intrq_register(struct cpu *cpu) { @@ -72,13 +50,12 @@ cpu_intrq_register(struct cpu *cpu) cmn_err(CE_PANIC, "cpu%d: dev_mondo queue configuration " "failed, error %lu", cpu->cpu_id, ret); - ret = hv_cpu_qconf(CPU_RQ, mcpup->cpu_rq_base_pa, CPU_RQ_ENTRIES); + ret = hv_cpu_qconf(CPU_RQ, mcpup->cpu_rq_base_pa, cpu_rq_entries); if (ret != H_EOK) cmn_err(CE_PANIC, "cpu%d: resumable error queue configuration " "failed, error %lu", cpu->cpu_id, ret); - ret = hv_cpu_qconf(CPU_NRQ, mcpup->cpu_nrq_base_pa, - CPU_NRQ_ENTRIES); + ret = hv_cpu_qconf(CPU_NRQ, mcpup->cpu_nrq_base_pa, cpu_nrq_entries); if (ret != H_EOK) cmn_err(CE_PANIC, "cpu%d: non-resumable error queue " "configuration failed, error %lu", cpu->cpu_id, ret); @@ -89,6 +66,10 @@ cpu_intrq_setup(struct cpu *cpu) { struct machcpu *mcpup = &cpu->cpu_m; int cpu_list_size; + uint64_t cpu_q_size; + uint64_t dev_q_size; + uint64_t cpu_rq_size; + uint64_t cpu_nrq_size; /* * Allocate mondo data for xcalls. 
@@ -120,38 +101,109 @@ cpu_intrq_setup(struct cpu *cpu) /* * Allocate sun4v interrupt and error queues. */ - mcpup->cpu_q_va = contig_mem_alloc(INTR_CPU_Q_SIZE); + cpu_q_size = cpu_q_entries * INTR_REPORT_SIZE; + mcpup->cpu_q_va = contig_mem_alloc(cpu_q_size); if (mcpup->cpu_q_va == NULL) cmn_err(CE_PANIC, "cpu%d: cpu intrq allocation failed", cpu->cpu_id); mcpup->cpu_q_base_pa = va_to_pa(mcpup->cpu_q_va); - mcpup->cpu_q_size = INTR_CPU_Q_SIZE; + mcpup->cpu_q_size = cpu_q_size; - mcpup->dev_q_va = contig_mem_alloc(INTR_DEV_Q_SIZE); + dev_q_size = dev_q_entries * INTR_REPORT_SIZE; + mcpup->dev_q_va = contig_mem_alloc(dev_q_size); if (mcpup->dev_q_va == NULL) cmn_err(CE_PANIC, "cpu%d: dev intrq allocation failed", cpu->cpu_id); mcpup->dev_q_base_pa = va_to_pa(mcpup->dev_q_va); - mcpup->dev_q_size = INTR_DEV_Q_SIZE; + mcpup->dev_q_size = dev_q_size; /* Allocate resumable queue and its kernel buffer */ - mcpup->cpu_rq_va = contig_mem_alloc(2 * CPU_RQ_SIZE); + cpu_rq_size = cpu_rq_entries * Q_ENTRY_SIZE; + mcpup->cpu_rq_va = contig_mem_alloc(2 * cpu_rq_size); if (mcpup->cpu_rq_va == NULL) cmn_err(CE_PANIC, "cpu%d: resumable queue allocation failed", cpu->cpu_id); mcpup->cpu_rq_base_pa = va_to_pa(mcpup->cpu_rq_va); - mcpup->cpu_rq_size = CPU_RQ_SIZE; + mcpup->cpu_rq_size = cpu_rq_size; /* zero out the memory */ - bzero(mcpup->cpu_rq_va, 2 * CPU_RQ_SIZE); + bzero(mcpup->cpu_rq_va, 2 * cpu_rq_size); /* Allocate nonresumable queue here */ - mcpup->cpu_nrq_va = contig_mem_alloc(2 * CPU_NRQ_SIZE); + cpu_nrq_size = cpu_nrq_entries * Q_ENTRY_SIZE; + mcpup->cpu_nrq_va = contig_mem_alloc(2 * cpu_nrq_size); if (mcpup->cpu_nrq_va == NULL) cmn_err(CE_PANIC, "cpu%d: nonresumable queue " "allocation failed", cpu->cpu_id); mcpup->cpu_nrq_base_pa = va_to_pa(mcpup->cpu_nrq_va); - mcpup->cpu_nrq_size = CPU_NRQ_SIZE; + mcpup->cpu_nrq_size = cpu_nrq_size; /* zero out the memory */ - bzero(mcpup->cpu_nrq_va, 2 * CPU_NRQ_SIZE); + bzero(mcpup->cpu_nrq_va, 2 * cpu_nrq_size); +} +void 
+cpu_intrq_cleanup(struct cpu *cpu) +{ + struct machcpu *mcpup = &cpu->cpu_m; + int cpu_list_size; + uint64_t cpu_q_size; + uint64_t dev_q_size; + uint64_t cpu_rq_size; + uint64_t cpu_nrq_size; + + /* + * Free mondo data for xcalls. + */ + if (mcpup->mondo_data) { + contig_mem_free(mcpup->mondo_data, INTR_REPORT_SIZE); + mcpup->mondo_data = NULL; + mcpup->mondo_data_ra = NULL; + } + + /* + * Free percpu list of NCPU for xcalls + */ + cpu_list_size = NCPU * sizeof (uint16_t); + if (cpu_list_size < INTR_REPORT_SIZE) + cpu_list_size = INTR_REPORT_SIZE; + + if (mcpup->cpu_list) { + contig_mem_free(mcpup->cpu_list, cpu_list_size); + mcpup->cpu_list = NULL; + mcpup->cpu_list_ra = NULL; + } + + /* + * Free sun4v interrupt and error queues. + */ + if (mcpup->cpu_q_va) { + cpu_q_size = cpu_q_entries * INTR_REPORT_SIZE; + contig_mem_free(mcpup->cpu_q_va, cpu_q_size); + mcpup->cpu_q_va = NULL; + mcpup->cpu_q_base_pa = NULL; + mcpup->cpu_q_size = 0; + } + + if (mcpup->dev_q_va) { + dev_q_size = dev_q_entries * INTR_REPORT_SIZE; + contig_mem_free(mcpup->dev_q_va, dev_q_size); + mcpup->dev_q_va = NULL; + mcpup->dev_q_base_pa = NULL; + mcpup->dev_q_size = 0; + } + + if (mcpup->cpu_rq_va) { + cpu_rq_size = cpu_rq_entries * Q_ENTRY_SIZE; + contig_mem_free(mcpup->cpu_rq_va, 2 * cpu_rq_size); + mcpup->cpu_rq_va = NULL; + mcpup->cpu_rq_base_pa = NULL; + mcpup->cpu_rq_size = 0; + } + + if (mcpup->cpu_nrq_va) { + cpu_nrq_size = cpu_nrq_entries * Q_ENTRY_SIZE; + contig_mem_free(mcpup->cpu_nrq_va, 2 * cpu_nrq_size); + mcpup->cpu_nrq_va = NULL; + mcpup->cpu_nrq_base_pa = NULL; + mcpup->cpu_nrq_size = 0; + } } diff --git a/usr/src/uts/sun4v/os/lpad.c b/usr/src/uts/sun4v/os/lpad.c new file mode 100644 index 0000000000..a2c22badde --- /dev/null +++ b/usr/src/uts/sun4v/os/lpad.c @@ -0,0 +1,231 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/machsystm.h> +#include <sys/machparam.h> +#include <sys/cmn_err.h> +#include <sys/cpuvar.h> +#include <sys/note.h> +#include <sys/hypervisor_api.h> +#include <sys/lpad.h> + +typedef struct { + uint64_t inuse; + uint64_t buf[LPAD_SIZE / sizeof (uint64_t)]; +} lpad_t; + +/* + * A global pool of landing pad memory. Currently, CPUs are only + * brought into the system one at a time, so the pool is only a + * single landing pad. In the future, it may be desirable to bring + * CPUs into the systems in parallel. At that time, the size of + * the pool can be increased by changing the pool size constant. 
+ */ +#define LPAD_POOL_SIZE 1 + +static lpad_t lpad_pool[LPAD_POOL_SIZE]; + +#ifdef DEBUG +static int lpad_dbg = 0; + +#define LPAD_DBG if (lpad_dbg) printf +#define LPAD_DUMP_DATA lpad_dump_data + +static void lpad_dump_data(uint64_t *lpd_start, uint64_t *lpd_end); + +#else /* DEBUG */ + +#define LPAD_DBG _NOTE(CONSTCOND) if (0) printf +#define LPAD_DUMP_DATA +#endif /* DEBUG */ + +extern void mach_cpu_startup(uint64_t rabase, uint64_t memsize); +extern void mach_cpu_startup_end(void); +extern int promif_in_cif(void); + +static lpad_t *lpad_alloc(void); + +uint64_t * +lpad_setup(int cpuid, uint64_t pc, uint64_t arg) +{ + lpad_t *lpp; + uint64_t textsz; + uint64_t datasz; + lpad_data_t *lpd; + lpad_map_t *lpm; + + /* external parameters */ + extern caddr_t textva; + extern caddr_t datava; + extern tte_t ktext_tte; + extern tte_t kdata_tte; + extern caddr_t mmu_fault_status_area; + + LPAD_DBG("lpad_setup...\n"); + + if ((cpuid < 0) || (cpuid > NCPU)) { + cmn_err(CE_PANIC, "lpad_setup: invalid cpuid"); + } + + /* allocate our landing pad */ + if ((lpp = lpad_alloc()) == NULL) { + cmn_err(CE_PANIC, "lpad_setup: unable to allocate lpad"); + } + + /* calculate the size of our text */ + textsz = (uint64_t)mach_cpu_startup_end - (uint64_t)mach_cpu_startup; + + LPAD_DBG("lpad textsz=%ld\n", textsz); + + ASSERT(textsz <= LPAD_TEXT_SIZE); + + /* copy over text section */ + bcopy((void *)mach_cpu_startup, lpp->buf, textsz); + + lpd = (lpad_data_t *)(((caddr_t)lpp->buf) + LPAD_TEXT_SIZE); + lpm = (lpad_map_t *)lpd->map; + + ASSERT(mmu_fault_status_area); + + bzero(lpd, LPAD_TEXT_SIZE); + lpd->magic = LPAD_MAGIC_VAL; + lpd->inuse = &(lpp->inuse); + lpd->mmfsa_ra = va_to_pa(mmu_fault_status_area) + (MMFSA_SIZE * cpuid); + lpd->pc = pc; + lpd->arg = arg; + + /* + * List of mappings: + * + * - permanent inst/data mapping for kernel text + * - permanent data mapping for kernel data + * - non-permanent inst mapping for kernel data, + * required for landing pad text + */ + lpd->nmap 
= 3; + + /* verify the lpad has enough room for the data */ + datasz = sizeof (lpad_data_t); + datasz += (lpd->nmap - 1) * sizeof (lpad_map_t); + + ASSERT(datasz <= LPAD_DATA_SIZE); + + /* + * Kernel Text Mapping + */ + lpm->va = (uint64_t)textva; + lpm->tte = ktext_tte; + lpm->flag_mmuflags = (MAP_ITLB | MAP_DTLB); + lpm->flag_perm = 1; + lpm++; + + /* + * Kernel Data Mapping + */ + lpm->va = (uint64_t)datava; + lpm->tte = kdata_tte; + lpm->flag_mmuflags = MAP_DTLB; + lpm->flag_perm = 1; + lpm++; + + /* + * Landing Pad Text Mapping + * + * Because this mapping should not be permanent, + * the permanent mapping above cannot be used. + */ + lpm->va = (uint64_t)datava; + lpm->tte = kdata_tte; + lpm->flag_mmuflags = MAP_ITLB; + lpm->flag_perm = 0; + lpm++; + + ASSERT(((uint64_t)lpm - (uint64_t)lpd) == datasz); + + LPAD_DBG("copied %ld bytes of data into lpad\n", datasz); + + LPAD_DUMP_DATA((uint64_t *)lpd, (uint64_t *)lpm); + + return (lpp->buf); +} + +static lpad_t * +lpad_alloc(void) +{ + int idx; + + /* + * No locking is required for the global lpad pool since + * it should only be accessed while in the CIF which is + * single threaded. If this assumption changes, locking + * would be required. + */ + ASSERT(promif_in_cif()); + + /* + * Wait until an lpad buffer becomes available. 
+ */ + for (;;) { + LPAD_DBG("checking lpad pool:\n"); + + /* walk the lpad buffer array */ + for (idx = 0; idx < LPAD_POOL_SIZE; idx++) { + + LPAD_DBG("\tchecking lpad_pool[%d]\n", idx); + + if (lpad_pool[idx].inuse == 0) { + LPAD_DBG("found empty lpad (%d)\n", idx); + + /* mark the buffer as busy */ + lpad_pool[idx].inuse = 1; + + return (&lpad_pool[idx]); + } + } + } +} + +#ifdef DEBUG +static void +lpad_dump_data(uint64_t *lpd_start, uint64_t *lpd_end) +{ + uint64_t *lp; + uint_t offset = 0; + + if (lpad_dbg == 0) + return; + + printf("lpad data:\n"); + + for (lp = lpd_start; lp < lpd_end; lp++) { + printf("\t0x%02x 0x%016lx\n", offset, *lp); + offset += sizeof (uint64_t); + } +} +#endif /* DEBUG */ diff --git a/usr/src/uts/sun4v/os/mach_cpu_states.c b/usr/src/uts/sun4v/os/mach_cpu_states.c index f43356ac1b..a045ea00e2 100644 --- a/usr/src/uts/sun4v/os/mach_cpu_states.c +++ b/usr/src/uts/sun4v/os/mach_cpu_states.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -49,6 +48,8 @@ #include <sys/dtrace.h> #include <sys/xc_impl.h> #include <sys/callb.h> +#include <sys/mdesc.h> +#include <sys/mach_descrip.h> /* * hvdump_buf_va is a pointer to the currently-configured hvdump_buf. 
@@ -438,6 +439,12 @@ ptl1_panic_handler(ptl1_state_t *pstate) "CPU ECC error loop", /* PTL1_BAD_ECC */ "unexpected error from hypervisor call", /* PTL1_BAD_HCALL */ "unexpected global level(%gl)", /* PTL1_BAD_GL */ + "Watchdog Reset", /* PTL1_BAD_WATCHDOG */ + "unexpected RED mode trap", /* PTL1_BAD_RED */ + "return value EINVAL from hcall: "\ + "UNMAP_PERM_ADDR", /* PTL1_BAD_HCALL_UNMAP_PERM_EINVAL */ + "return value ENOMAP from hcall: "\ + "UNMAP_PERM_ADDR", /* PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP */ }; uint_t reason = pstate->ptl1_regs.ptl1_gregs[0].ptl1_g1; @@ -559,7 +566,45 @@ getintprop(pnode_t node, char *name, int deflt) void cpu_init_tick_freq(void) { - sys_tick_freq = cpunodes[CPU->cpu_id].clock_freq; + md_t *mdp; + mde_cookie_t rootnode; + int listsz; + mde_cookie_t *listp = NULL; + int num_nodes; + uint64_t stick_prop; + + if (broken_md_flag) { + sys_tick_freq = cpunodes[CPU->cpu_id].clock_freq; + return; + } + + if ((mdp = md_get_handle()) == NULL) + panic("stick_frequency property not found in MD"); + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + num_nodes = md_node_count(mdp); + + ASSERT(num_nodes > 0); + listsz = num_nodes * sizeof (mde_cookie_t); + listp = (mde_cookie_t *)prom_alloc((caddr_t)0, listsz, 0); + + if (listp == NULL) + panic("cannot allocate list for MD properties"); + + num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "platform"), + md_find_name(mdp, "fwd"), listp); + + ASSERT(num_nodes == 1); + + if (md_get_prop_val(mdp, *listp, "stick-frequency", &stick_prop) != 0) + panic("stick_frequency property not found in MD"); + + sys_tick_freq = stick_prop; + + prom_free((caddr_t)listp, listsz); + (void) md_fini_handle(mdp); } int shipit(int n, uint64_t cpu_list_ra); diff --git a/usr/src/uts/sun4v/os/mach_descrip.c b/usr/src/uts/sun4v/os/mach_descrip.c index d603a1c06e..fe4b9f3724 100644 --- a/usr/src/uts/sun4v/os/mach_descrip.c +++ b/usr/src/uts/sun4v/os/mach_descrip.c @@ -2,9 +2,8 @@ * CDDL HEADER 
START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,33 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +/* + * Kernel Machine Description (MD) + * + * The Kernel maintains a global copy of the machine description for + * the system. This is for use by all kernel subsystems and is exported + * to user applications through the the 'mdesc' device driver. It is + * initially copied in from the Hypervisor at boot time, but can be + * updated dynamically on demand. The Kernel provides an interface + * for consumers to obtain a handle to the global MD. Consumers of the + * MD must use the specified interfaces. An update interface is provided + * for platform services to intiate an MD update on notification by a + * service entity. + * + * Locks + * The current global MD is protected by the curr_mach_descrip_lock. + * Each Machine description has a lock to synchornize its ref count. + * The Obsolete MD list is protected by the obs_list_lock. 
+ */ + #include <sys/machsystm.h> #include <sys/vm.h> #include <sys/cpu.h> @@ -37,32 +56,93 @@ #include <sys/error.h> #include <sys/hypervisor_api.h> #include <sys/types.h> -#include <sys/kstat.h> -#ifdef MACH_DESC_DEBUG -#include <sys/promif.h> /* for prom_printf */ -#endif #include <sys/sysmacros.h> +#include <sys/mdesc.h> +#include <sys/mdesc_impl.h> #include <sys/mach_descrip.h> +#include <sys/prom_plat.h> +#include <sys/bootconf.h> +#include <sys/promif.h> + + +static void *mach_descrip_strt_meta_alloc(size_t size); +static void mach_descrip_strt_meta_free(void *buf, size_t size); +static void *mach_descrip_strt_buf_alloc(size_t size, size_t align); +static void mach_descrip_strt_buf_free(void *buf, size_t size); +static void *mach_descrip_buf_alloc(size_t size, size_t align); +static void *mach_descrip_meta_alloc(size_t size); +static uint64_t mach_descrip_find_md_gen(caddr_t ptr); +static void init_md_params(void); +static void init_domaining_enabled(md_t *mdp, mde_cookie_t *listp); + +extern struct bootops *bootops; + +/* + * Global ptr of the current generation Machine Description + */ +static machine_descrip_t *curr_mach_descrip; /* - * Basic code to pull in the machine description from the Hypervisor - * An equivalent to this should really be available from mlsetup - * for really early info, but for the time being we are content to - * invoke this from startup_end once the VM system has been initialised. - * To do this we use the intrq allocator which means that - * this function should be called after intrq_init(); - * We try and do this early enough however that it is useful to other - * components within the kernel. - * Also, user-level entities can grab the machine description via - * kstat and/or the mdesc device driver. + * Initialized by machine_descrip_startup_init in startup. + * machine_descript_init will reintialize the structure with + * the vmem allocators once the vmem is available in the boot up + * process. 
*/ +static machine_descrip_memops_t *curr_mach_descrip_memops = NULL; +static machine_descrip_memops_t startup_memops = { + mach_descrip_strt_buf_alloc, + mach_descrip_strt_buf_free, + mach_descrip_strt_meta_alloc, + mach_descrip_strt_meta_free, +}; -machine_descrip_t machine_descrip; +static machine_descrip_memops_t mach_descrip_memops = { + mach_descrip_buf_alloc, + contig_mem_free, + mach_descrip_meta_alloc, + kmem_free, +}; +static kmutex_t curr_mach_descrip_lock; +/* + * List of obsolete Machine Descriptions + * Machine descriptions that have users are put on this list + * and freed after the last user has called md_fini_handle. + */ +static machine_descrip_t *obs_machine_descrip_list; -#ifdef MACH_DESC_DEBUG -#define MDP(ARGS) prom_printf ARGS +static kmutex_t obs_list_lock; + +static const char alloc_fail_msg[] = + "MD: cannot allocate MD buffer of size %ld bytes\n"; + +/* + * Global flag that indicates whether domaining features are + * available. The value is set at boot time based on the value + * of the 'domaining-enabled' property in the MD and the global + * override flag below. Updates to this variable after boot are + * not supported. + */ +uint_t domaining_enabled; + +/* + * Global override for the 'domaining_enabled' flag. If this + * flag is set in /etc/system, domaining features are disabled, + * ignoring the value of the 'domaining-enabled' property in + * the MD. 
+ */ +uint_t force_domaining_disabled; + +#define HAS_GEN(x) (x != MDESC_INVAL_GEN) + +#ifdef DEBUG +static int mach_descrip_debug = 0; + +#define MDP(ARGS) if (mach_descrip_debug) prom_printf ARGS +#define PRINT_LIST() if (mach_descrip_debug) print_obs_list() + +#ifdef MACH_DESC_DEBUG static void dump_buf(uint8_t *bufp, int size) { @@ -75,74 +155,711 @@ dump_buf(uint8_t *bufp, int size) prom_printf("\n"); } } +#endif /* MACH_DESC_DEBUG */ + +static void +print_obs_list(void) +{ + machine_descrip_t *lmdescp; + mutex_enter(&obs_list_lock); + + lmdescp = obs_machine_descrip_list; + prom_printf("MD_obs_list->"); + while (lmdescp != NULL) { + prom_printf("g:%ld,r:%d", lmdescp->gen, lmdescp->refcnt); + + lmdescp = lmdescp->next; + prom_printf("->"); + } + prom_printf("NULL\n"); + mutex_exit(&obs_list_lock); +} + #else -#define MDP(x) -#endif +#define MDP(ARGS) +#define PRINT_LIST() +#endif /* DEBUG */ +/* + * MD obsolete list managment functions + */ +static machine_descrip_t * +md_obs_list_look_up_by_gen(uint64_t gen) +{ + machine_descrip_t *mdescp; + mutex_enter(&obs_list_lock); + mdescp = obs_machine_descrip_list; + while (mdescp != NULL) { + if (mdescp->gen == gen) { + mutex_exit(&obs_list_lock); + return (mdescp); + } + mdescp = mdescp->next; + } + mutex_exit(&obs_list_lock); + return (mdescp); +} -void -mach_descrip_init(void) +static void +md_obs_list_remove(machine_descrip_t *mdescp) +{ + machine_descrip_t *lmdescp; + + mutex_enter(&obs_list_lock); + + lmdescp = obs_machine_descrip_list; + + if (obs_machine_descrip_list == mdescp) { + obs_machine_descrip_list = mdescp->next; + } else { + while (lmdescp != NULL) { + if (lmdescp->next == mdescp) { + lmdescp->next = mdescp->next; + mdescp->next = NULL; + break; + } + lmdescp = lmdescp->next; + } + } + mutex_exit(&obs_list_lock); + PRINT_LIST(); +} + +static void +md_obs_list_add(machine_descrip_t *mdescp) +{ + mutex_enter(&obs_list_lock); + + mdescp->next = obs_machine_descrip_list; + obs_machine_descrip_list = 
mdescp; + + mutex_exit(&obs_list_lock); + PRINT_LIST(); +} + +/* + * Allocate a machine_descrip meta structure and intitialize it. + */ +static machine_descrip_t * +new_mach_descrip(void) +{ + machine_descrip_t *mdescp; + + mdescp = (machine_descrip_t *)(*curr_mach_descrip_memops->meta_allocp) + (sizeof (machine_descrip_t)); + if (mdescp != NULL) { + bzero(mdescp, sizeof (*mdescp)); + mdescp->memops = curr_mach_descrip_memops; + mutex_init(&mdescp->lock, NULL, MUTEX_DRIVER, NULL); + } + + return (mdescp); +} + +/* + * Free a machine_descrip meta structure and intitialize it. + * Also free the MD buffer. + */ +static void +destroy_machine_descrip(machine_descrip_t *mdescp) { - uint64_t md_size, ret; + machine_descrip_memops_t *mdesc_memopsp; + + ASSERT((mdescp != NULL)); + + mdesc_memopsp = mdescp->memops; + if (mdescp->memops == NULL) + panic("destroy_machine_descrip: memops NULL\n"); + + (*mdesc_memopsp->buf_freep)(mdescp->va, mdescp->space); + mutex_destroy(&mdescp->lock); + (*mdesc_memopsp->meta_freep)(mdescp, sizeof (*mdescp)); +} + +/* + * Call into the Hypervisor to retrieve the most recent copy of the + * machine description. If references to the current MD are active + * stow it in the obsolete MD list and update the current MD reference + * with the new one. + * The obsolete list contains one MD per generation. If the firmware + * doesn't support MD generation fail the call. 
+ */ +int +mach_descrip_update(void) +{ + uint64_t md_size0, md_size; + uint64_t md_space = 0; + uint64_t hvret; + caddr_t tbuf = NULL; + uint64_t tbuf_pa; + uint64_t tgen; + int ret = 0; MDP(("MD: Requesting buffer size\n")); - md_size = 0LL; - (void) hv_mach_desc((uint64_t)0, &md_size); - MDP(("MD: buffer size is %d\n", md_size)); + ASSERT((curr_mach_descrip != NULL)); + + mutex_enter(&curr_mach_descrip_lock); /* - * Align allocated space to nearest page contig_mem_alloc_align - * requires a Power of 2 alignment + * If the required MD size changes between our first call + * to hv_mach_desc (to find the required buf size) and the + * second call (to get the actual MD), the MD was in the + * process of being updated. Loop until the two sizes are + * identical. */ - machine_descrip.space = P2ROUNDUP(md_size, PAGESIZE); - MDP(("MD: allocated space is %d\n", machine_descrip.space)); - machine_descrip.va = contig_mem_alloc_align(machine_descrip.space, - PAGESIZE); - if (machine_descrip.va == NULL) - cmn_err(CE_PANIC, "Allocation for machine description failed"); + do { + if (tbuf != NULL) + (*curr_mach_descrip_memops->buf_freep)(tbuf, md_space); - MDP(("MD: allocated va = 0x%p (size 0x%llx)\n", - machine_descrip.va, machine_descrip.space)); + md_size0 = 0LL; + (void) hv_mach_desc((uint64_t)0, &md_size0); + MDP(("MD: buffer size is %ld\n", md_size0)); - machine_descrip.pa = va_to_pa(machine_descrip.va); + /* + * Align allocated space to nearest page. + * contig_mem_alloc_align() requires a power of 2 alignment. 
+ */ + md_space = P2ROUNDUP(md_size0, PAGESIZE); + MDP(("MD: allocated space is %ld\n", md_space)); - MDP(("MD: allocated pa = 0x%llx\n", machine_descrip.pa)); + tbuf = (caddr_t)(*curr_mach_descrip_memops->buf_allocp) + (md_space, PAGESIZE); + if (tbuf == NULL) { + ret = -1; + goto done; + } - ret = hv_mach_desc(machine_descrip.pa, &md_size); - MDP(("MD: HV return code = %ld\n", ret)); + tbuf_pa = va_to_pa(tbuf); + hvret = hv_mach_desc(tbuf_pa, &md_size); + MDP(("MD: HV return code = %ld\n", hvret)); - if (ret != H_EOK) { - MDP(("MD: Failed with code %ld from HV\n", ret)); + /* + * We get H_EINVAL if our buffer size is too small. In + * that case stay in the loop, reallocate the buffer + * and try again. + */ + if (hvret != H_EOK && hvret != H_EINVAL) { + MDP(("MD: Failed with code %ld from HV\n", hvret)); + ret = -1; + goto done; + } - machine_descrip.size = 0; + } while (md_size0 != md_size || hvret == H_EINVAL); - } else { - MDP(("MD: Grabbed %d bytes from HV\n", md_size)); -#ifdef MACH_DESC_DEBUG - dump_buf((uint8_t *)machine_descrip.va, md_size); -#endif /* MACH_DESC_DEBUG */ + tgen = mach_descrip_find_md_gen(tbuf); + +#ifdef DEBUG + if (!HAS_GEN(tgen)) { + MDP(("MD: generation number not found\n")); + } else + MDP(("MD: generation number %ld\n", tgen)); +#endif /* DEBUG */ - machine_descrip.size = md_size; + if (curr_mach_descrip->va != NULL) { + /* check for the same generation number */ + if (HAS_GEN(tgen) && ((curr_mach_descrip->gen == tgen) && + (curr_mach_descrip->size == md_size))) { +#ifdef DEBUG /* - * Allocate the kstat to get at the data + * Pedantic Check for generation number. If the + * generation number is the same, make sure the + * MDs are really identical. 
*/ - machine_descrip.ksp = kstat_create("unix", 0, "machdesc", - "misc", - KSTAT_TYPE_RAW, - (uint_t)machine_descrip.size, - KSTAT_FLAG_VIRTUAL); - - if (machine_descrip.ksp == NULL) { - cmn_err(CE_PANIC, - "Failed to create kstat for machine description"); + if (bcmp(curr_mach_descrip->va, tbuf, md_size) != 0) { + cmn_err(CE_WARN, "machine_descrip_update: MDs " + "with the same generation (%ld) are not " + "identical", tgen); + ret = -1; + goto done; + } +#endif + cmn_err(CE_WARN, "machine_descrip_update: new MD has " + "the same generation (%ld) as the old MD", tgen); + ret = 0; + goto done; + } + + /* check for generations moving backwards */ + if (HAS_GEN(tgen) && HAS_GEN(curr_mach_descrip->gen) && + (curr_mach_descrip->gen > tgen)) { + cmn_err(CE_WARN, "machine_descrip_update: new MD" + " older generation (%ld) than current MD (%ld)", + tgen, curr_mach_descrip->gen); + ret = -1; + goto done; + } + + if (curr_mach_descrip->refcnt == 0) { + + MDP(("MD: freeing old md buffer gen %ld\n", + curr_mach_descrip->gen)); + + /* Free old space */ + ASSERT(curr_mach_descrip->space > 0); + + (*curr_mach_descrip_memops->buf_freep) + (curr_mach_descrip->va, curr_mach_descrip->space); } else { - machine_descrip.ksp->ks_data = machine_descrip.va; - kstat_install(machine_descrip.ksp); + if (!HAS_GEN(tgen)) { + /* + * No update support if FW + * doesn't have MD generation id + * feature. 
+ */ + prom_printf("WARNING: F/W does not support MD " + "generation count, MD update failed\n"); + ret = -1; + goto done; + } + + MDP(("MD: adding to obs list %ld\n", + curr_mach_descrip->gen)); + + md_obs_list_add(curr_mach_descrip); + + curr_mach_descrip = new_mach_descrip(); + + if (curr_mach_descrip == NULL) { + panic("Allocation for machine description" + " failed\n"); + } + } + } + + curr_mach_descrip->va = tbuf; + curr_mach_descrip->gen = tgen; + curr_mach_descrip->size = md_size; + curr_mach_descrip->space = md_space; + +#ifdef MACH_DESC_DEBUG + dump_buf((uint8_t *)curr_mach_descrip->va, md_size); +#endif /* MACH_DESC_DEBUG */ + + mutex_exit(&curr_mach_descrip_lock); + return (ret); + +done: + if (tbuf != NULL) + (*curr_mach_descrip_memops->buf_freep)(tbuf, md_space); + mutex_exit(&curr_mach_descrip_lock); + return (ret); +} + +static void * +mach_descrip_buf_alloc(size_t size, size_t align) +{ + void *p; + + if ((p = contig_mem_alloc_align(size, align)) == NULL) + cmn_err(CE_WARN, alloc_fail_msg, size); + + return (p); +} + +static void * +mach_descrip_strt_meta_alloc(size_t size) +{ + return (BOP_ALLOC(bootops, (caddr_t)0, size, PAGESIZE)); +} + +static void +mach_descrip_strt_meta_free(void *buf, size_t size) +{ + BOP_FREE(bootops, buf, size); +} + +static void * +mach_descrip_strt_buf_alloc(size_t size, size_t align) +{ + void *p = prom_alloc((caddr_t)0, size, align); + + if (p == NULL) + prom_printf(alloc_fail_msg, size); + + return (p); +} + +static void +mach_descrip_strt_buf_free(void *buf, size_t size) +{ + prom_free((caddr_t)buf, size); +} + +static void * +mach_descrip_meta_alloc(size_t size) +{ + return (kmem_alloc(size, KM_SLEEP)); +} + +/* + * Initialize the kernel's Machine Description(MD) framework + * early on in startup during mlsetup() so consumers + * can get to the MD before the VM system has been initialized. + * + * Also get the most recent version of the MD. 
+ */ +void +mach_descrip_startup_init(void) +{ + + mutex_init(&curr_mach_descrip_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&obs_list_lock, NULL, MUTEX_DRIVER, NULL); + + obs_machine_descrip_list = NULL; + + curr_mach_descrip_memops = &startup_memops; + + curr_mach_descrip = new_mach_descrip(); + if (curr_mach_descrip == NULL) + panic("Allocation for machine description failed\n"); + + if (mach_descrip_update()) + panic("Machine description initialization failed\n"); + +} + +/* + * Counterpart to the above init function. Free up resources + * allocated at startup by mach_descrip_startup_setup(). + * And reset machine description framework state. + * + * All consumers must have fini'ed their handles at this point. + */ +void +mach_descrip_startup_fini(void) +{ + + ASSERT((curr_mach_descrip != NULL)); + ASSERT((curr_mach_descrip->refcnt == 0)); + ASSERT((obs_machine_descrip_list == NULL)); + + destroy_machine_descrip(curr_mach_descrip); + curr_mach_descrip = NULL; + curr_mach_descrip_memops = NULL; +} + +/* + * Initialize the kernel's Machine Description(MD) framework + * after the the VM system has been initialized. + * + * Also get the most recent version of the MD. + * Assumes that the machine description frame work is in a clean + * state and the machine description intialized during startup + * has been cleaned up and resources deallocated. + */ +void +mach_descrip_init(void) +{ + ASSERT((curr_mach_descrip == NULL && + curr_mach_descrip_memops == NULL)); + + curr_mach_descrip_memops = &mach_descrip_memops; + + curr_mach_descrip = new_mach_descrip(); + if (curr_mach_descrip == NULL) + panic("Allocation for machine description failed\n"); + + if (mach_descrip_update()) + panic("Machine description intialization failed\n"); + + /* read in global params */ + init_md_params(); +} + +/* + * Client interface to get a handle to the current MD. + * The md_fini_handle() interface should be used to + * clean up the refernce to the MD returned by this function. 
+ */ +md_t * +md_get_handle(void) +{ + md_t *mdp; + + mutex_enter(&curr_mach_descrip_lock); + + if (curr_mach_descrip == NULL) { + return (NULL); + } + + curr_mach_descrip->refcnt++; + mdp = md_init_intern(curr_mach_descrip->va, + curr_mach_descrip->memops->meta_allocp, + curr_mach_descrip->memops->meta_freep); + + mutex_exit(&curr_mach_descrip_lock); + + return (mdp); +} + +/* + * Client interface to clean up the refernce to the MD returned + * by md_get_handle(). + */ +int +md_fini_handle(md_t *ptr) +{ + machine_descrip_t *mdescp; + md_impl_t *mdp; + + + mdp = (md_impl_t *)ptr; + + if (mdp == NULL) + return (-1); + /* + * Check if mdp is current MD gen + */ + mutex_enter(&curr_mach_descrip_lock); + + if (curr_mach_descrip->gen == mdp->gen) { + curr_mach_descrip->refcnt--; + mutex_exit(&curr_mach_descrip_lock); + goto fini; + } + mutex_exit(&curr_mach_descrip_lock); + + /* + * MD is in the obsolete list + */ + mdescp = md_obs_list_look_up_by_gen(mdp->gen); + if (mdescp == NULL) + return (-1); + + mutex_enter(&mdescp->lock); + mdescp->refcnt--; + if (mdescp->refcnt == 0) { + md_obs_list_remove(mdescp); + mutex_exit(&mdescp->lock); + destroy_machine_descrip(mdescp); + goto fini; + } + mutex_exit(&mdescp->lock); + +fini: + return (md_fini(ptr)); +} + +/* + * General purpose initialization function used to extract parameters + * from the MD during the boot process. This is called immediately after + * the in kernel copy of the MD has been initialized so that global + * flags are available to various subsystems as they get initialized. + */ +static void +init_md_params(void) +{ + md_t *mdp; + int num_nodes; + mde_cookie_t *listp; + int listsz; + + mdp = md_get_handle(); + ASSERT(mdp); + num_nodes = md_node_count(mdp); + ASSERT(num_nodes >= 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + listp = (mde_cookie_t *) + (*curr_mach_descrip_memops->meta_allocp)(listsz); + + /* + * Import various parameters from the MD. 
For now, + * the only parameter of interest is whether or not + * domaining features are supported. + */ + init_domaining_enabled(mdp, listp); + + (*curr_mach_descrip_memops->meta_freep)(listp, listsz); + (void) md_fini_handle(mdp); +} + +static void +init_domaining_enabled(md_t *mdp, mde_cookie_t *listp) +{ + mde_cookie_t rootnode; + int num_nodes; + uint64_t val = 0; + + /* + * If domaining has been manually disabled, always + * honor that and ignore the value in the MD. + */ + if (force_domaining_disabled) { + domaining_enabled = 0; + MDP(("domaining manually disabled\n")); + return; + } + + rootnode = md_root_node(mdp); + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "platform"), + md_find_name(mdp, "fwd"), listp); + + /* should only be one platform node */ + ASSERT(num_nodes == 1); + + if (md_get_prop_val(mdp, *listp, "domaining-enabled", &val) != 0) { + /* + * The property is not present. This implies + * that the firmware does not support domaining + * features. + */ + MDP(("'domaining-enabled' property not present\n")); + + domaining_enabled = 0; + return; + } + + domaining_enabled = val; + + MDP(("domaining_enabled = 0x%x\n", domaining_enabled)); +} + +/* + * Client interface to get a pointer to the raw MD buffer + * Private to kernel and mdesc driver. + */ +caddr_t +md_get_md_raw(md_t *ptr) +{ + md_impl_t *mdp; + + mdp = (md_impl_t *)ptr; + if (mdp == NULL) + return (NULL); + return (mdp->caddr); +} + +/* + * This is called before an MD structure is intialized, so + * it walks the raw MD looking for the generation property. 
+ */ +static uint64_t +mach_descrip_find_md_gen(caddr_t ptr) +{ + md_header_t *hdrp; + md_element_t *mdep; + md_element_t *rootnode = NULL; + md_element_t *elem = NULL; + char *namep; + boolean_t done; + int idx; + + hdrp = (md_header_t *)ptr; + mdep = (md_element_t *)(ptr + MD_HEADER_SIZE); + namep = (char *)(ptr + MD_HEADER_SIZE + hdrp->node_blk_sz); + + /* + * Very basic check for alignment to avoid + * bus error issues. + */ + if ((((uint64_t)ptr) & 7) != 0) + return (MDESC_INVAL_GEN); + + if (mdtoh32(hdrp->transport_version) != MD_TRANSPORT_VERSION) { + return (MDESC_INVAL_GEN); + } + + /* + * Search for the root node. Perform the walk manually + * since the MD structure is not set up yet. + */ + for (idx = 0, done = B_FALSE; done == B_FALSE; ) { + + md_element_t *np = &(mdep[idx]); + + switch (MDE_TAG(np)) { + case MDET_LIST_END: + done = B_TRUE; + break; + + case MDET_NODE: + if (strcmp(namep + MDE_NAME(np), "root") == 0) { + /* found root node */ + rootnode = np; + done = B_TRUE; + break; + } + idx = MDE_PROP_INDEX(np); + break; + + default: + /* ignore */ + idx++; } } + + if (rootnode == NULL) { + /* root not found */ + return (MDESC_INVAL_GEN); + } + + /* search the rootnode for the generation property */ + for (elem = (rootnode + 1); MDE_TAG(elem) != MDET_NODE_END; elem++) { + + char *prop_name; + + /* generation field is a prop_val */ + if (MDE_TAG(elem) != MDET_PROP_VAL) + continue; + + prop_name = namep + MDE_NAME(elem); + + if (strcmp(prop_name, "md-generation#") == 0) { + return (MDE_PROP_VALUE(elem)); + } + } + + return (MDESC_INVAL_GEN); +} + +/* + * Failed to allocate the list : Return value -1 + * md_scan_dag API failed : Return the result from md_scan_dag API + */ +int +md_alloc_scan_dag(md_t *ptr, + mde_cookie_t startnode, + char *node_name, + char *dag, + mde_cookie_t **list) +{ + int res; + md_impl_t *mdp = (md_impl_t *)ptr; + + *list = (mde_cookie_t *)mdp->allocp(sizeof (mde_cookie_t) * + mdp->node_count); + if (*list == NULL) + return 
(-1); + + res = md_scan_dag(ptr, startnode, + md_find_name(ptr, node_name), + md_find_name(ptr, dag), *list); + + /* + * If md_scan_dag API returned 0 or -1 then free the buffer + * and return -1 to indicate the error from this API. + */ + if (res < 1) { + md_free_scan_dag(ptr, list); + *list = NULL; + } + + return (res); +} + +void +md_free_scan_dag(md_t *ptr, + mde_cookie_t **list) +{ + md_impl_t *mdp = (md_impl_t *)ptr; + + mdp->freep(*list, sizeof (mde_cookie_t) * mdp->node_count); } diff --git a/usr/src/uts/sun4v/os/mach_mp_startup.c b/usr/src/uts/sun4v/os/mach_mp_startup.c index 25a37ecdf4..421546277a 100644 --- a/usr/src/uts/sun4v/os/mach_mp_startup.c +++ b/usr/src/uts/sun4v/os/mach_mp_startup.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,8 @@ #include <sys/cpu_module.h> #include <sys/dtrace.h> #include <sys/cpu_sgnblk_defs.h> +#include <sys/mdesc.h> +#include <sys/mach_descrip.h> /* * Useful for disabling MP bring-up for an MP capable kernel @@ -87,25 +89,67 @@ init_cpu_info(struct cpu *cp) } } -/* ARGSUSED */ /* - * Routine used to cleanup a CPU that has been powered off. This will + * Routine used to cleanup a CPU that has been powered off. This will * destroy all per-cpu information related to this cpu. 
*/ int mp_cpu_unconfigure(int cpuid) { - return (0); + int retval; + extern void empty_cpu(int); + extern int cleanup_cpu_common(int); + + ASSERT(MUTEX_HELD(&cpu_lock)); + + retval = cleanup_cpu_common(cpuid); + + empty_cpu(cpuid); + + return (retval); } -/* ARGSUSED */ +struct mp_find_cpu_arg { + int cpuid; /* set by mp_cpu_configure() */ + dev_info_t *dip; /* set by mp_find_cpu() */ +}; + int mp_find_cpu(dev_info_t *dip, void *arg) { - return (0); + struct mp_find_cpu_arg *target = (struct mp_find_cpu_arg *)arg; + char *type; + int rv = DDI_WALK_CONTINUE; + int cpuid; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "device_type", &type)) + return (DDI_WALK_CONTINUE); + + if (strcmp(type, "cpu") != 0) + goto out; + + cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "reg", -1); + + if (cpuid == -1) { + cmn_err(CE_PANIC, "reg prop not found in cpu node"); + } + + cpuid = PROM_CFGHDL_TO_CPUID(cpuid); + + if (cpuid != target->cpuid) + goto out; + + /* Found it */ + rv = DDI_WALK_TERMINATE; + target->dip = dip; + +out: + ddi_prop_free(type); + return (rv); } -/* ARGSUSED */ /* * Routine used to setup a newly inserted CPU in preparation for starting * it running code. 
@@ -113,5 +157,68 @@ mp_find_cpu(dev_info_t *dip, void *arg) int mp_cpu_configure(int cpuid) { + extern void fill_cpu(md_t *, mde_cookie_t); + extern void setup_cpu_common(int); + extern void setup_exec_unit_mappings(md_t *); + md_t *mdp; + mde_cookie_t rootnode, cpunode = MDE_INVAL_ELEM_COOKIE; + int listsz, i; + mde_cookie_t *listp = NULL; + int num_nodes; + uint64_t cpuid_prop; + + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if ((mdp = md_get_handle()) == NULL) + return (ENODEV); + + rootnode = md_root_node(mdp); + + ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); + + num_nodes = md_node_count(mdp); + + ASSERT(num_nodes > 0); + + listsz = num_nodes * sizeof (mde_cookie_t); + listp = kmem_zalloc(listsz, KM_SLEEP); + + num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"), + md_find_name(mdp, "fwd"), listp); + + if (num_nodes < 0) + return (ENODEV); + + for (i = 0; i < num_nodes; i++) { + if (md_get_prop_val(mdp, listp[i], "id", &cpuid_prop)) + break; + if (cpuid_prop == (uint64_t)cpuid) { + cpunode = listp[i]; + break; + } + } + + if (cpunode == MDE_INVAL_ELEM_COOKIE) + return (ENODEV); + + kmem_free(listp, listsz); + + /* + * Note: uses cpu_lock to protect cpunodes and ncpunodes + * which will be modified inside of fill_cpu and + * setup_exec_unit_mappings. + */ + fill_cpu(mdp, cpunode); + + /* + * Remap all the cpunodes' execunit mappings. + */ + setup_exec_unit_mappings(mdp); + + (void) md_fini_handle(mdp); + + setup_cpu_common(cpuid); + return (0); } diff --git a/usr/src/uts/sun4v/os/mach_mp_states.c b/usr/src/uts/sun4v/os/mach_mp_states.c index e10feb6f48..d680cec27e 100644 --- a/usr/src/uts/sun4v/os/mach_mp_states.c +++ b/usr/src/uts/sun4v/os/mach_mp_states.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,14 +18,26 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/cpuvar.h> #include <sys/cpu_module.h> +#include <sys/machsystm.h> +#include <sys/archsystm.h> +#include <sys/prom_plat.h> +#include <sys/hypervisor_api.h> +#include <sys/hsvc.h> + +extern uint64_t xc_tick_limit; +extern uint64_t xc_tick_jump_limit; + +extern void cpu_intrq_unregister_powerdown(uint64_t doneflag_va); /* * set_idle_cpu is called from idle() when a CPU becomes idle. @@ -45,3 +56,224 @@ void unset_idle_cpu(int cpun) { } + +/* + * Stop a CPU based on its cpuid, using the cpu_stop hypervisor call. + * Since this requires that the hypervisor force a remote CPU to stop, + * the assumption is made that this should take roughly the same amount + * of time as a CPU mondo. Consequently, the mondo timeout is used to + * determine when to give up waiting for the CPU to stop. + * + * Attempts to stop a CPU already in the stopped or error state will + * silently succeed. Zero is returned on success and a non-negative + * errno value is returned on failure. + */ +int +stopcpu_bycpuid(int cpuid) +{ + uint64_t loop_cnt; + uint64_t state; + uint64_t rv; + uint64_t major = 0; + uint64_t minor = 0; + uint64_t cpu_stop_time_limit; + extern uint64_t xc_mondo_time_limit; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Check the state of the CPU up front to see if an + * attempt to stop it is even necessary. 
+ */ + if (hv_cpu_state(cpuid, &state) != H_EOK) + return (EINVAL); + + /* treat stopped and error state the same */ + if (state != CPU_STATE_RUNNING) { + /* nothing to do */ + return (0); + } + + /* + * The HV API to stop a CPU is only supported in + * version 1.1 and later of the core group. If an + * older version of the HV is in use, return not + * supported. + */ + if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0) + return (EINVAL); + + ASSERT(major != 0); + + if ((major == 1) && (minor < 1)) + return (ENOTSUP); + + /* use the mondo timeout if it has been initialized */ + cpu_stop_time_limit = xc_mondo_time_limit; + + /* + * If called early in boot before the mondo time limit + * is set, use a reasonable timeout based on the the + * clock frequency of the current CPU. + */ + if (cpu_stop_time_limit == 0) + cpu_stop_time_limit = cpunodes[CPU->cpu_id].clock_freq; + + /* should only fail if called too early in boot */ + ASSERT(cpu_stop_time_limit > 0); + + loop_cnt = 0; + + /* + * Attempt to stop the CPU, retrying if it is busy. + */ + while (loop_cnt++ < cpu_stop_time_limit) { + + if ((rv = hv_cpu_stop(cpuid)) != H_EWOULDBLOCK) + break; + } + + if (loop_cnt == cpu_stop_time_limit) + return (ETIMEDOUT); + + if (rv != H_EOK) + return (EINVAL); + + /* + * Verify that the CPU has reached the stopped state. + */ + while (loop_cnt++ < cpu_stop_time_limit) { + + if (hv_cpu_state(cpuid, &state) != H_EOK) + return (EINVAL); + + /* treat stopped and error state the same */ + if (state != CPU_STATE_RUNNING) + break; + } + + return ((loop_cnt == cpu_stop_time_limit) ? ETIMEDOUT : 0); +} + +/* + * X-trap to the target to unregister its interrupt and error queues + * and put it in a safe place just before the CPU is stopped. After + * unregistering its queues, the target CPU must not return from the + * trap to priv or user context. Ensure that the interrupt CPU unregister + * succeeded. 
+ */ +void +xt_cpu_unreg_powerdown(struct cpu *cpup) +{ + uint8_t volatile not_done; + uint64_t starttick, endtick, tick, lasttick; + processorid_t cpuid = cpup->cpu_id; + + kpreempt_disable(); + + /* + * Sun4v uses a queue for receiving mondos. Successful + * transmission of a mondo only indicates that the mondo + * has been written into the queue. + * + * Set the not_done flag to 1 before sending the cross + * trap and wait until the other cpu resets it to 0. + */ + + not_done = 1; + + xt_one_unchecked(cpuid, (xcfunc_t *)cpu_intrq_unregister_powerdown, + (uint64_t)¬_done, 0); + + starttick = lasttick = gettick(); + endtick = starttick + xc_tick_limit; + + while (not_done) { + + tick = gettick(); + + /* + * If there is a big jump between the current tick + * count and lasttick, we have probably hit a break + * point. Adjust endtick accordingly to avoid panic. + */ + if (tick > (lasttick + xc_tick_jump_limit)) { + endtick += (tick - lasttick); + } + + lasttick = tick; + if (tick > endtick) { + cmn_err(CE_CONT, "Cross trap timeout at cpu id %x\n", + cpuid); + cmn_err(CE_WARN, "xt_intrq_unreg_powerdown: timeout"); + } + } + + kpreempt_enable(); +} + +int +plat_cpu_poweroff(struct cpu *cp) +{ + int rv = 0; + int status; + processorid_t cpuid = cp->cpu_id; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + /* + * Capture all CPUs (except for detaching proc) to prevent + * crosscalls to the detaching proc until it has cleared its + * bit in cpu_ready_set. + * + * The CPU's remain paused and the prom_mutex is known to be free. + * This prevents the x-trap victim from blocking when doing prom + * IEEE-1275 calls at a high PIL level. + */ + promsafe_pause_cpus(); + + /* + * Quiesce interrupts on the target CPU. We do this by setting + * the CPU 'not ready'- (i.e. removing the CPU from cpu_ready_set) + * to prevent it from receiving cross calls and cross traps. This + * prevents the processor from receiving any new soft interrupts. 
+ */ + mp_cpu_quiesce(cp); + + /* + * Send a cross trap to the cpu to unregister its interrupt + * error queues. + */ + xt_cpu_unreg_powerdown(cp); + + cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF; + + /* call into the Hypervisor to stop the CPU */ + if ((status = stopcpu_bycpuid(cpuid)) != 0) { + rv = -1; + } + + start_cpus(); + + if (rv != 0) { + cmn_err(CE_WARN, "failed to stop cpu %d (%d)", cpuid, status); + /* mark the CPU faulted so that it cannot be onlined */ + cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_FAULTED; + } + + return (rv); +} + +int +plat_cpu_poweron(struct cpu *cp) +{ + extern void restart_other_cpu(int); + + ASSERT(MUTEX_HELD(&cpu_lock)); + + cp->cpu_flags &= ~CPU_POWEROFF; + + restart_other_cpu(cp->cpu_id); + + return (0); +} diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c index 44d199782b..e2e4c5857b 100644 --- a/usr/src/uts/sun4v/os/mach_startup.c +++ b/usr/src/uts/sun4v/os/mach_startup.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -36,6 +37,8 @@ #include <sys/disp.h> #include <sys/hypervisor_api.h> #include <sys/traptrace.h> +#include <sys/modctl.h> +#include <sys/ldoms.h> #ifdef TRAPTRACE int mach_htraptrace_enable = 1; @@ -61,7 +64,7 @@ setup_trap_table(void) mmfsa_va = mmu_fault_status_area + (MMFSA_SIZE * CPU->cpu_id); - intr_init(CPU); /* init interrupt request free list */ + intr_init(CPU); /* init interrupt request free list */ setwstate(WSTATE_KERN); set_mmfsa_scratchpad(mmfsa_va); prom_set_mmfsa_traptable(&trap_table, va_to_pa(mmfsa_va)); @@ -427,3 +430,54 @@ mach_htraptrace_cleanup(int cpuid) ctlp->d.hpaddr_base = NULL; } } + +/* + * Load any required machine class (sun4v) specific drivers. + */ +void +load_mach_drivers(void) +{ + /* + * We don't want to load these LDOMs-specific + * modules if domaining has been disabled. 
Also, + * we must be able to run on non-LDOMs firmware. + */ + if (!domaining_enabled) + return; + + /* + * Load the core domain services module + */ + if (modload("misc", "ds") == -1) + cmn_err(CE_NOTE, "!'ds' module failed to load"); + + /* + * Load the rest of the domain services + */ + if (modload("misc", "fault_iso") == -1) + cmn_err(CE_NOTE, "!'fault_iso' module failed to load"); + + if (modload("misc", "platsvc") == -1) + cmn_err(CE_NOTE, "!'platsvc' module failed to load"); + + if (modload("misc", "dr_cpu") == -1) + cmn_err(CE_NOTE, "!'dr_cpu' module failed to load"); + + /* + * Attempt to attach any virtual device servers. These + * drivers must be loaded at start of day so that they + * can respond to any updates to the machine description. + * + * Since it is quite likely that a domain will not support + * one or more of these servers, failures are ignored. + */ + + /* virtual disk server */ + (void) i_ddi_attach_hw_nodes("vds"); + + /* virtual network switch */ + (void) i_ddi_attach_hw_nodes("vsw"); + + /* virtual console concentrator */ + (void) i_ddi_attach_hw_nodes("vcc"); +} diff --git a/usr/src/uts/sun4v/platsvc/Makefile b/usr/src/uts/sun4v/platsvc/Makefile new file mode 100644 index 0000000000..f7e729c43e --- /dev/null +++ b/usr/src/uts/sun4v/platsvc/Makefile @@ -0,0 +1,97 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the platsvc kernel module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = platsvc +OBJECTS = $(PLATSVC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(PLATSVC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += -v + +# +# Turn on doubleword alignment for 64 bit registers +# +CFLAGS += -dalign + +# +# Module Dependencies +# +LDFLAGS += -dy -Nmisc/ds + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/promif/promif_asr.c b/usr/src/uts/sun4v/promif/promif_asr.c new file mode 100644 index 0000000000..7bc20982f0 --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_asr.c @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> + +/* + * The Automatic System Recovery (ASR) database present in some + * versions of firmware is not supported on sun4v platforms. + * However, there is an external interface to these prom interfaces + * from the openprom(7D) driver. They are not documented in the + * man page, but they should still be handled here, just enough + * so the user gets a sensible error back if they stumble onto + * them. + */ + +int +promif_asr_list_keys_len(void *p) +{ + cell_t *ci = (cell_t *)p; + + ci[3] = p1275_int2cell(-1); + + return (-1); +} + +int +promif_asr_list_keys(void *p) +{ + _NOTE(ARGUNUSED(p)) + + return (-1); +} + +int +promif_asr_export_len(void *p) +{ + cell_t *ci = (cell_t *)p; + + ci[3] = p1275_int2cell(-1); + + return (-1); +} + +int +promif_asr_export(void *p) +{ + _NOTE(ARGUNUSED(p)) + + return (-1); +} diff --git a/usr/src/uts/sun4v/promif/promif_cpu.c b/usr/src/uts/sun4v/promif/promif_cpu.c new file mode 100644 index 0000000000..fdeaa656ab --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_cpu.c @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/promif_impl.h>
#include <sys/machsystm.h>
#include <sys/hypervisor_api.h>
#include <sys/lpad.h>

/* CIF dispatch state shared with promif_emul.c */
extern int (*prom_cif_handler)(void *);
extern int cif_cpu_mp_ready;

/*
 * CIF handler for SUNW,set-trap-table: install the kernel trap table
 * and MMU fault status area (MMFSA) for the calling CPU via the
 * hypervisor.  Two arguments come in through the 1275 cell array: the
 * trap table virtual address (ci[3]) and the MMFSA real address
 * (ci[4]).  Returns 0 on success, -1 if the trap table address is not
 * KERNELBASE, and panics if the hypervisor rejects the request.
 */
int
promif_set_mmfsa_traptable(void *p)
{
	cell_t *ci = (cell_t *)p;
	uint64_t rtba;
	caddr_t tba;
	uint64_t mmfsa_ra;
	int rv, ret;

	ASSERT(ci[1] == 2);	/* expects exactly two input arguments */

	/*
	 * We use the same trap table for the rtba as well.
	 */
	rtba = va_to_pa(p1275_cell2ptr(ci[3]));

	/*
	 * if cif_cpu_mp_ready is not set the prom is still
	 * setting the mmfsa and trap table. Set the rtba
	 * after the prom cif call.
	 */
	if (!cif_cpu_mp_ready) {
		ret = (*prom_cif_handler)(p);
		if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK)
			panic("hv_cpu_set_rtba failed: %d\n", rv);
		return (ret);
	}

	tba = p1275_cell2ptr(ci[3]);
	mmfsa_ra = (uint64_t)p1275_cell2ptr(ci[4]);

	/* only the kernel's own trap table is supported */
	if (tba != (caddr_t)KERNELBASE)
		return (-1);

	(void) set_tba(tba);

	if ((rv = hv_mmu_fault_area_conf(&mmfsa_ra)) != H_EOK) {
		panic("hv_mmu_fault_area_conf failed: %d\n", rv);
	}

	if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK) {
		panic("hv_cpu_set_rtba failed: %d\n", rv);
	}

	return (0);
}

/*
 * CIF handler for SUNW,start-cpu-by-cpuid: start the CPU named by
 * ci[3] at entry point ci[4] with argument ci[5].  Until all CPUs
 * have entered the OS (cif_cpu_mp_ready clear) the request is simply
 * forwarded to the PROM.  Afterwards the new CPU is routed through a
 * landing pad (lpad_setup) and started with hv_cpu_start(); the
 * hypervisor status is returned to the caller in ci[6].  Panics if
 * the hypervisor refuses to start the CPU.
 */
int
promif_start_cpu(void *p)
{
	cell_t *ci = (cell_t *)p;
	int cpuid;
	caddr_t pc;
	int arg;
	uint64_t rtba = 0;
	int rv;
	uint64_t *lpp;

	ASSERT(ci[1] == 3);	/* expects exactly three input arguments */

	cpuid = p1275_cell2int(ci[3]);
	pc = p1275_cell2ptr(ci[4]);
	arg = p1275_cell2int(ci[5]);

	/* PROM still owns CPU startup until the MP handoff */
	if (!cif_cpu_mp_ready)
		return ((*prom_cif_handler)(p));

	rtba = va_to_pa(&trap_table);

	/* redirect the new CPU through a landing pad before 'pc' */
	lpp = lpad_setup(cpuid, (uint64_t)pc, (uint64_t)arg);

	ASSERT(lpp);

	pc = (caddr_t)lpp;

	/*
	 * NOTE(review): the cpuid is passed as the CPU's start argument;
	 * the original entry-point argument travels via the landing pad.
	 */
	rv = hv_cpu_start(cpuid, va_to_pa(pc), rtba, cpuid);

	if (rv != H_EOK) {
		panic("promif_start_cpu: failed to start cpu %d (%d)\n",
		    cpuid, rv);
	}

	/* hand the hypervisor status back to the CIF caller */
	ci[6] = p1275_int2cell(rv);

	return (0);
}

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/promif_impl.h>
#include <sys/machsystm.h>
#include <sys/lpad.h>
#include <sys/vmsystm.h>
#include <sys/prom_plat.h>
#include <sys/ldoms.h>
#include <sys/kobj.h>
#include <sys/reboot.h>
#include <sys/hypervisor_api.h>

#ifndef _KMDB
/* CPU currently executing inside the CIF, or -1 (see promif_in_cif()) */
static processorid_t cif_cpu;
/* PROM translation cache; not referenced in this chunk — TODO confirm use */
static struct translation *cif_prom_trans;
static size_t cif_prom_ntrans;

int cif_cpu_mp_ready;			/* set once all CPUs run in the OS */
int (*prom_cif_handler)(void *) = NULL;	/* saved PROM CIF entry point */
#endif

#ifdef DEBUG
uint_t cif_debug;
#endif /* DEBUG */

extern int (*cif_handler)(void *);

/* maps a CIF service name to the kernel routine that emulates it */
typedef struct {
	char *name;
	cif_func_t func;
} cif_callback_t;

static cif_callback_t cb_table[] = {
	{ "getprop", promif_getprop },
	{ "getproplen", promif_getproplen },
	{ "nextprop", promif_nextprop },
	{ "peer", promif_nextnode },
	{ "child", promif_childnode },
	{ "parent", promif_parentnode },
	{ "enter", promif_enter_mon },
	{ "exit", promif_exit_to_mon },
	{ "boot", promif_reboot },
	{ "write", promif_write },
	{ "read", promif_read },
	{ "interpret", promif_interpret },
	{ "finddevice", promif_finddevice },
	{ "instance-to-package", promif_instance_to_package },
#ifndef _KMDB
	{ "setprop", promif_setprop },
	{ "test", promif_test },
	{ "instance-to-path", promif_instance_to_path },
	{ "SUNW,power-off", promif_power_off },
	{ "SUNW,asr-list-keys-len", promif_asr_list_keys_len },
	{ "SUNW,asr-list-keys", promif_asr_list_keys },
	{ "SUNW,asr-export-len", promif_asr_export_len },
	{ "SUNW,asr-export", promif_asr_export },
	{ "SUNW,set-security-key", promif_set_security_key },
	{ "SUNW,get-security-key", promif_get_security_key },
	{ "SUNW,start-cpu-by-cpuid", promif_start_cpu },
	{ "SUNW,set-trap-table", promif_set_mmfsa_traptable },
	{ "SUNW,set-sun4v-api-version", promif_set_sun4v_api_version },
	{ "SUNW,get-sun4v-api-version", promif_get_sun4v_api_version },
#endif
	{ NULL, NULL }
};

/*
 * Return the kernel emulation routine registered for 'opname', or
 * NULL if the service is not in cb_table.
 */
cif_func_t
promif_find_cif_callback(char *opname)
{
	cif_callback_t *cb;

	if (opname == NULL)
		return (NULL);

	for (cb = cb_table; cb->name; cb++) {
		if (prom_strcmp(cb->name, opname) == 0)
			break;
	}

	/* cb->func is the NULL terminator if no entry matched */
	return (cb->func);
}

/*
 * The kernel's CIF entry point: decode the service name from the
 * 1275 argument array and dispatch to its emulation routine.
 * Unsupported services are fatal under kmdb and return -1 otherwise.
 */
static int
kern_cif_handler(void *p)
{
	cell_t *ci = (cell_t *)p;
	char *opname;
	cif_func_t func;
	int rv;

	ASSERT(cif_handler == kern_cif_handler);

#ifndef _KMDB
	/* record which CPU is inside the CIF (see promif_in_cif()) */
	cif_cpu = getprocessorid();
#endif

	opname = p1275_cell2ptr(ci[0]);

	/* lookup the callback for the desired operation */
	func = promif_find_cif_callback(opname);

	if (func == NULL) {
#ifdef _KMDB
		/* presumably does not return — TODO confirm */
		prom_fatal_error("sun4v unsupported CIFs\n");
#else
		cmn_err(CE_CONT, "!sun4v unsupported CIF: %s\n", opname);
		return (-1);
#endif
	}

	/* callback found, execute it */
	rv = func(p);

#ifndef _KMDB
	cif_cpu = -1;
#endif

	return (rv);
}

#ifdef _KMDB

/*
 * kmdb flavor of CIF initialization: the debugger is handed a copy of
 * the OBP device tree root, the console I/O handles, and the /chosen
 * and /options nodes, then the CIF handler is switched to
 * kern_cif_handler.
 */
void
cif_init(char *pgmname, caddr_t root, ihandle_t in, ihandle_t out,
    phandle_t pin, phandle_t pout, pnode_t chosen, pnode_t options)
{
	/* initialize pointer to a copy of OBP device tree */
	promif_stree_setroot(root);

	promif_set_nodes(chosen, options);

	/* initialize io parameters */
	promif_io_init(in, out, pin, pout);

	/*
	 * Switch CIF handler to the kernel.
	 */
	if (pgmname != NULL)
		prom_init(pgmname, (void *)kern_cif_handler);
	else
		cif_handler = kern_cif_handler;
}

#else

static void cache_prom_data(void);

/*
 * This function returns 1 if the current thread is executing in
 * the CIF and 0 otherwise.  This is useful information to know
 * since code that implements CIF handlers can assume that it has
 * gone through the kern_preprom() entry point, implying it is
 * running single threaded, has preemption disabled, etc.
 */
int
promif_in_cif(void)
{
	int mycpuid = getprocessorid();

	return ((cif_cpu == mycpuid) ? 1 : 0);
}

/*
 * Kernel flavor of CIF initialization, run at boot when domaining is
 * enabled: cache the PROM data we will need, take over the ldom
 * variable services, install kern_cif_handler, claim the boot CPU's
 * rtba, and point kmdb (if booted with it) at the new handler.
 */
void
cif_init(void)
{
	void (*kmdb_cb)(void);
	uint64_t rtba;
	uint64_t rv;

	/*
	 * Check if domaining is enabled. If not, do not
	 * initialize the kernel CIF handler.
	 */
	if (!domaining_enabled)
		return;

	/*
	 * Cache PROM data that is needed later, e.g. a shadow
	 * copy of the device tree, IO mappings, etc.
	 */
	cache_prom_data();

	/*
	 * Prepare to take over the get/set of environmental variables.
	 */
	promif_prop_init();

	/*
	 * Switch CIF handler to the kernel.
	 */
	prom_cif_handler = cif_handler;

	promif_preprom();
	cif_handler = kern_cif_handler;

	/*
	 * Take over rtba for the boot CPU. The rtba for
	 * all other CPUs are set as they enter the system.
	 */
	rtba = va_to_pa(&trap_table);
	if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK)
		panic("hv_cpu_set_rtba failed: %ld\n", rv);

	promif_postprom();

	/*
	 * If the system has been booted with kmdb we need kmdb to
	 * use the kernel cif handler instead of the PROM cif handler.
	 */
	if (boothowto & RB_KMDB) {
		kmdb_cb = (void (*)(void))modlookup("misc/kmdbmod",
		    "kctl_switch_promif");
		ASSERT(kmdb_cb != NULL);
		(*kmdb_cb)();
	}
}

/* Snapshot the PROM state (device tree copy, console handles). */
static void
cache_prom_data(void)
{
	/* initialize copy of OBP device tree */
	promif_stree_init();

	/* initialize io parameters */
	promif_io_init();
}


/*
 * Platform-specific actions to be taken when all cpus are running
 * in the OS.
 */
void
cpu_mp_init(void)
{
	if (!domaining_enabled)
		return;

	/* from here on, CPU and trap-table CIF requests are emulated */
	cif_cpu_mp_ready = 1;
}

#endif /* _KMDB */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/promif_impl.h>

/*
 * There is no support for prom_interpret() once the kernel
 * takes over the CIF, so this function just returns an error.
 * Having this stub keeps harmless messages out of the log file
 * that report that prom_interpret() is not supported.
/*
 * Forth interpretation is unavailable after the kernel assumes the
 * CIF, so every "interpret" request is rejected outright.  Keeping
 * this explicit stub prevents harmless "unsupported CIF" log noise.
 */
/*ARGSUSED*/
int
promif_interpret(void *p)
{
	return (-1);
}
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/promif_impl.h>
#include <sys/systm.h>
#include <sys/hypervisor_api.h>
#ifndef _KMDB
#include <sys/kmem.h>
#endif

/* derive the unit address from a 'reg' value by masking the hi nibble */
#define PROM_REG_TO_UNIT_ADDR(r) ((r) & ~(0xful << 28))

static pnode_t instance_to_package(ihandle_t ih);

/* cached copies of IO params */
static phandle_t pstdin;
static phandle_t pstdout;

static ihandle_t istdin;
static ihandle_t istdout;

/*
 * CIF instance-to-package: translate the console ihandle in ci[3] to
 * its phandle, returned in ci[4].  Only stdin/stdout are supported.
 */
int
promif_instance_to_package(void *p)
{
	cell_t *ci = (cell_t *)p;
	ihandle_t ih;
	phandle_t ph;

	ih = p1275_cell2ihandle(ci[3]);

	ph = instance_to_package(ih);

	ci[4] = p1275_phandle2cell(ph);

	return (0);
}

/*
 * CIF write: send ci[5] bytes from the buffer in ci[4] to the
 * hypervisor console, spinning until each byte is accepted.  The
 * count written is returned in ci[6].  Only stdout is supported.
 */
int
promif_write(void *p)
{
	cell_t *ci = (cell_t *)p;
	uint_t fd;
	char *buf;
	size_t len;
	size_t rlen;

	ASSERT(ci[1] == 3);	/* expects exactly three input arguments */

	fd = p1275_cell2uint(ci[3]);
	buf = p1275_cell2ptr(ci[4]);
	len = p1275_cell2size(ci[5]);

	/* only support stdout (console) */
	ASSERT(fd == istdout);

	for (rlen = 0; rlen < len; rlen++) {
		while (hv_cnputchar((uint8_t)buf[rlen]) == H_EWOULDBLOCK)
			/* try forever */;
	}

	/* return the length written */
	ci[6] = p1275_size2cell(rlen);

	return (0);
}

/*
 * CIF read: read up to ci[5] bytes from the hypervisor console into
 * the buffer in ci[4], stopping early when no character is pending.
 * The count read is returned in ci[6].  Only stdin is supported.
 */
int
promif_read(void *p)
{
	cell_t *ci = (cell_t *)p;
	uint_t fd;
	char *buf;
	size_t len;
	size_t rlen;

	ASSERT(ci[1] == 3);	/* expects exactly three input arguments */

	/* unpack arguments */
	fd = p1275_cell2uint(ci[3]);
	buf = p1275_cell2ptr(ci[4]);
	len = p1275_cell2size(ci[5]);

	/* only support stdin (console) */
	ASSERT(fd == istdin);

	for (rlen = 0; rlen < len; rlen++) {
		if (hv_cngetchar((uint8_t *)&buf[rlen]) != H_EOK)
			break;
	}

	/* return the length read */
	ci[6] = p1275_size2cell(rlen);

	return (0);
}

/* Console-only ihandle-to-phandle translation helper. */
static pnode_t
instance_to_package(ihandle_t ih)
{
	/* only support stdin and stdout */
	ASSERT((ih == istdin) || (ih == istdout));

	if (ih == istdin)
		return (pstdin);

	if (ih == istdout)
		return (pstdout);

	return (OBP_BADNODE);
}

#ifdef _KMDB

/* kmdb is handed the console handles by the kernel at cif_init time. */
void
promif_io_init(ihandle_t in, ihandle_t out, phandle_t pin, phandle_t pout)
{
	istdin = in;
	istdout = out;
	pstdin = pin;
	pstdout = pout;
}

#else

/* Cache the console handles while the PROM can still answer queries. */
void
promif_io_init(void)
{
	/*
	 * Cache the mapping between the stdin and stdout
	 * ihandles and their respective phandles.
	 */
	pstdin = prom_stdin_node();
	pstdout = prom_stdout_node();

	istdin = prom_stdin_ihandle();
	istdout = prom_stdout_ihandle();
}

/*
 * CIF instance-to-path: build the full device path for the console
 * ihandle in ci[3] into the caller's buffer (ci[4]) by walking from
 * the node up to the root, deriving each component's unit address
 * from its 'reg' property.  The path length is returned in ci[6]
 * (left 0 on failure).
 */
int
promif_instance_to_path(void *p)
{
	cell_t *ci = (cell_t *)p;
	pnode_t node;
	ihandle_t ih;
	char *buf;
	int rlen;
	char *regval;
	uint_t *csaddr;
	char name[OBP_MAXPROPNAME];
	char scratch[OBP_MAXPATHLEN];
	int rvlen;

	ih = p1275_cell2ihandle(ci[3]);
	buf = p1275_cell2ptr(ci[4]);

	ci[6] = p1275_uint2cell(0);

	node = instance_to_package(ih);

	*buf = '\0';

	while (node != prom_rootnode()) {
		if (prom_getprop(node, OBP_NAME, name) == -1) {
			prom_printf("instance_to_path: no name property "
			    "node=0x%x\n", node);
			return (-1);
		}

		/* construct the unit address from the 'reg' property */
		if ((rlen = prom_getproplen(node, OBP_REG)) == -1)
			return (-1);

		regval = kmem_zalloc(rlen, KM_SLEEP);

		(void) prom_getprop(node, OBP_REG, regval);

		csaddr = (uint_t *)regval;

		/* prepend this component to the path built so far */
		(void) prom_sprintf(scratch, "/%s@%lx%s", name,
		    PROM_REG_TO_UNIT_ADDR(*csaddr), buf);

		kmem_free(regval, rlen);

		(void) prom_strcpy(buf, scratch);

		node = prom_parentnode(node);
	}

	rvlen = prom_strlen(buf);
	ci[6] = p1275_uint2cell(rvlen);

	return (0);
}

#endif /* _KMDB */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> + +/* + * Secure WAN boot requires firmware support for storing and + * retrieving security keys. The user command to set these + * keys in firmware storage is ickey(1M). Currently, sun4v + * platforms do not support this functionality. However, there + * is an external interface to these prom interfaces from the + * openprom(7D) driver. They are not documented in the man page, + * but they should still be handled just well enough so that + * the user gets a sensible error back. + */ + +int +promif_set_security_key(void *p) +{ + _NOTE(ARGUNUSED(p)) + + return (-1); +} + +int +promif_get_security_key(void *p) +{ + cell_t *ci = (cell_t *)p; + + ci[6] = p1275_int2cell(-1); + + return (-1); +} diff --git a/usr/src/uts/sun4v/promif/promif_mon.c b/usr/src/uts/sun4v/promif/promif_mon.c new file mode 100644 index 0000000000..73c66778ee --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_mon.c @@ -0,0 +1,203 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/promif_impl.h>
#include <sys/uadmin.h>
#include <sys/machsystm.h>
#include <sys/hypervisor_api.h>

#ifdef _KMDB

extern int kmdb_dpi_get_master_cpuid(void);
extern void kmdb_dpi_kernpanic(int cpuid);
extern void prom_reboot(char *bootstr);

/* kmdb runs single-threaded: PIL manipulation is a no-op */
#define PIL_DECL(p)
#define PIL_SET7(p)
#define PIL_REST(p)

#else

extern int vx_handler(cell_t *argument_array);

/* in the kernel, raise to PIL 7 while the mini-monitor runs */
#define PIL_DECL(p) int p
#define PIL_SET7(p) (p = spl7())
#define PIL_REST(p) (splx(p))

#endif

/* promif_mon() operating modes */
#define PROMIF_ENTER 0
#define PROMIF_EXIT 1

#define PROMIF_ISPRINT(c) (((c) >= ' ') && ((c) <= '~'))

static void promif_mon(int mode);

/*
 * CIF "enter" handler: emulate dropping into the PROM monitor.  The
 * other CPUs are idled (kernel only) and the mini-monitor loop runs
 * until the user continues; returns 0 when resumed.
 */
/*ARGSUSED*/
int
promif_enter_mon(void *p)
{
	PIL_DECL(pil);

	PIL_SET7(pil);

	prom_printf("\n");

#ifdef _KMDB
	promif_mon(PROMIF_ENTER);
#else
	idle_other_cpus();
	promif_mon(PROMIF_ENTER);
	resume_other_cpus();
#endif

	PIL_REST(pil);

	return (0);
}

/*
 * CIF "exit" handler: emulate program termination to the monitor.
 * promif_mon() never returns in this mode (every option reboots,
 * resets, or halts).
 */
/*ARGSUSED*/
int
promif_exit_to_mon(void *p)
{
	PIL_DECL(pil);

	PIL_SET7(pil);

	prom_printf("Program terminated\n");

	promif_mon(PROMIF_EXIT);

	PIL_REST(pil);

	return (0);
}

/*
 * Minimal replacement for the OBP monitor prompt: read single
 * character commands from the hypervisor console and act on them.
 * In PROMIF_ENTER mode the user may continue ('c') or sync ('s');
 * in PROMIF_EXIT mode only reboot/ok-prompt/halt are offered and
 * the function never returns.
 */
static void
promif_mon(int mode)
{
	char cmd;
	char *prompt;
	boolean_t invalid_option;
#ifdef _KMDB
	static char *exit_prompt = "r)eboot, h)alt? ";
#else
	char value[ 8 ];	/* holds "true" or "false" */
	char *boot_msg;
	static char *null_msg = ".\" \"";
	static char *ignore_msg =
	    "cr .\" Ignoring auto-boot? setting for this boot.\" cr";
	static char *exit_prompt = "r)eboot, o)k prompt, h)alt? ";
#endif
	static char *enter_prompt = "c)ontinue, s)ync, r)eboot, h)alt? ";

	prompt = (mode == PROMIF_EXIT) ? exit_prompt : enter_prompt;

	for (;;) {
		prom_printf("%s", prompt);

		/* spin until the console delivers a character */
		while (hv_cngetchar((uint8_t *)&cmd) != H_EOK)
			;

		prom_printf("%c\n", cmd);

		invalid_option = B_FALSE;

		switch (cmd) {

		case 'r':
			prom_reboot("");
			break;

		case 'h':
			(void) hv_mach_exit(0);
			ASSERT(0);	/* hv_mach_exit should not return */

			break;

#ifndef _KMDB
		case 'o':
			/*
			 * This option gives the user an "ok" prompt after
			 * the system reset regardless of the value of
			 * auto-boot? We offer this option because halt(1m)
			 * doesn't leave the user at the ok prompt (as it
			 * does on non-ldoms systems). If auto-boot? is
			 * true tell user we are overriding the setting
			 * for this boot only.
			 */
			if (mode == PROMIF_EXIT) {
				bzero(value, sizeof (value));
				(void) promif_stree_getprop(prom_optionsnode(),
				    "auto-boot?", value);
				boot_msg = strcmp(value, "true") ? null_msg :
				    ignore_msg;
				(void) promif_ldom_setprop("reboot-command",
				    boot_msg, strlen(boot_msg) + 1);
				(void) hv_mach_sir();
			} else {
				invalid_option = B_TRUE;
			}
			break;
#endif

		case '\r':
			/* ignore bare carriage returns */
			break;

		case 's':
			if (mode == PROMIF_ENTER) {
#ifdef _KMDB
				kmdb_dpi_kernpanic(kmdb_dpi_get_master_cpuid());
#else
				cell_t arg = p1275_ptr2cell("sync");
				(void) vx_handler(&arg);
#endif
			} else {
				invalid_option = B_TRUE;
			}
			break;

		case 'c':
			if (mode == PROMIF_ENTER) {
				return;
			} else {
				invalid_option = B_TRUE;
			}
			break;

		default:
			invalid_option = B_TRUE;
			break;
		}

		if (invalid_option && PROMIF_ISPRINT(cmd))
			prom_printf("invalid option (%c)\n", cmd);
	}

	_NOTE(NOTREACHED)
}

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E%	SMI"

#include <sys/types.h>
#include <sys/esunddi.h>
#include <sys/promif_impl.h>

#ifdef _KMDB
/* the only nodes kmdb ever looks up, supplied at cif_init time */
static pnode_t chosennode;
static pnode_t optionsnode;
#else
static char *gettoken(char *tp, char *token);
static pnode_t finddevice(char *path);
#endif

/*
 * Routines for walking the PROMs devinfo tree
 */

#ifdef _KMDB

/* Record the /chosen and /options nodes handed over by the kernel. */
void
promif_set_nodes(pnode_t chosen, pnode_t options)
{
	chosennode = chosen;
	optionsnode = options;
}

/*
 * CIF finddevice (kmdb flavor): only "/chosen" and "/options" are
 * resolvable; anything else trips the ASSERT.  The result phandle is
 * returned in ci[4].
 */
int
promif_finddevice(void *p)
{
	cell_t *ci = (cell_t *)p;
	char *path;

	ASSERT(ci[1] == 1);	/* expects exactly one input argument */

	path = p1275_cell2ptr(ci[3]);

	if (strcmp("/chosen", path) == 0) {
		ci[4] = p1275_dnode2cell(chosennode);
	} else if (strcmp("/options", path) == 0) {
		ci[4] = p1275_dnode2cell(optionsnode);
	} else {
		/* only supports known nodes */
		ASSERT(0);
	}

	return (0);
}

#else

/*
 * CIF finddevice (kernel flavor): resolve the full device path in
 * ci[3] against the shadow tree; the node (or OBP_BADNODE) is
 * returned in ci[4].
 */
int
promif_finddevice(void *p)
{
	cell_t *ci = (cell_t *)p;
	pnode_t node;

	ASSERT(ci[1] == 1);	/* expects exactly one input argument */

	node = finddevice(p1275_cell2ptr(ci[3]));

	ci[4] = p1275_dnode2cell(node);

	return (0);
}

#endif

/* CIF peer: next sibling of the node in ci[3], returned in ci[4]. */
int
promif_nextnode(void *p)
{
	cell_t *ci = (cell_t *)p;
	pnode_t next;

	ASSERT(ci[1] == 1);	/* expects exactly one input argument */

	next = promif_stree_nextnode(p1275_cell2dnode(ci[3]));

	ci[4] = p1275_dnode2cell(next);

	return (0);
}

/* CIF child: first child of the node in ci[3], returned in ci[4]. */
int
promif_childnode(void *p)
{
	cell_t *ci = (cell_t *)p;
	pnode_t child;

	ASSERT(ci[1] == 1);	/* expects exactly one input argument */

	child = promif_stree_childnode(p1275_cell2dnode(ci[3]));

	ci[4] = p1275_dnode2cell(child);

	return (0);
}

/* CIF parent: parent of the node in ci[3], returned in ci[4]. */
int
promif_parentnode(void *p)
{
	cell_t *ci = (cell_t *)p;
	pnode_t parent;

	ASSERT(ci[1] == 1);	/* expects exactly one input argument */

	parent = promif_stree_parentnode(p1275_cell2dnode(ci[3]));

	ci[4] = p1275_dnode2cell(parent);

	return (0);
}

#ifndef _KMDB

/*
 * Get a token from a prom pathname, collecting everything
 * until a non-comma, non-colon separator is found. Any
 * options, including the ':' option separator, on the end
 * of the token are removed.
 */
static char *
gettoken(char *tp, char *token)
{
	char *result = token;

	for (;;) {
		tp = prom_path_gettoken(tp, token);
		token += prom_strlen(token);
		if ((*tp == ',') || (*tp == ':')) {
			/* keep the separator and continue collecting */
			*token++ = *tp++;
			*token = '\0';
			continue;
		}
		break;
	}

	/* strip off any options from the token */
	prom_strip_options(result, result);

	return (tp);
}

/*
 * Retrieve the unit address for a node by looking it up
 * in the corresponding dip. -1 is returned if no unit
 * address can be determined.
 */
static int
get_unit_addr(pnode_t np, char *paddr)
{
	dev_info_t *dip;
	char *addr;

	if ((dip = e_ddi_nodeid_to_dip(np)) == NULL) {
		return (-1);
	}

	if ((addr = ddi_get_name_addr(dip)) == NULL) {
		ddi_release_devi(dip);
		return (-1);
	}

	(void) prom_strcpy(paddr, addr);

	ddi_release_devi(dip);

	return (0);
}

/*
 * Get node id of node in prom tree that path identifies
 */
static pnode_t
finddevice(char *path)
{
	char name[OBP_MAXPROPNAME];
	char addr[OBP_MAXPROPNAME];
	char pname[OBP_MAXPROPNAME];
	char paddr[OBP_MAXPROPNAME];
	char *tp;
	pnode_t np;
	pnode_t device;

	CIF_DBG_NODE("finddevice: %s\n", path);

	tp = path;
	np = prom_rootnode();
	device = OBP_BADNODE;

	/* must be a fully specified path */
	if (*tp++ != '/')
		goto done;

	for (;;) {
		/* get the name from the path */
		tp = gettoken(tp, name);
		if (*name == '\0')
			break;

		/* get the address from the path */
		if (*tp == '@') {
			tp++;
			tp = gettoken(tp, addr);
		} else {
			addr[0] = '\0';
		}

		CIF_DBG_NODE("looking for: %s%s%s\n", name,
		    (*addr != '\0') ? "@" : "", addr);

		if ((np = prom_childnode(np)) == OBP_NONODE)
			break;

		/* scan siblings for a matching name@addr */
		while (np != OBP_NONODE) {

			/* get the name from the current node */
			if (prom_getprop(np, OBP_NAME, pname) < 0)
				goto done;

			/* get the address from the current node */
			if (get_unit_addr(np, paddr) < 0)
				paddr[0] = '\0';

			/* compare the names and addresses */
			if ((prom_strcmp(name, pname) == 0) &&
			    (prom_strcmp(addr, paddr) == 0)) {
				CIF_DBG_NODE("found dev: %s%s%s (0x%x)\n",
				    pname, (*paddr != '\0') ? "@" : "",
				    paddr, np);
				break;
			} else {
				CIF_DBG_NODE("  no match: %s%s%s vs %s%s%s\n",
				    name, (*addr != '\0') ? "@" : "", addr,
				    pname, (*paddr != '\0') ? "@" : "", paddr);
			}
			np = prom_nextnode(np);
		}

		/* path does not map to a node */
		if (np == OBP_NONODE)
			break;

		if (*tp == '\0') {
			/* found a matching node */
			device = np;
			break;
		}

		/*
		 * Continue the loop with the
		 * next component of the path.
		 */
		tp++;
	}
done:

	if (device == OBP_BADNODE) {
		CIF_DBG_NODE("device not found\n\n");
	} else {
		CIF_DBG_NODE("returning 0x%x\n\n", device);
	}

	return (device);
}

#endif

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> +#include <sys/hypervisor_api.h> + +int +promif_power_off(void *p) +{ + _NOTE(ARGUNUSED(p)) + + int rv = 0; + + rv = hv_mach_exit(0); + + /* should not return */ + ASSERT(0); + + return (rv); +} diff --git a/usr/src/uts/sun4v/promif/promif_prop.c b/usr/src/uts/sun4v/promif/promif_prop.c new file mode 100644 index 0000000000..42cdffe32a --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_prop.c @@ -0,0 +1,327 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/promif_impl.h> +#include <sys/ds.h> +#include <sys/modctl.h> +#include <sys/ksynch.h> +#include <sys/varconfig.h> + +#ifndef _KMDB + +#define PROMIF_DS_TIMEOUT_SEC 15 + +static kmutex_t promif_prop_lock; +static kcondvar_t promif_prop_cv; +static var_config_msg_t promif_ds_resp; +static var_config_resp_t *cfg_rsp = &promif_ds_resp.var_config_resp; +static int (*ds_send)(); +static int (*ds_init)(); + +/* + * Domains Services interaction + */ +static ds_svc_hdl_t ds_primary_handle; +static ds_svc_hdl_t ds_backup_handle; + +static ds_ver_t vc_version[] = { { 1, 0 } }; + +#define VC_NVERS (sizeof (vc_version) / sizeof (vc_version[0])) + +static ds_capability_t vc_primary_cap = { + "var-config", /* svc_id */ + vc_version, /* vers */ + VC_NVERS /* nvers */ +}; + +static ds_capability_t vc_backup_cap = { + "var-config-backup", /* svc_id */ + vc_version, /* vers */ + VC_NVERS /* nvers */ +}; + +static void vc_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t); +static void vc_unreg_handler(ds_cb_arg_t); +static void vc_data_handler(ds_cb_arg_t, void *, size_t); + +static ds_clnt_ops_t vc_primary_ops = { + vc_reg_handler, /* ds_primary_reg_cb */ + vc_unreg_handler, /* ds_primary_unreg_cb */ + vc_data_handler, /* ds_data_cb */ + &ds_primary_handle /* cb_arg */ +}; + +static ds_clnt_ops_t vc_backup_ops = { + vc_reg_handler, /* ds_backup_reg_cb */ + vc_unreg_handler, /* ds_backup_unreg_cb */ + vc_data_handler, /* ds_data_cb */ + &ds_backup_handle /* cb_arg */ +}; + +static void +vc_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl) +{ + _NOTE(ARGUNUSED(ver)) + + if ((ds_svc_hdl_t *)arg == &ds_primary_handle) + ds_primary_handle = hdl; + else if ((ds_svc_hdl_t *)arg == &ds_backup_handle) + ds_primary_handle = hdl; +} + +static void +vc_unreg_handler(ds_cb_arg_t arg) +{ + if ((ds_svc_hdl_t *)arg == &ds_primary_handle) + ds_primary_handle 
= DS_INVALID_HDL; + else if ((ds_svc_hdl_t *)arg == &ds_backup_handle) + ds_backup_handle = DS_INVALID_HDL; +} + +static void +vc_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) +{ + _NOTE(ARGUNUSED(arg)) + + bcopy(buf, &promif_ds_resp, buflen); + mutex_enter(&promif_prop_lock); + cv_signal(&promif_prop_cv); + mutex_exit(&promif_prop_lock); +} + +/* + * Initialize the linkage with DS (Domain Services). We assume that + * the DS module has already been loaded by the platmod. + * + * The call to the DS init functions will eventually result in the + * invocation of our registration callback handlers, at which time DS + * is able to accept requests. + */ +static void +promif_ds_init(void) +{ + static char *me = "promif_ds_init"; + int rv; + + if ((ds_init = + (int (*)())modgetsymvalue("ds_cap_init", 0)) == 0) { + cmn_err(CE_WARN, "%s: can't find ds_cap_init", me); + return; + } + + if ((ds_send = + (int (*)())modgetsymvalue("ds_cap_send", 0)) == 0) { + cmn_err(CE_WARN, "%s: can't find ds_cap_send", me); + return; + } + + if ((rv = (*ds_init)(&vc_primary_cap, &vc_primary_ops)) != 0) { + cmn_err(CE_NOTE, + "%s: ds_cap_init failed (primary): %d", me, rv); + } + + + if ((rv = (*ds_init)(&vc_backup_cap, &vc_backup_ops)) != 0) { + cmn_err(CE_NOTE, + "%s: ds_cap_init failed (backup): %d", me, rv); + } +} + +/* + * Prepare for ldom variable requests. + */ +void +promif_prop_init(void) +{ + mutex_init(&promif_prop_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&promif_prop_cv, NULL, CV_DEFAULT, NULL); + + promif_ds_init(); +} + + +/* + * Replace the current value of a property string given its name and + * new value. 
+ */ +int +promif_ldom_setprop(char *name, void *value, int valuelen) +{ + var_config_msg_t *req; + var_config_set_req_t *setp; + var_config_cmd_t cmd; + ds_svc_hdl_t ds_handle; + int rv; + int namelen = strlen(name); + int paylen = namelen + 1 + valuelen; /* valuelen includes the null */ + static char *me = "promif_ldom_setprop"; + + if (ds_primary_handle != DS_INVALID_HDL) + ds_handle = ds_primary_handle; + else if (ds_backup_handle != DS_INVALID_HDL) + ds_handle = ds_backup_handle; + else + return (-1); + + req = kmem_zalloc(sizeof (var_config_hdr_t) + paylen, KM_SLEEP); + req->var_config_cmd = VAR_CONFIG_SET_REQ; + setp = &req->var_config_set; + (void) strcpy(setp->name_and_value, name); + (void) strncpy(&setp->name_and_value[namelen + 1], value, valuelen); + + if ((rv = (*ds_send)(ds_handle, req, + sizeof (var_config_hdr_t) + paylen)) != 0) { + cmn_err(CE_WARN, "%s: ds_cap_send failed: %d", me, rv); + kmem_free(req, sizeof (var_config_hdr_t) + paylen); + return (-1); + } + + kmem_free(req, sizeof (var_config_hdr_t) + paylen); + + /* + * Since we are emulating OBP, we must comply with the promif + * infrastructure and execute only on the originating cpu. + */ + thread_affinity_set(curthread, CPU_CURRENT); + + mutex_enter(&promif_prop_lock); + if (cv_timedwait(&promif_prop_cv, + &promif_prop_lock, lbolt + PROMIF_DS_TIMEOUT_SEC * hz) == -1) { + cmn_err(CE_WARN, "%s: ds response timeout", me); + rv = -1; + goto out; + } + + cmd = promif_ds_resp.vc_hdr.cmd; + if (cmd != VAR_CONFIG_SET_RESP) { + cmn_err(CE_WARN, "%s: bad response type: %d", me, cmd); + rv = -1; + goto out; + } + rv = (cfg_rsp->result == VAR_CONFIG_SUCCESS) ? 
valuelen : -1; + +out: + mutex_exit(&promif_prop_lock); + thread_affinity_clear(curthread); + return (rv); +} + +int +promif_setprop(void *p) +{ + cell_t *ci = (cell_t *)p; + pnode_t node; + caddr_t name; + caddr_t value; + int len; + + ASSERT(ci[1] == 4); + + node = p1275_cell2dnode(ci[3]); + ASSERT(node == prom_optionsnode()); + name = p1275_cell2ptr(ci[4]); + value = p1275_cell2ptr(ci[5]); + len = p1275_cell2int(ci[6]); + + if (promif_stree_getproplen(node, name) != -1) + len = promif_ldom_setprop(name, value, len); + + if (len >= 0) + len = promif_stree_setprop(node, name, (void *)value, len); + + + ci[7] = p1275_int2cell(len); + + return ((len == -1) ? len : 0); +} + +#endif + +int +promif_getprop(void *p) +{ + cell_t *ci = (cell_t *)p; + pnode_t node; + caddr_t name; + caddr_t value; + int len; + + ASSERT(ci[1] == 4); + + node = p1275_cell2dnode(ci[3]); + name = p1275_cell2ptr(ci[4]); + value = p1275_cell2ptr(ci[5]); + + len = promif_stree_getprop(node, name, value); + + ci[7] = p1275_int2cell(len); + + return ((len == -1) ? 
len : 0); +} + +int +promif_getproplen(void *p) +{ + cell_t *ci = (cell_t *)p; + pnode_t node; + caddr_t name; + int len; + + ASSERT(ci[1] == 2); + + node = p1275_cell2dnode(ci[3]); + name = p1275_cell2ptr(ci[4]); + + len = promif_stree_getproplen(node, name); + + ci[5] = p1275_int2cell(len); + + return (0); +} + +int +promif_nextprop(void *p) +{ + cell_t *ci = (cell_t *)p; + pnode_t node; + caddr_t prev; + caddr_t next; + + ASSERT(ci[1] == 3); + + node = p1275_cell2dnode(ci[3]); + prev = p1275_cell2ptr(ci[4]); + next = p1275_cell2ptr(ci[5]); + + (void) promif_stree_nextprop(node, prev, next); + + return (0); +} diff --git a/usr/src/uts/sun4v/promif/promif_reboot.c b/usr/src/uts/sun4v/promif/promif_reboot.c new file mode 100644 index 0000000000..15a696184b --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_reboot.c @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> +#include <sys/hypervisor_api.h> + +/* + * Reboot Command String + * + * The prom_reboot() CIF handler takes an optional string containing + * arguments to the boot command that are to be applied to the reboot. + * This information is used to create a full boot command string that + * is stored in a well known ldom variable (REBOOT_CMD_VAR_NAME). The + * string is constructed to take the following form: + * + * boot <specified boot arguments><NULL> + * + * When the domain comes back up, OBP consults this variable. If set, + * it will use the unmodified boot command string to boot the domain. + * The maximum length of the boot command is specified by the constant + * REBOOT_CMD_MAX_LEN. If the specified arguments cause the command + * string to exceed this length, the arguments are truncated. + */ +#define REBOOT_CMD_VAR_NAME "reboot-command" +#define REBOOT_CMD_BASE "boot " +#define REBOOT_CMD_MAX_LEN 256 +#define REBOOT_CMD_ARGS_MAX_LEN (REBOOT_CMD_MAX_LEN - \ + prom_strlen(REBOOT_CMD_BASE) - 1) +int +promif_reboot(void *p) +{ + cell_t *ci = (cell_t *)p; + int rv = 0; +#ifndef _KMDB + char *bootargs; + char bootcmd[REBOOT_CMD_MAX_LEN]; + char *cmd_end; + int cmd_len; +#endif + + /* one argument expected */ + ASSERT(ci[1] == 1); + +#ifndef _KMDB + bootargs = p1275_cell2ptr(ci[3]); + + if (bootargs == NULL) + bootargs = ""; + + /* verify the length of the command string */ + cmd_len = prom_strlen(REBOOT_CMD_BASE) + prom_strlen(bootargs) + 1; + + if (cmd_len > REBOOT_CMD_MAX_LEN) { + /* + * Unable to set the requested boot arguments. + * Truncate them so that the boot command will + * fit within the maximum length. This follows + * the policy also used by OBP. 
+ */ + cmd_end = bootargs + REBOOT_CMD_ARGS_MAX_LEN; + *cmd_end = '\0'; + + prom_printf("WARNING: reboot command length (%d) too long, " + "truncating command arguments\n", cmd_len); + } + + /* construct the boot command string */ + (void) prom_sprintf(bootcmd, "%s%s", REBOOT_CMD_BASE, bootargs); + + cmd_len = prom_strlen(bootcmd) + 1; + ASSERT(cmd_len <= REBOOT_CMD_MAX_LEN); + + CIF_DBG_REBOOT("bootcmd='%s'\n", bootcmd); + + /* attempt to set the ldom variable */ + if (promif_ldom_setprop(REBOOT_CMD_VAR_NAME, bootcmd, cmd_len) == -1) { + prom_printf("WARNING: unable to store boot command for " + "use on reboot\n"); + } +#endif + + prom_printf("Resetting...\n"); + + rv = hv_mach_sir(); + + /* should not return */ + ASSERT(0); + + return (rv); +} diff --git a/usr/src/uts/sun4v/promif/promif_stree.c b/usr/src/uts/sun4v/promif/promif_stree.c new file mode 100644 index 0000000000..c52545ed16 --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_stree.c @@ -0,0 +1,455 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> + +/* + * A property attached to a node in the kernel's + * shadow copy of the PROM device tree. + */ +typedef struct prom_prop { + struct prom_prop *pp_next; + char *pp_name; + int pp_len; + void *pp_val; +} prom_prop_t; + +/* + * A node in the kernel's shadow copy of the PROM + * device tree. + */ +typedef struct prom_node { + pnode_t pn_nodeid; + struct prom_prop *pn_propp; + struct prom_node *pn_parent; + struct prom_node *pn_child; + struct prom_node *pn_sibling; +} prom_node_t; + +static prom_node_t *promif_root; + +static prom_node_t *find_node(pnode_t nodeid); +static prom_node_t *find_node_work(prom_node_t *np, pnode_t node); +static int getproplen(prom_node_t *pnp, char *name); +static void *getprop(prom_node_t *pnp, char *name); +static char *nextprop(prom_node_t *pnp, char *name); + +#ifndef _KMDB +static void create_prop(prom_node_t *pnp, char *name, void *val, int len); +static prom_node_t *create_node(prom_node_t *parent, pnode_t node); +static void create_peers(prom_node_t *pnp, pnode_t node); +static void create_children(prom_node_t *pnp, pnode_t parent); +#endif + +/* + * Hooks for kmdb for accessing the PROM shadow tree. The driver portion + * of kmdb will retrieve the root of the tree and pass it down to the + * debugger portion of kmdb. As the kmdb debugger is standalone, it has + * its own promif_root pointer that it will be set to the value passed by + * the driver so that kmdb points to the shadow tree maintained by the kernel. + * So the "get" function is in the kernel while the "set" function is in kmdb. + */ +#ifdef _KMDB +void +promif_stree_setroot(void *root) +{ + promif_root = (prom_node_t *)root; +} +#else +void * +promif_stree_getroot(void) +{ + return (promif_root); +} +#endif + +/* + * Interfaces used internally by promif functions. + * These hide all accesses to the shadow tree. 
+ */ + +pnode_t +promif_stree_parentnode(pnode_t nodeid) +{ + prom_node_t *pnp; + + pnp = find_node(nodeid); + if (pnp && pnp->pn_parent) { + return (pnp->pn_parent->pn_nodeid); + } + + return (OBP_NONODE); +} + +pnode_t +promif_stree_childnode(pnode_t nodeid) +{ + prom_node_t *pnp; + + pnp = find_node(nodeid); + if (pnp && pnp->pn_child) + return (pnp->pn_child->pn_nodeid); + + return (OBP_NONODE); +} + +pnode_t +promif_stree_nextnode(pnode_t nodeid) +{ + prom_node_t *pnp; + + /* + * Note: next(0) returns the root node + */ + pnp = find_node(nodeid); + if (pnp && (nodeid == OBP_NONODE)) + return (pnp->pn_nodeid); + if (pnp && pnp->pn_sibling) + return (pnp->pn_sibling->pn_nodeid); + + return (OBP_NONODE); +} + +int +promif_stree_getproplen(pnode_t nodeid, char *name) +{ + prom_node_t *pnp; + + pnp = find_node(nodeid); + if (pnp == NULL) + return (-1); + + return (getproplen(pnp, name)); +} + +int +promif_stree_getprop(pnode_t nodeid, char *name, void *value) +{ + prom_node_t *pnp; + void *prop; + int len; + + pnp = find_node(nodeid); + if (pnp == NULL) { + prom_printf("find_node: no node?\n"); + return (-1); + } + + len = getproplen(pnp, name); + if (len > 0) { + prop = getprop(pnp, name); + bcopy(prop, value, len); + } else { + prom_printf("find_node: getproplen: %d\n", len); + } + + return (len); +} + +char * +promif_stree_nextprop(pnode_t nodeid, char *name, char *next) +{ + prom_node_t *pnp; + char *propname; + + next[0] = '\0'; + + pnp = find_node(nodeid); + if (pnp == NULL) + return (NULL); + + propname = nextprop(pnp, name); + if (propname == NULL) + return (next); + + (void) prom_strcpy(next, propname); + + return (next); +} + +static prom_node_t * +find_node_work(prom_node_t *np, pnode_t node) +{ + prom_node_t *nnp; + + if (np->pn_nodeid == node) + return (np); + + if (np->pn_child) + if ((nnp = find_node_work(np->pn_child, node)) != NULL) + return (nnp); + + if (np->pn_sibling) + if ((nnp = find_node_work(np->pn_sibling, node)) != NULL) + return (nnp); + 
+ return (NULL); +} + +static prom_node_t * +find_node(pnode_t nodeid) +{ + + if (nodeid == OBP_NONODE) + return (promif_root); + + if (promif_root == NULL) + return (NULL); + + return (find_node_work(promif_root, nodeid)); +} + +static int +getproplen(prom_node_t *pnp, char *name) +{ + struct prom_prop *propp; + + for (propp = pnp->pn_propp; propp != NULL; propp = propp->pp_next) + if (prom_strcmp(propp->pp_name, name) == 0) + return (propp->pp_len); + + return (-1); +} + +static void * +getprop(prom_node_t *np, char *name) +{ + struct prom_prop *propp; + + for (propp = np->pn_propp; propp != NULL; propp = propp->pp_next) + if (prom_strcmp(propp->pp_name, name) == 0) + return (propp->pp_val); + + return (NULL); +} + +static char * +nextprop(prom_node_t *pnp, char *name) +{ + struct prom_prop *propp; + + /* + * getting next of NULL or a null string returns the first prop name + */ + if (name == NULL || *name == '\0') + if (pnp->pn_propp) + return (pnp->pn_propp->pp_name); + + for (propp = pnp->pn_propp; propp != NULL; propp = propp->pp_next) + if (prom_strcmp(propp->pp_name, name) == 0) + if (propp->pp_next) + return (propp->pp_next->pp_name); + + return (NULL); +} + +#ifndef _KMDB + +int +promif_stree_setprop(pnode_t nodeid, char *name, void *value, int len) +{ + prom_node_t *pnp; + struct prom_prop *prop; + + pnp = find_node(nodeid); + if (pnp == NULL) { + prom_printf("find_node: no node?\n"); + return (-1); + } + + /* + * If a property with this name exists, replace the existing + * value. + */ + for (prop = pnp->pn_propp; prop; prop = prop->pp_next) + if (prom_strcmp(prop->pp_name, name) == 0) { + kmem_free(prop->pp_val, prop->pp_len); + prop->pp_val = NULL; + if (len > 0) { + prop->pp_val = kmem_zalloc(len, KM_SLEEP); + bcopy(value, prop->pp_val, len); + } + prop->pp_len = len; + return (len); + } + + return (-1); +} + +/* + * Create a promif private copy of boot's device tree. 
+ */ +void +promif_stree_init(void) +{ + pnode_t node; + prom_node_t *pnp; + + node = prom_rootnode(); + promif_root = pnp = create_node(OBP_NONODE, node); + + create_peers(pnp, node); + create_children(pnp, node); +} + +static void +create_children(prom_node_t *pnp, pnode_t parent) +{ + prom_node_t *cnp; + pnode_t child; + + _NOTE(CONSTCOND) + while (1) { + child = prom_childnode(parent); + if (child == 0) + break; + if (prom_getproplen(child, "name") <= 0) { + parent = child; + continue; + } + cnp = create_node(pnp, child); + pnp->pn_child = cnp; + create_peers(cnp, child); + pnp = cnp; + parent = child; + } +} + +static void +create_peers(prom_node_t *np, pnode_t node) +{ + prom_node_t *pnp; + pnode_t peer; + + _NOTE(CONSTCOND) + while (1) { + peer = prom_nextnode(node); + if (peer == 0) + break; + if (prom_getproplen(peer, "name") <= 0) { + node = peer; + continue; + } + pnp = create_node(np->pn_parent, peer); + np->pn_sibling = pnp; + create_children(pnp, peer); + np = pnp; + node = peer; + } +} + +static prom_node_t * +create_node(prom_node_t *parent, pnode_t node) +{ + prom_node_t *pnp; + char prvname[OBP_MAXPROPNAME]; + char propname[OBP_MAXPROPNAME]; + int proplen; + void *propval; + + pnp = kmem_zalloc(sizeof (prom_node_t), KM_SLEEP); + pnp->pn_nodeid = node; + pnp->pn_parent = parent; + + prvname[0] = '\0'; + + _NOTE(CONSTCOND) + while (1) { + (void) prom_nextprop(node, prvname, propname); + if (prom_strlen(propname) == 0) + break; + if ((proplen = prom_getproplen(node, propname)) == -1) + continue; + propval = NULL; + if (proplen != 0) { + propval = kmem_zalloc(proplen, KM_SLEEP); + (void) prom_getprop(node, propname, propval); + } + create_prop(pnp, propname, propval, proplen); + + (void) prom_strcpy(prvname, propname); + } + + return (pnp); +} + +static void +create_prop(prom_node_t *pnp, char *name, void *val, int len) +{ + struct prom_prop *prop; + struct prom_prop *newprop; + + newprop = kmem_zalloc(sizeof (*newprop), KM_SLEEP); + newprop->pp_name 
= kmem_zalloc(prom_strlen(name) + 1, KM_SLEEP); + (void) prom_strcpy(newprop->pp_name, name); + newprop->pp_val = val; + newprop->pp_len = len; + + if (pnp->pn_propp == NULL) { + pnp->pn_propp = newprop; + return; + } + + /* move to the end of the prop list */ + for (prop = pnp->pn_propp; prop->pp_next != NULL; prop = prop->pp_next) + /* empty */; + + /* append the new prop */ + prop->pp_next = newprop; +} + +static void +promif_dump_tree(prom_node_t *pnp) +{ + int i; + static int level = 0; + + if (pnp == NULL) + return; + + for (i = 0; i < level; i++) { + prom_printf(" "); + } + + prom_printf("Node 0x%x (parent=0x%x, sibling=0x%x)\n", pnp->pn_nodeid, + (pnp->pn_parent) ? pnp->pn_parent->pn_nodeid : 0, + (pnp->pn_sibling) ? pnp->pn_sibling->pn_nodeid : 0); + + if (pnp->pn_child != NULL) { + level++; + promif_dump_tree(pnp->pn_child); + level--; + } + + if (pnp->pn_sibling != NULL) + promif_dump_tree(pnp->pn_sibling); +} + +#endif diff --git a/usr/src/uts/sun4v/promif/promif_test.c b/usr/src/uts/sun4v/promif/promif_test.c new file mode 100644 index 0000000000..ceb9ec3947 --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_test.c @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> + +int +promif_test(void *p) +{ + cell_t *ci = (cell_t *)p; + char *opname; + cif_func_t func; + int rv; + + ASSERT(ci[1] == 1); + + opname = p1275_cell2ptr(ci[3]); + + func = promif_find_cif_callback(opname); + + /* zero indicates operation is supported */ + rv = (func != NULL) ? 0 : 1; + + ci[4] = p1275_int2cell(rv); + + return (0); +} diff --git a/usr/src/uts/sun4v/promif/promif_version.c b/usr/src/uts/sun4v/promif/promif_version.c new file mode 100644 index 0000000000..c79e02513f --- /dev/null +++ b/usr/src/uts/sun4v/promif/promif_version.c @@ -0,0 +1,82 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/promif_impl.h> +#include <sys/hypervisor_api.h> + +/* + * Wrappers to get/set the API version with Hypervisor. + */ + +int +promif_set_sun4v_api_version(void *p) +{ + cell_t *ci = (cell_t *)p; + uint64_t api_group; + uint64_t major; + uint64_t minor; + uint64_t status; + uint64_t supported_minor; + + ASSERT(ci[1] == 3); + ASSERT(ci[2] == 2); + + api_group = (uint64_t)p1275_cell2int(ci[3]); + major = (uint64_t)p1275_cell2int(ci[4]); + minor = (uint64_t)p1275_cell2int(ci[5]); + + status = hv_api_set_version(api_group, major, minor, &supported_minor); + + ci[6] = p1275_int2cell(status); + ci[7] = p1275_int2cell(supported_minor); + + return ((status == H_EOK) ? 0 : -1); +} + +int +promif_get_sun4v_api_version(void *p) +{ + cell_t *ci = (cell_t *)p; + uint64_t api_group; + uint64_t major; + uint64_t minor; + uint64_t status; + + ASSERT(ci[1] == 1); + ASSERT(ci[2] == 3); + + api_group = (uint64_t)p1275_cell2int(ci[3]); + + status = hv_api_get_version(api_group, &major, &minor); + + ci[4] = p1275_int2cell(status); + ci[5] = p1275_int2cell(major); + ci[6] = p1275_int2cell(minor); + + return ((status == H_EOK) ? 0 : -1); +} diff --git a/usr/src/uts/sun4v/sys/cnex.h b/usr/src/uts/sun4v/sys/cnex.h new file mode 100644 index 0000000000..f2b01a8ae7 --- /dev/null +++ b/usr/src/uts/sun4v/sys/cnex.h @@ -0,0 +1,98 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _CNEX_H +#define _CNEX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Channel nexus "reg" spec + */ +typedef struct cnex_regspec { + uint64_t physaddr; + uint64_t size; +} cnex_regspec_t; + +/* + * Channel nexus interrupt map + */ +struct cnex_pil_map { + ldc_dev_t devclass; /* LDC device class */ + uint32_t pil; /* PIL for device class */ +}; + +/* + * Channel interrupt information + */ +typedef struct cnex_intr { + uint64_t ino; /* dev intr number */ + uint64_t icookie; /* dev intr cookie */ + uint_t (*hdlr)(); /* intr handler */ + caddr_t arg1; /* intr argument 1 */ + caddr_t arg2; /* intr argument 2 */ + void *ssp; /* back ptr to soft state */ +} cnex_intr_t; + +/* cnex interrupt types */ +typedef enum { + CNEX_TX_INTR = 1, /* transmit interrupt */ + CNEX_RX_INTR /* receive interrupt */ +} cnex_intrtype_t; + +/* + * Channel information + */ +typedef struct cnex_ldc { + kmutex_t lock; /* Channel lock */ + struct cnex_ldc *next; + + uint64_t id; + ldc_dev_t devclass; /* Device class channel belongs to */ + + cnex_intr_t tx; /* Transmit interrupt */ + cnex_intr_t rx; /* Receive interrupt */ +} cnex_ldc_t; + +/* + * Channel nexus soft state pointer + */ +typedef struct cnex_soft_state { + dev_info_t *devi; + uint64_t cfghdl; /* cnex config handle */ + kmutex_t clist_lock; /* lock to protect channel list */ + cnex_ldc_t *clist; /* list of registered channels */ +} cnex_soft_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _CNEX_H */ diff --git a/usr/src/uts/sun4v/sys/cpu_module.h b/usr/src/uts/sun4v/sys/cpu_module.h index e1c386533b..902c088411 100644 --- 
a/usr/src/uts/sun4v/sys/cpu_module.h +++ b/usr/src/uts/sun4v/sys/cpu_module.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -154,6 +153,18 @@ extern void bzero(void *addr, size_t count); int cpu_trapstat_conf(int cmd); void cpu_trapstat_data(void *buf, uint_t pgszs); +#define NO_EU_MAPPING_FOUND 0xffffffff +/* + * Default MMU pagesize mask for sun4v architecture. + */ +#define DEFAULT_SUN4V_MMU_PAGESIZE_MASK ((1 << TTE8K) | (1 << TTE64K) \ + | (1 << TTE4M)) + +void cpu_setup_common(char **); + +boolean_t broken_md_flag; +int va_bits; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/sun4v/sys/dr_cpu.h b/usr/src/uts/sun4v/sys/dr_cpu.h new file mode 100644 index 0000000000..070645c556 --- /dev/null +++ b/usr/src/uts/sun4v/sys/dr_cpu.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_CPU_H +#define _DR_CPU_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * CPU DR Control Protocol + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * CPU DR Message Header + */ +typedef struct { + uint64_t req_num; /* request number */ + uint32_t msg_type; /* message type */ + uint32_t num_records; /* number of records */ +} dr_cpu_hdr_t; + +/* + * CPU command and response messages + */ + +#define DR_CPU_DS_ID "dr-cpu" + +#define DR_CPU_CONFIGURE ('C') +#define DR_CPU_UNCONFIGURE ('U') +#define DR_CPU_FORCE_UNCONFIG ('F') +#define DR_CPU_STATUS ('S') + +#define DR_CPU_OK ('o') +#define DR_CPU_ERROR ('e') + +/* + * Response Message + */ +typedef struct { + uint32_t cpuid; /* virtual CPU ID */ + uint32_t result; /* result of the operation */ + uint32_t status; /* status of the CPU */ + uint32_t string_off; /* informational string offset */ +} dr_cpu_stat_t; + +/* + * Result Codes + */ +#define DR_CPU_RES_OK 0x0 /* operation succeeded */ +#define DR_CPU_RES_FAILURE 0x1 /* operation failed */ +#define DR_CPU_RES_BLOCKED 0x2 /* operation was blocked */ +#define DR_CPU_RES_CPU_NOT_RESPONDING 0x3 /* CPU was not responding */ +#define DR_CPU_RES_NOT_IN_MD 0x4 /* CPU not defined in MD */ + +/* + * Status Codes + */ +#define DR_CPU_STAT_NOT_PRESENT 0x0 /* CPU ID not in MD */ +#define DR_CPU_STAT_UNCONFIGURED 0x1 /* CPU unconfigured */ +#define DR_CPU_STAT_CONFIGURED 0x2 /* CPU configured */ + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_CPU_H */ diff --git 
a/usr/src/uts/sun4v/sys/dr_util.h b/usr/src/uts/sun4v/sys/dr_util.h new file mode 100644 index 0000000000..944738ff29 --- /dev/null +++ b/usr/src/uts/sun4v/sys/dr_util.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _DR_UTIL_H +#define _DR_UTIL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v Common DR Header + */ + +#include <sys/ksynch.h> +#include <sys/cmn_err.h> +#include <sys/note.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Debugging support + */ +#ifdef DEBUG + +extern uint_t dr_debug; + +#define DR_DBG_FLAG_CTL 0x01 +#define DR_DBG_FLAG_CPU 0x02 +#define DR_DBG_FLAG_MEM 0x04 +#define DR_DBG_FLAG_IO 0x08 +#define DR_DBG_FLAG_TRANS 0x10 + +#define DR_DBG_ALL if (dr_debug) printf +#define DR_DBG_CTL if (dr_debug & DR_DBG_FLAG_CTL) printf +#define DR_DBG_CPU if (dr_debug & DR_DBG_FLAG_CPU) printf +#define DR_DBG_MEM if (dr_debug & DR_DBG_FLAG_MEM) printf +#define DR_DBG_IO if (dr_debug & DR_DBG_FLAG_IO) printf +#define DR_DBG_TRANS if (dr_debug & DR_DBG_FLAG_TRANS) printf + +#define DR_DBG_DUMP_MSG(buf, len) dr_dbg_dump_msg(buf, len) + +extern void dr_dbg_dump_msg(void *buf, size_t len); + +#else /* DEBUG */ + +#define DR_DBG_ALL _NOTE(CONSTCOND) if (0) printf +#define DR_DBG_CTL DR_DBG_ALL +#define DR_DBG_CPU DR_DBG_ALL +#define DR_DBG_MEM DR_DBG_ALL +#define DR_DBG_IO DR_DBG_ALL +#define DR_DBG_TRANS DR_DBG_ALL + +#define DR_DBG_DUMP_MSG(buf, len) + +#endif /* DEBUG */ + +typedef enum { + DR_TYPE_INVAL, + DR_TYPE_CPU, + DR_TYPE_MEM, + DR_TYPE_VIO, + DR_TYPE_DIO +} dr_type_t; + +/* + * Macro to convert a dr_type_t into a string. These strings are + * used to generate DR events and should only be modified using + * extreme caution. + */ +#define DR_TYPE2STR(t) ((t) == DR_TYPE_INVAL ? "invalid" : \ + (t) == DR_TYPE_CPU ? OBP_CPU : \ + (t) == DR_TYPE_MEM ? "memory" : \ + (t) == DR_TYPE_VIO ? "vio" : \ + (t) == DR_TYPE_DIO ? 
"dio" : \ + "unknown") + +extern boolean_t dr_is_disabled(dr_type_t type); +extern void dr_generate_event(dr_type_t type, int se_hint); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_UTIL_H */ diff --git a/usr/src/uts/sun4v/sys/ds.h b/usr/src/uts/sun4v/sys/ds.h new file mode 100644 index 0000000000..cd5efa807f --- /dev/null +++ b/usr/src/uts/sun4v/sys/ds.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DS_H +#define _DS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Domain Services Client Interface + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint64_t ds_svc_hdl_t; /* opaque service handle */ +typedef void *ds_cb_arg_t; /* client specified callback arg */ + +#define DS_INVALID_HDL (0) /* a ds handle cannot be zero */ + +/* + * Domain Services Versioning + */ +typedef struct ds_ver { + uint16_t major; + uint16_t minor; +} ds_ver_t; + +/* + * Domain Services Capability + * + * A DS capability is exported by a client using a unique service + * identifier string. 
Along with this identifier is the list of + * versions of the capability that the client supports. + */ +typedef struct ds_capability { + char *svc_id; /* service identifier */ + ds_ver_t *vers; /* list of supported versions */ + int nvers; /* number of supported versions */ +} ds_capability_t; + +/* + * Domain Services Client Event Callbacks + * + * A client implementing a DS capability provides a set of callbacks + * when it registers with the DS framework. The use of these callbacks + * is described below: + * + * ds_reg_cb(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl) + * + * The ds_reg_cb() callback is invoked when the DS framework + * has successfully completed version negotiation with the + * remote endpoint for the capability. It provides the client + * with the negotiated version and a handle to use when sending + * data. + * + * ds_unreg_cb(ds_cb_arg_t arg) + * + * The ds_unreg_cb() callback is invoked when the DS framework + * detects an event that causes the registered capability to + * become unavailable. This includes an explicit unregister + * message, a failure in the underlying communication transport, + * etc. Any such event invalidates the service handle that was + * received from the register callback. + * + * ds_data_cb(ds_cb_arg_t arg, void *buf, size_t buflen) + * + * The ds_data_cb() callback is invoked whenever there is an + * incoming data message for the client to process. It provides + * the contents of the message along with the message length. 
+ */ +typedef struct ds_clnt_ops { + void (*ds_reg_cb)(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl); + void (*ds_unreg_cb)(ds_cb_arg_t arg); + void (*ds_data_cb)(ds_cb_arg_t arg, void *buf, size_t buflen); + ds_cb_arg_t cb_arg; +} ds_clnt_ops_t; + +/* + * Domain Services Capability Interface + */ +extern int ds_cap_init(ds_capability_t *cap, ds_clnt_ops_t *ops); +extern int ds_cap_fini(ds_capability_t *cap); +extern int ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t buflen); + +#ifdef __cplusplus +} +#endif + +#endif /* _DS_H */ diff --git a/usr/src/uts/sun4v/sys/ds_impl.h b/usr/src/uts/sun4v/sys/ds_impl.h new file mode 100644 index 0000000000..461214f4e3 --- /dev/null +++ b/usr/src/uts/sun4v/sys/ds_impl.h @@ -0,0 +1,332 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DS_IMPL_H +#define _DS_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The Domain Services Protocol + * + * The DS protocol is divided into two parts. The first is fixed and + * must remain exactly the same for *all* versions of the DS protocol. 
+ * The only messages supported by the fixed portion of the protocol are + * to negotiate a version to use for the rest of the protocol. + */ + +/* + * Domain Services Header + */ +typedef struct ds_hdr { + uint32_t msg_type; /* message type */ + uint32_t payload_len; /* payload length */ +} ds_hdr_t; + +#define DS_HDR_SZ (sizeof (ds_hdr_t)) + +/* + * DS Fixed Message Types + */ +#define DS_INIT_REQ 0x0 /* initiate DS connection */ +#define DS_INIT_ACK 0x1 /* initiation acknowledgement */ +#define DS_INIT_NACK 0x2 /* initiation negative acknowledgment */ + +/* + * DS Fixed Initialization Messages + */ +typedef struct ds_init_req { + uint16_t major_vers; /* requested major version */ + uint16_t minor_vers; /* requested minor version */ +} ds_init_req_t; + +typedef struct ds_init_ack { + uint16_t minor_vers; /* highest supported minor version */ +} ds_init_ack_t; + +typedef struct ds_init_nack { + uint16_t major_vers; /* alternate supported major version */ +} ds_init_nack_t; + +/* + * DS Message Types for Version 1.0 + */ +#define DS_REG_REQ 0x3 /* register a service */ +#define DS_REG_ACK 0x4 /* register acknowledgement */ +#define DS_REG_NACK 0x5 /* register failed */ +#define DS_UNREG 0x6 /* unregister a service */ +#define DS_UNREG_ACK 0x7 /* unregister acknowledgement */ +#define DS_UNREG_NACK 0x8 /* unregister failed */ +#define DS_DATA 0x9 /* data message */ +#define DS_NACK 0xa /* data error */ + +/* result codes */ +#define DS_OK 0x0 /* success */ +#define DS_REG_VER_NACK 0x1 /* unsupported major version */ +#define DS_REG_DUP 0x2 /* duplicate registration attempted */ +#define DS_INV_HDL 0x3 /* service handle not valid */ +#define DS_TYPE_UNKNOWN 0x4 /* unknown message type received */ + +/* + * Service Register Messages + */ +typedef struct ds_reg_req { + uint64_t svc_handle; /* service handle to register */ + uint16_t major_vers; /* requested major version */ + uint16_t minor_vers; /* requested minor version */ + char svc_id[1]; /* service identifier 
string */ +} ds_reg_req_t; + +typedef struct ds_reg_ack { + uint64_t svc_handle; /* service handle sent in register */ + uint16_t minor_vers; /* highest supported minor version */ +} ds_reg_ack_t; + +typedef struct ds_reg_nack { + uint64_t svc_handle; /* service handle sent in register */ + uint64_t result; /* reason for the failure */ + uint16_t major_vers; /* alternate supported major version */ +} ds_reg_nack_t; + +/* + * Service Unregister Messages + */ +typedef struct ds_unreg_req { + uint64_t svc_handle; /* service handle to unregister */ +} ds_unreg_req_t; + +typedef struct ds_unreg_ack { + uint64_t svc_handle; /* service handle sent in unregister */ +} ds_unreg_ack_t; + +typedef struct ds_unreg_nack { + uint64_t svc_handle; /* service handle sent in unregister */ +} ds_unreg_nack_t; + +/* + * Data Transfer Messages + */ +typedef struct ds_data_handle { + uint64_t svc_handle; /* service handle for data */ +} ds_data_handle_t; + +typedef struct ds_data_nack { + uint64_t svc_handle; /* service handle sent in data msg */ + uint64_t result; /* reason for failure */ +} ds_data_nack_t; + +/* + * Message Processing Utilities + */ +#define DS_MSG_TYPE_VALID(type) ((type) <= DS_NACK) +#define DS_MSG_LEN(ds_type) (sizeof (ds_hdr_t) + sizeof (ds_type)) + + +/* + * Domain Service Port + * + * A DS port is a logical representation of an LDC dedicated to + * communication between DS endpoints. The ds_port_t maintains state + * associated with a connection to a remote endpoint. This includes + * the state of the port, the LDC state, the current version of the + * DS protocol in use on the port, and other port properties. + * + * Locking: The port is protected by a single mutex. 
It must be held + * while the port structure is being accessed and also when data is + * being read or written using the port + */ +typedef enum { + DS_PORT_FREE, /* port structure not in use */ + DS_PORT_INIT, /* port structure created */ + DS_PORT_LDC_INIT, /* ldc successfully initialized */ + DS_PORT_INIT_REQ, /* initialization handshake sent */ + DS_PORT_READY /* init handshake completed */ +} ds_port_state_t; + +typedef struct ds_ldc { + uint64_t id; /* LDC id */ + ldc_handle_t hdl; /* LDC handle */ + ldc_status_t state; /* current LDC state */ +} ds_ldc_t; + +typedef struct ds_port { + kmutex_t lock; /* port lock */ + uint64_t id; /* port id from MD */ + ds_port_state_t state; /* state of the port */ + ds_ver_t ver; /* DS protocol version in use */ + uint32_t ver_idx; /* index of version during handshake */ + ds_ldc_t ldc; /* LDC for this port */ +} ds_port_t; + +/* + * A DS portset is a bitmap that represents a collection of DS + * ports. Each bit represent a particular port id. The current + * implementation constrains the maximum number of ports to 64. + */ +typedef uint64_t ds_portset_t; + +#define DS_MAX_PORTS ((sizeof (ds_portset_t)) * 8) +#define DS_MAX_PORT_ID (DS_MAX_PORTS - 1) + +#define DS_PORT_SET(port) (1UL << (port)) +#define DS_PORT_IN_SET(set, port) ((set) & DS_PORT_SET(port)) +#define DS_PORTSET_ADD(set, port) ((void)((set) |= DS_PORT_SET(port))) +#define DS_PORTSET_DEL(set, port) ((void)((set) &= ~DS_PORT_SET(port))) +#define DS_PORTSET_ISNULL(set) ((set) == 0) +#define DS_PORTSET_DUP(set1, set2) ((void)((set1) = (set2))) + +/* + * LDC Information + */ +#define DS_QUEUE_LEN 128 /* LDC queue size */ + +/* + * Machine Description Constants + */ +#define DS_MD_PORT_NAME "domain-services-port" +#define DS_MD_CHAN_NAME "channel-endpoint" + +/* + * DS Services + * + * A DS Service is a mapping between a DS capability and a client + * of the DS framework that provides that capability. 
It includes + * information on the state of the service, the currently negotiated + * version of the capability specific protocol, the port that is + * currently in use by the capability, etc. + */ + +typedef enum { + DS_SVC_INVAL, /* svc structure uninitialized */ + DS_SVC_FREE, /* svc structure not in use */ + DS_SVC_INACTIVE, /* svc not registered */ + DS_SVC_REG_PENDING, /* register message sent */ + DS_SVC_ACTIVE /* register message acknowledged */ +} ds_svc_state_t; + +typedef struct ds_svc { + ds_capability_t cap; /* capability information */ + ds_clnt_ops_t ops; /* client ops vector */ + ds_svc_hdl_t hdl; /* handle assigned by DS */ + ds_svc_state_t state; /* current service state */ + ds_ver_t ver; /* svc protocol version in use */ + uint_t ver_idx; /* index into client version array */ + ds_port_t *port; /* port for this service */ + ds_portset_t avail; /* ports available to this service */ +} ds_svc_t; + +#define DS_SVC_ISFREE(svc) ((svc == NULL) || (svc->state == DS_SVC_FREE)) + +/* + * A service handle is a 64 bit value with two pieces of information + * encoded in it. The upper 32 bits is the index into the table of + * a particular service structure. The lower 32 bits is a counter + * that is incremented each time a service structure is reused. + */ +#define DS_IDX_SHIFT 32 +#define DS_COUNT_MASK 0xfffffffful + +#define DS_ALLOC_HDL(_idx, _count) (((uint64_t)_idx << DS_IDX_SHIFT) | \ + ((uint64_t)(_count + 1) & \ + DS_COUNT_MASK)) +#define DS_HDL2IDX(hdl) (hdl >> DS_IDX_SHIFT) +#define DS_HDL2COUNT(hdl) (hdl & DS_COUNT_MASK) + +/* + * DS Message Logging + * + * The DS framework logs all incoming and outgoing messages to a + * ring buffer. This provides the ability to reconstruct a trace + * of DS activity for use in debugging. In addition to the message + * data, each log entry contains a timestamp and the destination + * of the message. The destination is based on the port number the + * message passed through (port number + 1). 
The sign of the dest + * field distinguishes incoming messages from outgoing messages. + * Incoming messages have a negative destination field. + */ + +typedef struct ds_log_entry { + struct ds_log_entry *next; /* next in log or free list */ + struct ds_log_entry *prev; /* previous in log */ + time_t timestamp; /* time message added to log */ + size_t datasz; /* size of the data */ + void *data; /* the data itself */ + int32_t dest; /* message destination */ +} ds_log_entry_t; + +#define DS_LOG_IN(pid) (-(pid + 1)) +#define DS_LOG_OUT(pid) (pid + 1) + +/* + * DS Log Limits: + * + * The size of the log is controlled by two limits. The first is + * a soft limit that is configurable by the user (via the global + * variable ds_log_sz). When this limit is exceeded, each new + * message that is added to the log replaces the oldest message. + * + * The second is a hard limit that is calculated based on the soft + * limit (DS_LOG_LIMIT). It is defined to be ~3% above the soft limit. + * Once this limit is exceeded, a thread is scheduled to delete old + * messages until the size of the log is below the soft limit. + */ +#define DS_LOG_DEFAULT_SZ (128 * 1024) /* 128 KB */ + +#define DS_LOG_LIMIT (ds_log_sz + (ds_log_sz >> 5)) + +#define DS_LOG_ENTRY_SZ(ep) (sizeof (ds_log_entry_t) + (ep)->datasz) + +/* + * DS Log Memory Usage: + * + * The log free list is initialized from a pre-allocated pool of entry + * structures (the global ds_log_entry_pool). The number of entries + * in the pool (DS_LOG_NPOOL) is the number of entries that would + * take up half the default size of the log. + * + * As messages are added to the log, entry structures are pulled from + * the free list. If the free list is empty, memory is allocated for + * the entry. When entries are removed from the log, they are placed + * on the free list. Allocated memory is only deallocated when the + * entire log is destroyed. 
+ */ +#define DS_LOG_NPOOL ((DS_LOG_DEFAULT_SZ >> 1) / \ + sizeof (ds_log_entry_t)) + +#define DS_LOG_POOL_END (ds_log_entry_pool + DS_LOG_NPOOL) + +#define DS_IS_POOL_ENTRY(ep) (((ep) >= ds_log_entry_pool) && \ + ((ep) <= &(ds_log_entry_pool[DS_LOG_NPOOL]))) + +#ifdef __cplusplus +} +#endif + +#endif /* _DS_IMPL_H */ diff --git a/usr/src/uts/sun4v/sys/error.h b/usr/src/uts/sun4v/sys/error.h index bad9123cec..eac767ed56 100644 --- a/usr/src/uts/sun4v/sys/error.h +++ b/usr/src/uts/sun4v/sys/error.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -33,15 +32,11 @@ extern "C" { #endif -#define CPU_RQ_ENTRIES 64 -#define CPU_NRQ_ENTRIES 64 - - /* * Resumable and Non-resumable queues */ -#define CPU_RQ 0x3e -#define CPU_NRQ 0x3f +#define CPU_RQ_ENTRIES 64 +#define CPU_NRQ_ENTRIES 64 #define Q_ENTRY_SIZE 64 #define CPU_RQ_SIZE (CPU_RQ_ENTRIES * Q_ENTRY_SIZE) #define CPU_NRQ_SIZE (CPU_NRQ_ENTRIES * Q_ENTRY_SIZE) diff --git a/usr/src/uts/sun4v/sys/fault_iso.h b/usr/src/uts/sun4v/sys/fault_iso.h new file mode 100644 index 0000000000..1566386df5 --- /dev/null +++ b/usr/src/uts/sun4v/sys/fault_iso.h @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _FAULT_ISO_H +#define _FAULT_ISO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* FMA CPU message numbers */ +#define FMA_CPU_REQ_STATUS 0x0 +#define FMA_CPU_REQ_OFFLINE 0x1 +#define FMA_CPU_REQ_ONLINE 0x2 + +typedef struct { + uint64_t req_num; + uint32_t msg_type; + uint32_t cpu_id; +} fma_cpu_service_req_t; + +/* FMA CPU result codes */ +#define FMA_CPU_RESP_OK 0x0 +#define FMA_CPU_RESP_FAILURE 0x1 + +/* FMA CPU status codes */ +#define FMA_CPU_STAT_ONLINE 0x0 +#define FMA_CPU_STAT_OFFLINE 0x1 +#define FMA_CPU_STAT_ILLEGAL 0x2 + +typedef struct { + uint64_t req_num; + uint32_t result; + uint32_t status; +} fma_cpu_resp_t; + +/* FMA memory services message numbers */ +#define FMA_MEM_REQ_STATUS 0x0 +#define FMA_MEM_REQ_RETIRE 0x1 +#define FMA_MEM_REQ_RESURRECT 0x2 + +typedef struct { + uint64_t req_num; + uint32_t msg_type; + uint32_t _resvd; + uint64_t real_addr; + uint64_t length; +} fma_mem_service_req_t; + +/* FMA result codes */ +#define FMA_MEM_RESP_OK 0x0 +#define FMA_MEM_RESP_FAILURE 0x1 + +/* FMA status codes */ +#define FMA_MEM_STAT_NOTRETIRED 0x0 +#define FMA_MEM_STAT_RETIRED 0x1 +#define FMA_MEM_STAT_ILLEGAL 0x2 + +typedef struct { + uint64_t req_num; + uint32_t result; + uint32_t status; + uint64_t res_addr; + uint64_t res_length; +} fma_mem_resp_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _FAULT_ISO_H */ 
diff --git a/usr/src/uts/sun4v/sys/hsvc.h b/usr/src/uts/sun4v/sys/hsvc.h index 80c2a58172..72e2042a80 100644 --- a/usr/src/uts/sun4v/sys/hsvc.h +++ b/usr/src/uts/sun4v/sys/hsvc.h @@ -43,6 +43,7 @@ extern "C" { */ #define HSVC_GROUP_SUN4V 0x0000 #define HSVC_GROUP_CORE 0x0001 +#define HSVC_GROUP_INTR 0x0002 #define HSVC_GROUP_VPCI 0x0100 #define HSVC_GROUP_LDC 0x0101 #define HSVC_GROUP_VSC 0x0102 diff --git a/usr/src/uts/sun4v/sys/hypervisor_api.h b/usr/src/uts/sun4v/sys/hypervisor_api.h index 57e4808a75..d750b83ae5 100644 --- a/usr/src/uts/sun4v/sys/hypervisor_api.h +++ b/usr/src/uts/sun4v/sys/hypervisor_api.h @@ -49,6 +49,8 @@ extern "C" { #define MMU_MAP_ADDR 0x83 #define MMU_UNMAP_ADDR 0x84 +#define CORE_TRAP 0xff + /* * Error returns in %o0. * (Additional result is returned in %o1.) @@ -71,6 +73,8 @@ extern "C" { #define H_ENOMAP 14 /* Mapping is not valid, */ /* no translation exists */ #define H_EBUSY 17 /* Resource busy */ +#define H_ETOOMANY 15 /* Hard resource limit exceeded */ +#define H_ECHANNEL 16 /* Illegal LDC channel */ #define H_BREAK -1 /* Console Break */ #define H_HUP -2 /* Console Break */ @@ -85,9 +89,15 @@ extern "C" { */ #define HV_MACH_EXIT 0x00 #define HV_MACH_DESC 0x01 +#define HV_MACH_SIR 0x02 + +#define HV_CPU_START 0x10 +#define HV_CPU_STOP 0x11 #define HV_CPU_YIELD 0x12 -#define CPU_QCONF 0x14 +#define HV_CPU_QCONF 0x14 #define HV_CPU_STATE 0x17 +#define HV_CPU_SET_RTBA 0x18 + #define MMU_TSB_CTX0 0x20 #define MMU_TSB_CTXNON0 0x21 #define MMU_DEMAP_PAGE 0x22 @@ -95,20 +105,24 @@ extern "C" { #define MMU_DEMAP_ALL 0x24 #define MAP_PERM_ADDR 0x25 #define MMU_SET_INFOPTR 0x26 +#define MMU_ENABLE 0x27 #define UNMAP_PERM_ADDR 0x28 + #define HV_MEM_SCRUB 0x31 #define HV_MEM_SYNC 0x32 + #define HV_INTR_SEND 0x42 + #define TOD_GET 0x50 #define TOD_SET 0x51 -#define CONS_READ 0x60 -#define CONS_WRITE 0x61 + +#define CONS_GETCHAR 0x60 +#define CONS_PUTCHAR 0x61 #define TTRACE_BUF_CONF 0x90 #define TTRACE_BUF_INFO 0x91 #define TTRACE_ENABLE 0x92 
#define TTRACE_FREEZE 0x93 - #define DUMP_BUF_UPDATE 0x94 #define HVIO_INTR_DEVINO2SYSINO 0xa0 @@ -119,6 +133,31 @@ extern "C" { #define HVIO_INTR_GETTARGET 0xa5 #define HVIO_INTR_SETTARGET 0xa6 +#define VINTR_GET_COOKIE 0xa7 +#define VINTR_SET_COOKIE 0xa8 +#define VINTR_GET_VALID 0xa9 +#define VINTR_SET_VALID 0xaa +#define VINTR_GET_STATE 0xab +#define VINTR_SET_STATE 0xac +#define VINTR_GET_TARGET 0xad +#define VINTR_SET_TARGET 0xae + +#define LDC_TX_QCONF 0xe0 +#define LDC_TX_QINFO 0xe1 +#define LDC_TX_GET_STATE 0xe2 +#define LDC_TX_SET_QTAIL 0xe3 +#define LDC_RX_QCONF 0xe4 +#define LDC_RX_QINFO 0xe5 +#define LDC_RX_GET_STATE 0xe6 +#define LDC_RX_SET_QHEAD 0xe7 + +#define LDC_SET_MAP_TABLE 0xea +#define LDC_GET_MAP_TABLE 0xeb +#define LDC_COPY 0xec +#define LDC_MAPIN 0xed +#define LDC_UNMAP 0xee +#define LDC_REVOKE 0xef + #ifdef SET_MMU_STATS #define MMU_STAT_AREA 0xfc #endif /* SET_MMU_STATS */ @@ -127,6 +166,15 @@ extern "C" { #define HV_HPRIV 0x201 /* + * Function numbers for CORE_TRAP. + */ +#define API_SET_VERSION 0x00 +#define API_PUT_CHAR 0x01 +#define API_EXIT 0x02 +#define API_GET_VERSION 0x03 + + +/* * Bits for MMU functions flags argument: * arg3 of MMU_MAP_ADDR * arg3 of MMU_DEMAP_CTX @@ -188,14 +236,14 @@ struct mmu_stat { }; #endif /* SET_MMU_STATS */ -#endif /* _ASM */ +#endif /* ! 
_ASM */ /* * CPU States */ #define CPU_STATE_INVALID 0x0 -#define CPU_STATE_IDLE 0x1 /* cpu not started */ -#define CPU_STATE_GUEST 0x2 /* cpu running guest code */ +#define CPU_STATE_STOPPED 0x1 /* cpu not started */ +#define CPU_STATE_RUNNING 0x2 /* cpu running guest code */ #define CPU_STATE_ERROR 0x3 /* cpu is in the error state */ #define CPU_STATE_LAST_PUBLIC CPU_STATE_ERROR /* last valid state */ @@ -256,19 +304,34 @@ struct mmu_stat { #define HVIO_DMA_SYNC_DIR_TO_DEV 0x01 #define HVIO_DMA_SYNC_DIR_FROM_DEV 0x02 +/* + * LDC Channel States + */ +#define LDC_CHANNEL_DOWN 0x0 +#define LDC_CHANNEL_UP 0x1 +#define LDC_CHANNEL_RESET 0x2 + #ifndef _ASM extern uint64_t hv_mmu_map_perm_addr(void *, int, uint64_t, int); extern uint64_t hv_mmu_unmap_perm_addr(void *, int, int); +extern uint64_t hv_mach_exit(uint64_t exit_code); +extern uint64_t hv_mach_sir(void); + +extern uint64_t hv_cpu_start(uint64_t cpuid, uint64_t pc, uint64_t rtba, + uint64_t arg); +extern uint64_t hv_cpu_stop(uint64_t cpuid); +extern uint64_t hv_cpu_set_rtba(uint64_t *rtba); + extern uint64_t hv_set_ctx0(uint64_t, uint64_t); extern uint64_t hv_set_ctxnon0(uint64_t, uint64_t); +extern uint64_t hv_mmu_fault_area_conf(void *raddr); #ifdef SET_MMU_STATS extern uint64_t hv_mmu_set_stat_area(uint64_t, uint64_t); #endif /* SET_MMU_STATS */ extern uint64_t hv_cpu_qconf(int queue, uint64_t paddr, int size); -extern uint64_t hv_cpu_yield(); - +extern uint64_t hv_cpu_yield(void); extern uint64_t hv_cpu_state(uint64_t cpuid, uint64_t *cpu_state); extern uint64_t hv_mem_scrub(uint64_t real_addr, uint64_t length, uint64_t *scrubbed_len); @@ -282,7 +345,6 @@ extern uint64_t hv_service_send(uint64_t s_id, uint64_t buf_pa, extern uint64_t hv_service_getstatus(uint64_t s_id, uint64_t *vreg); extern uint64_t hv_service_setstatus(uint64_t s_id, uint64_t bits); extern uint64_t hv_service_clrstatus(uint64_t s_id, uint64_t bits); - extern uint64_t hv_mach_desc(uint64_t buffer_ra, uint64_t *buffer_sizep); extern 
uint64_t hv_ttrace_buf_info(uint64_t *, uint64_t *); @@ -300,16 +362,64 @@ extern uint64_t hv_tod_set(uint64_t); extern uint64_t hvio_intr_devino_to_sysino(uint64_t dev_hdl, uint32_t devino, uint64_t *sysino); extern uint64_t hvio_intr_getvalid(uint64_t sysino, - int *intr_valid_state); + int *intr_valid_state); extern uint64_t hvio_intr_setvalid(uint64_t sysino, - int intr_valid_state); + int intr_valid_state); extern uint64_t hvio_intr_getstate(uint64_t sysino, - int *intr_state); + int *intr_state); extern uint64_t hvio_intr_setstate(uint64_t sysino, int intr_state); extern uint64_t hvio_intr_gettarget(uint64_t sysino, uint32_t *cpuid); extern uint64_t hvio_intr_settarget(uint64_t sysino, uint32_t cpuid); -#endif +extern uint64_t hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base, + uint64_t nentries); +extern uint64_t hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base, + uint64_t *nentries); +extern uint64_t hv_ldc_tx_get_state(uint64_t channel, uint64_t *headp, + uint64_t *tailp, uint64_t *state); +extern uint64_t hv_ldc_tx_set_qtail(uint64_t channel, uint64_t tail); +extern uint64_t hv_ldc_rx_qconf(uint64_t channel, uint64_t ra_base, + uint64_t nentries); +extern uint64_t hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base, + uint64_t *nentries); +extern uint64_t hv_ldc_rx_get_state(uint64_t channel, uint64_t *headp, + uint64_t *tailp, uint64_t *state); +extern uint64_t hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head); + +extern uint64_t hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra, + uint64_t tbl_entries); +extern uint64_t hv_ldc_get_map_table(uint64_t channel, uint64_t *tbl_ra, + uint64_t *tbl_entries); +extern uint64_t hv_ldc_copy(uint64_t channel, uint64_t request, + uint64_t cookie, uint64_t raddr, uint64_t length, uint64_t *lengthp); +extern uint64_t hv_ldc_mapin(uint64_t channel, uint64_t cookie, + uint64_t *raddr, uint64_t *perm); +extern uint64_t hv_ldc_unmap(uint64_t raddr); +extern uint64_t hv_ldc_revoke(uint64_t raddr); +extern 
uint64_t hv_api_get_version(uint64_t api_group, uint64_t *majorp, + uint64_t *minorp); +extern uint64_t hv_api_set_version(uint64_t api_group, uint64_t major, + uint64_t minor, uint64_t *supported_minor); + +extern uint64_t hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino, + uint64_t *cookie); +extern uint64_t hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino, + uint64_t cookie); +extern uint64_t hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino, + int *intr_valid_state); +extern uint64_t hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino, + int intr_valid_state); +extern uint64_t hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino, + int *intr_state); +extern uint64_t hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino, + int intr_state); +extern uint64_t hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino, + uint32_t *cpuid); +extern uint64_t hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino, + uint32_t cpuid); + +#endif /* ! _ASM */ + #ifdef __cplusplus } diff --git a/usr/src/uts/sun4v/sys/ldc.h b/usr/src/uts/sun4v/sys/ldc.h new file mode 100644 index 0000000000..a9718b6591 --- /dev/null +++ b/usr/src/uts/sun4v/sys/ldc.h @@ -0,0 +1,221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LDC_H +#define _LDC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ioctl.h> +#include <sys/processor.h> + +/* Types */ +typedef uint64_t ldc_handle_t; /* Channel handle */ +typedef uint64_t ldc_mem_handle_t; /* Channel memory handle */ +typedef uint64_t ldc_dring_handle_t; /* Descriptor ring handle */ + +/* LDC transport mode */ +typedef enum { + LDC_MODE_RAW, /* Raw mode */ + LDC_MODE_UNRELIABLE, /* Unreliable packet mode */ + LDC_MODE_RELIABLE, /* Reliable packet mode */ + LDC_MODE_STREAM /* Reliable byte stream */ +} ldc_mode_t; + +/* LDC message payload sizes */ +#define LDC_ELEM_SIZE 8 /* size in bytes */ +#define LDC_PACKET_SIZE (LDC_ELEM_SIZE * 8) +#define LDC_PAYLOAD_SIZE_RAW (LDC_PACKET_SIZE) +#define LDC_PAYLOAD_SIZE_UNRELIABLE (LDC_PACKET_SIZE - LDC_ELEM_SIZE) +#define LDC_PAYLOAD_SIZE_RELIABLE (LDC_PACKET_SIZE - (LDC_ELEM_SIZE * 2)) + +/* LDC Channel Status */ +typedef enum { + LDC_INIT = 1, /* Channel initialized */ + LDC_OPEN, /* Channel open */ + LDC_READY, /* Channel peer opened (hw-link-up) */ + LDC_UP /* Channel UP - ready for data xfer */ +} ldc_status_t; + +/* Callback return values */ +#define LDC_SUCCESS 0 +#define LDC_FAILURE 1 + +/* LDC callback mode */ +typedef enum { + LDC_CB_ENABLE, /* Enable callbacks */ + LDC_CB_DISABLE /* Disable callbacks */ +} ldc_cb_mode_t; + +/* Callback events */ +#define LDC_EVT_DOWN 0x1 /* Channel DOWN, status = OPEN */ +#define LDC_EVT_RESET 0x2 /* Channel RESET, status = READY */ +#define LDC_EVT_UP 0x4 /* Channel UP, status = UP 
*/ +#define LDC_EVT_READ 0x8 /* Channel has data for read */ +#define LDC_EVT_WRITE 0x10 /* Channel has space for write */ + +/* LDC device classes */ +typedef enum { + LDC_DEV_GENERIC = 1, /* generic device */ + LDC_DEV_BLK, /* block device, eg. vdc */ + LDC_DEV_BLK_SVC, /* block device service, eg. vds */ + LDC_DEV_NT, /* network device, eg. vnet */ + LDC_DEV_NT_SVC, /* network service eg. vsw */ + LDC_DEV_SERIAL /* serial device eg. vldc, vcc */ +} ldc_dev_t; + +/* Channel nexus registration */ +typedef struct ldc_cnex { + dev_info_t *dip; /* dip of channel nexus */ + int (*reg_chan)(); /* interface for channel register */ + int (*unreg_chan)(); /* interface for channel unregister */ + int (*add_intr)(); /* interface for adding interrupts */ + int (*rem_intr)(); /* interface for removing interrupts */ + int (*clr_intr)(); /* interface for clearing interrupts */ +} ldc_cnex_t; + +/* LDC attribute structure */ + +/* + * FIXME: Attribute passed in should be an MTU size + * Allocate the queue internally to ldc module to accomodate + * based on MTU size. For streaming mode, size can be zero. + */ + +typedef struct ldc_attr { + ldc_dev_t devclass; /* device class */ + uint64_t instance; /* device class instance */ + ldc_mode_t mode; /* channel mode */ + uint64_t qlen; /* channel queue elements */ +} ldc_attr_t; + +/* LDC memory cookie */ +typedef struct ldc_mem_cookie { + uint64_t addr; /* cookie address */ + uint64_t size; /* size @ offset */ +} ldc_mem_cookie_t; + +/* + * LDC Memory Map Type + * Specifies how shared memory being created is shared with its + * peer and/or how the peer has mapped in the exported memory. 
+ */ +#define LDC_SHADOW_MAP 0x1 /* share mem via shadow copy only */ +#define LDC_DIRECT_MAP 0x2 /* share mem direct access */ +#define LDC_IO_MAP 0x4 /* share mem for IOMMU/DMA access */ + +/* LDC Memory Access Permissions */ +#define LDC_MEM_R 0x1 /* Memory region is read only */ +#define LDC_MEM_W 0x2 /* Memory region is write only */ +#define LDC_MEM_X 0x4 /* Memory region is execute only */ +#define LDC_MEM_RW (LDC_MEM_R|LDC_MEM_W) +#define LDC_MEM_RWX (LDC_MEM_R|LDC_MEM_W|LDC_MEM_X) + +/* LDC Memory Copy Direction */ +#define LDC_COPY_IN 0x0 /* Copy data to VA from cookie mem */ +#define LDC_COPY_OUT 0x1 /* Copy data from VA to cookie mem */ + +/* LDC memory/dring (handle) status */ +typedef enum { + LDC_UNBOUND, /* Memory handle is unbound */ + LDC_BOUND, /* Memory handle is bound */ + LDC_MAPPED /* Memory handle is mapped */ +} ldc_mstatus_t; + +/* LDC [dring] memory info */ +typedef struct ldc_mem_info { + uint8_t mtype; /* map type */ + uint8_t perm; /* RWX permissions */ + caddr_t vaddr; /* base VA */ + uintptr_t raddr; /* base RA */ + ldc_mstatus_t status; /* dring/mem handle status */ +} ldc_mem_info_t; + +/* API functions */ +int ldc_register(ldc_cnex_t *cinfo); +int ldc_unregister(ldc_cnex_t *cinfo); + +int ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle); +int ldc_fini(ldc_handle_t handle); +int ldc_open(ldc_handle_t handle); +int ldc_close(ldc_handle_t handle); +int ldc_up(ldc_handle_t handle); +int ldc_reset(ldc_handle_t handle); +int ldc_reg_callback(ldc_handle_t handle, + uint_t(*callback)(uint64_t event, caddr_t arg), caddr_t arg); +int ldc_unreg_callback(ldc_handle_t handle); +int ldc_set_cb_mode(ldc_handle_t handle, ldc_cb_mode_t imode); +int ldc_chkq(ldc_handle_t handle, boolean_t *isempty); +int ldc_read(ldc_handle_t handle, caddr_t buf, size_t *size); +int ldc_write(ldc_handle_t handle, caddr_t buf, size_t *size); +int ldc_status(ldc_handle_t handle, ldc_status_t *status); + +int ldc_mem_alloc_handle(ldc_handle_t handle, 
ldc_mem_handle_t *mhandle); +int ldc_mem_free_handle(ldc_mem_handle_t mhandle); +int ldc_mem_bind_handle(ldc_mem_handle_t mhandle, caddr_t vaddr, size_t len, + uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount); +int ldc_mem_unbind_handle(ldc_mem_handle_t mhandle); +int ldc_mem_info(ldc_mem_handle_t mhandle, ldc_mem_info_t *minfo); +int ldc_mem_nextcookie(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie); +int ldc_mem_copy(ldc_handle_t handle, caddr_t vaddr, uint64_t off, size_t *len, + ldc_mem_cookie_t *cookies, uint32_t ccount, uint8_t direction); +int ldc_mem_rdwr_pa(ldc_handle_t handle, caddr_t vaddr, size_t *size, + caddr_t paddr, uint8_t direction); +int ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie, + uint32_t ccount, uint8_t mtype, caddr_t *vaddr, caddr_t *raddr); +int ldc_mem_acquire(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size); +int ldc_mem_release(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size); + +int ldc_mem_dring_create(uint32_t len, uint32_t dsize, + ldc_dring_handle_t *dhandle); +int ldc_mem_dring_destroy(ldc_dring_handle_t dhandle); +int ldc_mem_dring_bind(ldc_handle_t handle, ldc_dring_handle_t dhandle, + uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *dcookie, uint32_t *ccount); +int ldc_mem_dring_nextcookie(ldc_dring_handle_t mhandle, + ldc_mem_cookie_t *cookie); +int ldc_mem_dring_unbind(ldc_dring_handle_t dhandle); +int ldc_mem_dring_info(ldc_dring_handle_t dhandle, ldc_mem_info_t *minfo); +int ldc_mem_dring_map(ldc_handle_t handle, ldc_mem_cookie_t *cookie, + uint32_t ccount, uint32_t len, uint32_t dsize, uint8_t mtype, + ldc_dring_handle_t *dhandle); +int ldc_mem_dring_unmap(ldc_dring_handle_t dhandle); +int ldc_mem_dring_acquire(ldc_dring_handle_t dhandle, uint64_t start, + uint64_t end); +int ldc_mem_dring_release(ldc_dring_handle_t dhandle, uint64_t start, + uint64_t end); + +#ifdef __cplusplus +} +#endif + +#endif /* _LDC_H */ diff --git a/usr/src/uts/sun4v/sys/ldc_impl.h 
b/usr/src/uts/sun4v/sys/ldc_impl.h new file mode 100644 index 0000000000..c4fd0ef973 --- /dev/null +++ b/usr/src/uts/sun4v/sys/ldc_impl.h @@ -0,0 +1,487 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LDC_IMPL_H +#define _LDC_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ioctl.h> + +/* Memory map table size */ +#define MTBL_MAX_SIZE 65536 /* 64K */ + +/* Define LDC Queue info */ +#define LDC_PACKET_SHIFT 6 +#define LDC_QUEUE_ENTRIES 128 +#define LDC_QUEUE_SIZE (LDC_QUEUE_ENTRIES << LDC_PACKET_SHIFT) +#define LDC_STREAM_MTU (LDC_QUEUE_SIZE >> 1) + +/* + * LDC Reliable mode - initial packet seqid + * - If peer initiated handshake, RDX should contain init_seqid + 1 + * - If this endpoint initiated handshake first data packet should + * contain the message init_seqid + 1 + */ +#define LDC_INIT_SEQID 0x0 + +/* LDC Message types */ +#define LDC_CTRL 0x01 /* Control Pkt */ +#define LDC_DATA 0x02 /* Data Pkt */ +#define LDC_ERR 0x10 /* Error Pkt */ + +/* LDC Message Subtypes */ +#define LDC_INFO 0x01 /* Control/Data/Error info pkt */ +#define LDC_ACK 0x02 /* Control/Data ACK */ +#define LDC_NACK 0x04 /* Control/Data NACK */ + +/* LDC Control Messages */ +#define LDC_VER 0x01 /* Version message */ +#define LDC_RTS 0x02 /* Request to Send */ +#define LDC_RTR 0x03 /* Ready To Receive */ +#define LDC_RDX 0x04 /* Ready for data exchange */ + +#define LDC_CTRL_MASK 0x0f /* Mask to read control bits */ + +/* LDC Channel Transport State (tstate) */ +#define TS_TXQ_RDY 0x01 /* allocated TX queue */ +#define TS_RXQ_RDY 0x02 /* allocated RX queue */ +#define TS_INIT (TS_TXQ_RDY | TS_RXQ_RDY) +#define TS_QCONF_RDY 0x04 /* registered queues with HV */ +#define TS_CNEX_RDY 0x08 /* registered channel with cnex */ +#define TS_OPEN (TS_INIT | TS_QCONF_RDY | TS_CNEX_RDY) +#define TS_LINK_READY 0x10 /* both endpts registered Rx queues */ +#define TS_READY (TS_OPEN | TS_LINK_READY) +#define TS_VER_DONE 0x20 /* negotiated version */ +#define TS_VREADY (TS_READY | TS_VER_DONE) +#define TS_HSHAKE_DONE 0x40 /* completed handshake */ +#define TS_UP 
(TS_READY | TS_VER_DONE | TS_HSHAKE_DONE) + +/* LDC Channel Transport Handshake states */ +#define TS_SENT_RTS 0x01 /* Sent RTS */ +#define TS_RCVD_RTR 0x02 /* Received RTR */ +#define TS_SENT_RDX 0x04 /* Sent RDX */ +#define TS_RCVD_RTS 0x10 /* Received RTS */ +#define TS_SENT_RTR 0x20 /* Sent RTR */ +#define TS_RCVD_RDX 0x40 /* Received RDX */ + +/* LDC MSG Envelope */ +#define LDC_LEN_MASK 0x3F +#define LDC_FRAG_MASK 0xC0 + +#define LDC_FRAG_START 0x40 /* frag_info = 0x01 */ +#define LDC_FRAG_STOP 0x80 /* frag_info = 0x02 */ +#define LDC_FRAG_CONT 0x00 /* frag_info = 0x00 */ + +/* + * LDC fragmented xfer loop wait cnt + * When data is arriving in fragments, the read thread will + * look for a packet 'LDC_CHK_CNT' times. Between each check + * it will loop 'LDC_LOOP_CNT' times + */ +#define LDC_CHK_CNT 1000 +#define LDC_LOOP_CNT 1000 + +/* + * LDC Version information + */ +#define LDC_PAYLOAD_VER_OFF 8 /* offset of version in payload */ + +typedef struct ldc_ver { + uint16_t major; + uint16_t minor; +} ldc_ver_t; + +/* + * Each guest consists of one or more LDC endpoints represented by a ldc_chan + * structure. Each ldc_chan structure points to a ldc_mtbl structure that + * contains information about the map table associated with this LDC endpoint. + * The map table contains the list of pages being shared by this guest over + * this endpoint with the guest at the other end of this endpoint. Each LDC + * endpoint also points to a list of memory handles used to bind and export + * memory segments from this guest. If a memory segment is bound, it points to + * a memory segment structure, which in turn consists of an array of ldc_page + * structure for all the pages within that segment. Each ldc_page structure + * contains information about the shared page and also points to the + * corresponding entry in the map table. + * + * Each LDC endpoint also points to a list of ldc_dring structures that refer + * to both imported and exported descriptor rings. 
If it is a exported + * descriptor ring, it then points to memory handle/memseg corresponding to + * the region of memory associated with the descriptor ring. + * + * +----------+ +----------+ +----------+ + * | ldc_chan |-->| ldc_chan |-->| ldc_chan |-->.... + * +----------+ +----------+ +----------+ + * | | | + * | | | + * | | | +-----------+ +-----------+ + * | | +----->| ldc_dring |---->| ldc_dring |---->...... + * | | +-----------+ +-----------+ + * | | | + * | | +----------------------------+ + * | | | + * | | v + * | | +----------+ +----------+ +----------+ + * | +----->| ldc_mhdl |---->| ldc_mhdl |---->| ldc_mhdl |---> .... + * | +----------+ +----------+ +----------+ + * v | | + * +----------+ | +------------+ | +------------+ + * | ldc_mtbl |--+ +--->| ldc_memseg |-----+ +--->| ldc_memseg | + * +----------+ | +------------+ | +------------+ + * | | | | | + * v v v | v + * +--------------+ +----------+ +--------+ | +--------+ + * | ldc_mte_slot |<--------| ldc_page | | cookie | | | cookie | + * +--------------+ +----------+ +--------+ | +--------+ + * | ldc_mte_slot |<--------| ldc_page | | cookie | v + * +--------------+ +----------+ +--------+ +----------+ + * | ldc_mte_slot |<-----------------------------------| ldc_page | + * +--------------+ +----------+ + * | ldc_mte_slot | + * +--------------+ + * | ...... |/ +------------+ + * +--------------+ | entry | + * | ldc_mte_slot | +------------+ + * +--------------+ | inv_cookie | + * \ +------------+ + * + */ + +/* + * Message format of each packet sent over the LDC channel. + * Each packet is 64-bytes long. + * + * Each packet that is sent over LDC can contain either data or acks. + * The type will reflect the contents. The len will contain in bytes + * the amount of data being sent. In the case of ACKs, the seqid and + * data fields will contain the SEQIDs of messages for which ACKs are + * being sent. 
+ * + * Raw pkt format: + * + * +------------------------------------------------------+ + * 0 - 7 | data payload | + * +------------------------------------------------------+ + * + * Unreliable pkt format: + * + * +------------------------------------------------------+ + * 0 | seqid | env | ctrl | stype | type | + * +------------------------------------------------------+ + * 1 - 7 | data payload | + * +------------------------------------------------------+ + * + * Reliable pkt format: + * + * +------------------------------------------------------+ + * 0 | seqid | env | ctrl | stype | type | + * +------------------------------------------------------+ + * 1 | ackid | unused | + * +------------------------------------------------------+ + * 2 - 7 | data payload | + * +------------------------------------------------------+ + */ + +typedef struct ldc_msg { + union { + struct { + uint8_t _type; /* Message type */ + uint8_t _stype; /* Message subtype */ + uint8_t _ctrl; /* Control/Error Message */ + uint8_t _env; /* Message Envelope */ + uint32_t _seqid; /* Sequence ID */ + + union { + uint8_t _ud[LDC_PAYLOAD_SIZE_UNRELIABLE]; + /* Unreliable data payload */ + struct { + uint32_t _unused; /* unused */ + uint32_t _ackid; /* ACK ID */ + uint8_t _rd[LDC_PAYLOAD_SIZE_RELIABLE]; + /* Reliable data payload */ + } _rl; + } _data; + } _tpkt; + + uint8_t _raw[LDC_PAYLOAD_SIZE_RAW]; + } _pkt; + +} ldc_msg_t; + +#define raw _pkt._raw +#define type _pkt._tpkt._type +#define stype _pkt._tpkt._stype +#define ctrl _pkt._tpkt._ctrl +#define env _pkt._tpkt._env +#define seqid _pkt._tpkt._seqid +#define udata _pkt._tpkt._data._ud +#define ackid _pkt._tpkt._data._rl._ackid +#define rdata _pkt._tpkt._data._rl._rd + +/* + * LDC Map Table Entry (MTE) + * + * 6 6 1 1 1 + * |3 0| psz| 3| 1| 0| 9| 8| 7|6|5|4| 0| + * +------+--------------------------+----+----+--+--+--+--+-+-+-+-------+ + * | rsvd | PFN | 0 | 0 |CW|CR|IW|IR|X|W|R| pgszc | + * 
+------+--------------------------+----+----+--+--+--+--+-+-+-+-------+ + * | hv invalidation cookie | + * +---------------------------------------------------------------------+ + */ +typedef union { + struct { + uint64_t _rsvd2:8, /* <63:56> reserved */ + rpfn:43, /* <55:13> real pfn */ + _rsvd1:2, /* <12:11> reserved */ + cw:1, /* <10> copy write access */ + cr:1, /* <9> copy read perm */ + iw:1, /* <8> iommu write perm */ + ir:1, /* <7> iommu read perm */ + x:1, /* <6> execute perm */ + w:1, /* <5> write perm */ + r:1, /* <4> read perm */ + pgszc:4; /* <3:0> pgsz code */ + } mte_bit; + + uint64_t ll; + +} ldc_mte_t; + +#define mte_rpfn mte_bit.rpfn +#define mte_cw mte_bit.cw +#define mte_cr mte_bit.cr +#define mte_iw mte_bit.iw +#define mte_ir mte_bit.ir +#define mte_x mte_bit.x +#define mte_w mte_bit.w +#define mte_r mte_bit.r +#define mte_pgszc mte_bit.pgszc + +#define MTE_BSZS_SHIFT(sz) ((sz) * 3) +#define MTEBYTES(sz) (MMU_PAGESIZE << MTE_BSZS_SHIFT(sz)) +#define MTEPAGES(sz) (1 << MTE_BSZS_SHIFT(sz)) +#define MTE_PAGE_SHIFT(sz) (MMU_PAGESHIFT + MTE_BSZS_SHIFT(sz)) +#define MTE_PAGE_OFFSET(sz) (MTEBYTES(sz) - 1) +#define MTE_PAGEMASK(sz) (~MTE_PAGE_OFFSET(sz)) +#define MTE_PFNMASK(sz) (~(MTE_PAGE_OFFSET(sz) >> MMU_PAGESHIFT)) + +/* + * LDC Map Table Slot + */ +typedef struct ldc_mte_slot { + ldc_mte_t entry; + uint64_t cookie; +} ldc_mte_slot_t; + +/* + * LDC Memory Map Table + * + * Each LDC has a memory map table it uses to list all the pages + * it exporting to its peer over the channel. This structure + * contains information about the map table and is pointed to + * by the ldc_chan structure. 
+ */ +typedef struct ldc_mtbl { + kmutex_t lock; /* Table lock */ + size_t size; /* Table size (in bytes) */ + uint64_t next_entry; /* Next entry to use */ + uint64_t num_entries; /* Num entries in table */ + uint64_t num_avail; /* Num of available entries */ + ldc_mte_slot_t *table; /* The table itself */ +} ldc_mtbl_t; + +/* + * LDC page and memory segment information + */ +typedef struct ldc_page { + uintptr_t raddr; /* Exported page RA */ + uint64_t offset; /* Exported page offset */ + size_t size; /* Exported page size */ + uint64_t index; /* Index in map table */ + ldc_mte_slot_t *mte; /* Map table entry */ +} ldc_page_t; + +typedef struct ldc_memseg { + caddr_t vaddr; /* Exported segment VA */ + uintptr_t raddr; /* Exported segment RA */ + size_t size; /* Exported segment size */ + uint64_t npages; /* Number of pages */ + ldc_page_t *pages; /* Array of exported pages */ + uint32_t ncookies; /* Number of cookies */ + ldc_mem_cookie_t *cookies; + uint64_t next_cookie; /* Index to next cookie */ +} ldc_memseg_t; + +/* + * LDC Cookie address format + * + * 6 6 m+n + * |3| 0| | m| 0| + * +-+-------+----------+-------------------+-------------------+ + * |X| pgszc | rsvd | table_idx | page_offset | + * +-+-------+----------+-------------------+-------------------+ + */ +#define LDC_COOKIE_PGSZC_MASK 0x7 +#define LDC_COOKIE_PGSZC_SHIFT 60 + +/* + * LDC Memory handle + */ +typedef struct ldc_chan ldc_chan_t; + +typedef struct ldc_mhdl { + kmutex_t lock; /* Mutex for memory handle */ + ldc_mstatus_t status; /* Memory map status */ + + uint8_t mtype; /* Type of sharing */ + uint8_t perm; /* Access permissions */ + boolean_t myshadow; /* TRUE=alloc'd shadow mem */ + + ldc_chan_t *ldcp; /* Pointer to channel struct */ + ldc_memseg_t *memseg; /* Bound memory segment */ + struct ldc_mhdl *next; /* Next memory handle */ +} ldc_mhdl_t; + +/* + * LDC Descriptor rings + */ + +typedef struct ldc_dring { + kmutex_t lock; /* Desc ring lock */ + ldc_mstatus_t status; /* Desc ring 
status */ + + uint32_t dsize; /* Descriptor size */ + uint32_t length; /* Descriptor ring length */ + uint64_t size; /* Desc ring size (in bytes) */ + caddr_t base; /* Descriptor ring base addr */ + + ldc_chan_t *ldcp; /* Pointer to bound channel */ + ldc_mem_handle_t mhdl; /* Mem handle to desc ring */ + + struct ldc_dring *ch_next; /* Next dring in channel */ + struct ldc_dring *next; /* Next dring overall */ + +} ldc_dring_t; + + +/* + * Channel specific information is kept in a separate + * structure. These are then stored on a array indexed + * by the channel number. + */ +struct ldc_chan { + ldc_chan_t *next; /* Next channel */ + + kmutex_t lock; /* Channel lock */ + uint64_t id; /* Channel ID */ + ldc_status_t status; /* Channel status */ + uint32_t tstate; /* Channel transport state */ + uint32_t hstate; /* Channel transport handshake state */ + + ldc_dev_t devclass; /* Associated device class */ + uint64_t devinst; /* Associated device instance */ + ldc_mode_t mode; /* Channel mode */ + + uint64_t mtu; /* Max TU size (streaming for now) */ + + ldc_ver_t version; /* Channel version */ + uint32_t next_vidx; /* Next version to match */ + + uint_t (*cb)(uint64_t event, caddr_t arg); + caddr_t cb_arg; /* Channel callback and arg */ + boolean_t cb_inprogress; /* Channel callback in progress */ + boolean_t cb_enabled; /* Channel callbacks are enabled */ + + boolean_t intr_pending; /* TRUE if interrupts are pending */ + + uint64_t tx_q_entries; /* Num entries in transmit queue */ + uint64_t tx_q_va; /* Virtual addr of transmit queue */ + uint64_t tx_q_ra; /* Real addr of transmit queue */ + uint64_t tx_head; /* Tx queue head */ + uint64_t tx_ackd_head; /* Tx queue ACKd head (Reliable) */ + uint64_t tx_tail; /* Tx queue tail */ + + uint64_t rx_q_entries; /* Num entries in receive queue */ + uint64_t rx_q_va; /* Virtual addr of receive queue */ + uint64_t rx_q_ra; /* Real addr of receive queue */ + + uint64_t link_state; /* Underlying HV channel state */ + + 
ldc_mtbl_t *mtbl; /* Memory table used by channel */ + ldc_mhdl_t *mhdl_list; /* List of memory handles */ + kmutex_t mlist_lock; /* Mem handle list lock */ + + ldc_dring_t *exp_dring_list; /* Exported desc ring list */ + kmutex_t exp_dlist_lock; /* Lock for exported desc ring list */ + ldc_dring_t *imp_dring_list; /* Imported desc ring list */ + kmutex_t imp_dlist_lock; /* Lock for imported desc ring list */ + + uint8_t pkt_payload; /* Size of packet payload */ + + uint32_t first_fragment; /* Seqid of first msg fragment */ + uint32_t last_msg_snt; /* Seqid of last packet sent */ + uint32_t last_ack_rcd; /* Seqid of last ACK recd */ + uint32_t last_msg_rcd; /* Seqid of last packet received */ + + uint32_t stream_remains; /* Number of bytes in stream */ + /* packet buffer */ + uint32_t stream_offset; /* Offset into packet buffer for */ + /* next read */ + uint8_t *stream_bufferp; /* Stream packet buffer */ + + int (*read_p)(ldc_chan_t *ldcp, caddr_t bufferp, + size_t *sizep); + int (*write_p)(ldc_chan_t *ldcp, caddr_t bufferp, + size_t *sizep); +}; + + +/* + * LDC module soft state structure + */ +typedef struct ldc_soft_state { + kmutex_t lock; /* Protects ldc_soft_state_t */ + ldc_cnex_t cinfo; /* channel nexus info */ + uint64_t channel_count; /* Number of channels */ + uint64_t channels_open; /* Number of open channels */ + ldc_chan_t *chan_list; /* List of LDC endpoints */ + ldc_dring_t *dring_list; /* Descriptor rings (for export) */ +} ldc_soft_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LDC_IMPL_H */ diff --git a/usr/src/uts/sun4v/sys/ldoms.h b/usr/src/uts/sun4v/sys/ldoms.h new file mode 100644 index 0000000000..5e86dde864 --- /dev/null +++ b/usr/src/uts/sun4v/sys/ldoms.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LDOMS_H +#define _LDOMS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> /* for MAXHOSTNAMELEN */ + +/* + * Global LDoms definitions. + */ + +/* Maximum number of logical domains supported */ +#define LDOMS_MAX_DOMAINS 32 + +/* maximum number of characters in the logical domain name */ +#define LDOMS_MAX_NAME_LEN MAXHOSTNAMELEN + +/* + * Global flag that indicates whether domaining features are + * available. The value is set at boot time based on the value + * of the 'domaining-enabled' property in the MD. Updates to + * this variable after boot are not supported. + */ +extern uint_t domaining_enabled; + + +#ifdef __cplusplus +} +#endif + +#endif /* _LDOMS_H */ diff --git a/usr/src/uts/sun4v/sys/lpad.h b/usr/src/uts/sun4v/sys/lpad.h new file mode 100644 index 0000000000..e538702220 --- /dev/null +++ b/usr/src/uts/sun4v/sys/lpad.h @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LPAD_H +#define _LPAD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * sun4v Landing Pad + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _ASM + +#include <sys/pte.h> + +typedef union { + struct { + unsigned int rsvd0:32; + unsigned int rsvd1:29; + unsigned int perm:1; + unsigned int mmuflags:2; + } flag_bits; + uint64_t ll; +} lpad_map_flag_t; + +typedef struct lpad_map { + lpad_map_flag_t flags; + uint64_t va; + tte_t tte; +} lpad_map_t; + +#define flag_mmuflags flags.flag_bits.mmuflags +#define flag_perm flags.flag_bits.perm + +typedef struct lpad_data { + uint64_t magic; /* magic value for sanity checking */ + uint64_t *inuse; /* clear flag when done with lpad */ + uint64_t mmfsa_ra; /* RA of MMU fault status area */ + uint64_t pc; /* VA of CPU startup function */ + uint64_t arg; /* argument to startup function */ + uint64_t nmap; /* number of mappings */ + lpad_map_t map[1]; /* array of mappings */ +} lpad_data_t; + +extern uint64_t *lpad_setup(int cpuid, uint64_t pc, uint64_t arg); + +#endif /* ! 
_ASM */ + +/* + * General landing pad constants + */ +#define LPAD_TEXT_SIZE 1024 +#define LPAD_DATA_SIZE 1024 +#define LPAD_SIZE (LPAD_TEXT_SIZE + LPAD_DATA_SIZE) +#define LPAD_MAGIC_VAL 0x4C502D4D41474943 /* "LP-MAGIC" */ + +/* + * Masks for the lpad_map_t flag bitfield + */ +#define FLAG_MMUFLAGS_MASK 0x3 +#define FLAG_LOCK_MASK 0x4 + +#ifdef __cplusplus +} +#endif + +#endif /* _LPAD_H */ diff --git a/usr/src/uts/sun4v/sys/mach_descrip.h b/usr/src/uts/sun4v/sys/mach_descrip.h index 2bf0c686fa..a003a9b23b 100644 --- a/usr/src/uts/sun4v/sys/mach_descrip.h +++ b/usr/src/uts/sun4v/sys/mach_descrip.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,8 +18,9 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,22 +33,58 @@ extern "C" { #endif +#include <sys/kstat.h> +#include <sys/ksynch.h> +#include <sys/mdesc.h> + /* - * Common structure between kernel and mdesc driver - * enabling the current machine description to be retrieved - * or updated. + * MD memory operations (memops) are of two types: + * buf: + * Buffer allocator routines used to allocate the MD buffer. + * Allocator must support an alignment argument. + * + * meta: + * Meta allocator routines to allocate meta data structures. + * These allocations are small and don't have alignment + * requirements. Examples, md_t handles and the machine_descrip_t + * structure. 
*/ -struct machine_descrip_s { - void *va; - uint64_t pa; - uint64_t size; - uint64_t space; - kstat_t *ksp; -}; +typedef struct machine_descrip_memops { + void *(*buf_allocp)(size_t size, size_t align); + void (*buf_freep)(void *, size_t size); + void *(*meta_allocp)(size_t size); + void (*meta_freep)(void *, size_t size); +} machine_descrip_memops_t; -typedef struct machine_descrip_s machine_descrip_t; +/* + * Common structure/list between kernel and mdesc driver enabling + * the current machine description to be retrieved or updated. + * + * Locks: + * The current global MD is protected by the curr_mach_descrip_lock. + * Each Machine description has a lock to synchronize its ref count. + * The Obsolete MD list is protected by the obs_list_lock. + */ +typedef struct machine_descrip_s { + uint64_t gen; /* Generation number for MD */ + kmutex_t lock; /* synchronize access to MD */ + void *va; /* virtual address */ + uint64_t size; /* size of MD */ + uint64_t space; /* space allocated for MD */ + int refcnt; /* MD ref count */ + struct machine_descrip_s *next; /* Next MD in list */ + machine_descrip_memops_t *memops; /* Memory operations for MD */ +} machine_descrip_t; -extern machine_descrip_t machine_descrip; +/* + * Utility wrappers to get/fini a handle to the current MD. 
+ */ +extern md_t *md_get_handle(void); +extern int md_fini_handle(md_t *); +extern caddr_t md_get_md_raw(md_t *); +extern int md_alloc_scan_dag(md_t *, mde_cookie_t, char *, char *, + mde_cookie_t **); +extern void md_free_scan_dag(md_t *, mde_cookie_t **); #ifdef __cplusplus } diff --git a/usr/src/uts/sun4v/sys/machcpuvar.h b/usr/src/uts/sun4v/sys/machcpuvar.h index 24050ea18f..8d9ac241aa 100644 --- a/usr/src/uts/sun4v/sys/machcpuvar.h +++ b/usr/src/uts/sun4v/sys/machcpuvar.h @@ -207,6 +207,7 @@ struct cpu_node { int ecache_associativity; int ecache_setsize; uint64_t device_id; + id_t exec_unit_mapping; }; extern struct cpu_node cpunodes[]; diff --git a/usr/src/uts/sun4v/sys/machparam.h b/usr/src/uts/sun4v/sys/machparam.h index 6deb0ea3f6..130e8e662f 100644 --- a/usr/src/uts/sun4v/sys/machparam.h +++ b/usr/src/uts/sun4v/sys/machparam.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -281,22 +280,26 @@ extern "C" { * names defined in sun4u/os/mach_cpu_states.c which should be kept up to * date if new #defines are added. 
*/ -#define PTL1_BAD_DEBUG 0 -#define PTL1_BAD_WTRAP 1 -#define PTL1_BAD_KMISS 2 -#define PTL1_BAD_KPROT_FAULT 3 -#define PTL1_BAD_ISM 4 -#define PTL1_BAD_MMUTRAP 5 -#define PTL1_BAD_TRAP 6 -#define PTL1_BAD_FPTRAP 7 -#define PTL1_BAD_INTR_REQ 8 -#define PTL1_BAD_TRACE_PTR 9 -#define PTL1_BAD_STACK 10 -#define PTL1_BAD_DTRACE_FLAGS 11 -#define PTL1_BAD_CTX_STEAL 12 -#define PTL1_BAD_ECC 13 -#define PTL1_BAD_HCALL 14 -#define PTL1_BAD_GL 15 +#define PTL1_BAD_DEBUG 0 +#define PTL1_BAD_WTRAP 1 +#define PTL1_BAD_KMISS 2 +#define PTL1_BAD_KPROT_FAULT 3 +#define PTL1_BAD_ISM 4 +#define PTL1_BAD_MMUTRAP 5 +#define PTL1_BAD_TRAP 6 +#define PTL1_BAD_FPTRAP 7 +#define PTL1_BAD_INTR_REQ 8 +#define PTL1_BAD_TRACE_PTR 9 +#define PTL1_BAD_STACK 10 +#define PTL1_BAD_DTRACE_FLAGS 11 +#define PTL1_BAD_CTX_STEAL 12 +#define PTL1_BAD_ECC 13 +#define PTL1_BAD_HCALL 14 +#define PTL1_BAD_GL 15 +#define PTL1_BAD_WATCHDOG 16 +#define PTL1_BAD_RED 17 +#define PTL1_BAD_HCALL_UNMAP_PERM_EINVAL 18 +#define PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP 19 /* * Defines the max trap level allowed diff --git a/usr/src/uts/sun4v/sys/machsystm.h b/usr/src/uts/sun4v/sys/machsystm.h index 64d6e5dd8f..c1e973cd51 100644 --- a/usr/src/uts/sun4v/sys/machsystm.h +++ b/usr/src/uts/sun4v/sys/machsystm.h @@ -337,6 +337,7 @@ extern void idle_stop_xcall(void); extern void set_idle_cpu(int); extern void unset_idle_cpu(int); extern void mp_cpu_quiesce(struct cpu *); +extern int stopcpu_bycpuid(int); /* * Panic at TL > 0 @@ -396,6 +397,12 @@ extern uchar_t kpm_size_shift; extern caddr_t kpm_vbase; #define INVALID_VADDR(a) (((a) >= hole_start && (a) < hole_end)) +#define VA_ADDRESS_SPACE_BITS 64 +#define RA_ADDRESS_SPACE_BITS 56 +#define MAX_REAL_ADDRESS (1ull << RA_ADDRESS_SPACE_BITS) +#define DEFAULT_VA_ADDRESS_SPACE_BITS 48 /* def. 
Niagara (broken MD) */ +#define PAGESIZE_MASK_BITS 16 +#define MAX_PAGESIZE_MASK ((1<<PAGESIZE_MASK_BITS) - 1) extern void adjust_hw_copy_limits(int); @@ -466,6 +473,25 @@ void sticksync_master(void); #define HV_TOD_RETRY_THRESH 100 #define HV_TOD_WAIT_USEC 5 +/* + * Interrupt Queues and Error Queues + */ + +#define INTR_CPU_Q 0x3c +#define INTR_DEV_Q 0x3d +#define CPU_RQ 0x3e +#define CPU_NRQ 0x3f +#define DEFAULT_CPU_Q_ENTRIES 0x100 +#define DEFAULT_DEV_Q_ENTRIES 0x100 +#define INTR_REPORT_SIZE 64 + +#ifndef _ASM +extern uint64_t cpu_q_entries; +extern uint64_t dev_q_entries; +extern uint64_t cpu_rq_entries; +extern uint64_t cpu_nrq_entries; +#endif /* _ASM */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/sun4v/sys/mdeg.h b/usr/src/uts/sun4v/sys/mdeg.h new file mode 100644 index 0000000000..c8149afaa6 --- /dev/null +++ b/usr/src/uts/sun4v/sys/mdeg.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _MDEG_H +#define _MDEG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * MD Event Generator (mdeg) interface. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/mdesc.h> + +/* + * Specification of a node property. + */ +typedef struct { + uint8_t type; + char *namep; + union { + char *strp; + uint64_t val; + } _p; + +} mdeg_prop_spec_t; + +#define ps_str _p.strp +#define ps_val _p.val + +/* + * Specification of unique node in the MD. The array + * of property name value pairs is used to determine + * whether the node matches the specification. + */ +typedef struct { + char *namep; + mdeg_prop_spec_t *specp; +} mdeg_node_spec_t; + +/* + * Specification of a method to match nodes. The + * array of properties are used to match two nodes + * from different MDs. If the specified properties + * match, the nodes are the same. + */ +typedef struct { + char *namep; + md_prop_match_t *matchp; +} mdeg_node_match_t; + +/* + * The result of the MD update as communicated + * through the parameter to the registered callback. + */ +typedef struct { + md_t *mdp; + mde_cookie_t *mdep; + uint_t nelem; +} mdeg_diff_t; + +/* + * Results of the MD update for a specific registration + */ +typedef struct { + mdeg_diff_t added; + mdeg_diff_t removed; + mdeg_diff_t match_curr; + mdeg_diff_t match_prev; +} mdeg_result_t; + +/* + * Client Interface + */ + +#define MDEG_SUCCESS 0 +#define MDEG_FAILURE 1 + +typedef uint64_t mdeg_handle_t; + +typedef int (*mdeg_cb_t)(void *cb_argp, mdeg_result_t *resp); + +int mdeg_register(mdeg_node_spec_t *pspecp, mdeg_node_match_t *nmatchp, + mdeg_cb_t cb, void *cb_argp, mdeg_handle_t *hdlp); + +int mdeg_unregister(mdeg_handle_t hdl); + + +#ifdef __cplusplus +} +#endif + +#endif /* _MDEG_H */ diff --git a/usr/src/uts/sun4v/sys/mmu.h b/usr/src/uts/sun4v/sys/mmu.h index b38d007d83..61d0812ace 100644 --- a/usr/src/uts/sun4v/sys/mmu.h +++ b/usr/src/uts/sun4v/sys/mmu.h @@ -1,5 +1,5 @@ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -117,6 +117,10 @@ extern "C" { */ #define FLUSH_ADDR (KERNELBASE + 2 * MMU_PAGESIZE4M) +#define MAX_NCTXS_BITS 16 /* sun4v max. contexts bits */ +#define MIN_NCTXS_BITS 2 +#define MAX_NCTXS (1ull << MAX_NCTXS_BITS) + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/sun4v/sys/ncp.h b/usr/src/uts/sun4v/sys/ncp.h index 7203f84fdd..491a7bf622 100644 --- a/usr/src/uts/sun4v/sys/ncp.h +++ b/usr/src/uts/sun4v/sys/ncp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -30,7 +29,12 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> #include <sys/kmem.h> +#include <sys/mdesc.h> +#include <sys/crypto/common.h> +#include <sys/crypto/spi.h> #include <sys/ncs.h> #ifdef __cplusplus @@ -45,11 +49,6 @@ extern "C" { #define FALSE 0 #define TRUE 1 -/* - * XXX - * NCP_MAX_NMAUS should come from OBP/HV - * NCP_MAX_CPUS_PER_MAU should come from OBP/HV - */ #define NCP_MAX_NMAUS 8 #define NCP_MAX_CPUS_PER_MAU 4 #define NCP_CPUID2MAUID(c) ((c) / NCP_MAX_CPUS_PER_MAU) @@ -96,8 +95,6 @@ typedef struct ncp_minor ncp_minor_t; typedef struct ncp_listnode ncp_listnode_t; typedef struct ncp_request ncp_request_t; typedef struct ncp_stat ncp_stat_t; -typedef struct ncp_mau_queue ncp_mau_queue_t; -typedef struct ncp_desc ncp_desc_t; @@ -246,46 +243,16 @@ struct ncp_stat { kstat_named_t ns_status; kstat_named_t ns_algs[DS_MAX]; struct { + kstat_named_t ns_mauid; + kstat_named_t ns_mauhandle; + kstat_named_t ns_maustate; kstat_named_t ns_submit; kstat_named_t ns_qfull; + kstat_named_t ns_qbusy; kstat_named_t ns_qupdate_failure; } ns_mau[NCP_MAX_NMAUS]; }; - -struct ncp { - kmutex_t n_lock; - kmem_cache_t *n_ds_cache; - kmem_cache_t *n_mactl_cache; - kmem_cache_t *n_mabuf_cache; - dev_info_t *n_dip; - minor_t n_minor; - int n_nmaus; - int n_max_nmaus; - int *n_mauids; - ncp_mau_queue_t *n_mau_q; - int n_mau_q_size; - - ddi_taskq_t *n_taskq; - - unsigned n_flags; /* dev state flags */ - - kstat_t *n_ksp; - kstat_t *n_intrstats; - u_longlong_t n_stats[DS_MAX]; - u_longlong_t n_qfull[NCP_MAX_NMAUS]; - u_longlong_t n_qupdate_failure[NCP_MAX_NMAUS]; - - ulong_t n_pagesize; - crypto_kcf_provider_handle_t n_prov; - - kmutex_t n_freereqslock; - ncp_listnode_t n_freereqs; /* available requests */ - - kmutex_t n_ctx_list_lock; - ncp_listnode_t n_ctx_list; -}; - /* * Device flags (ncp_t.ncp_flags) */ @@ -294,10 +261,25 @@ struct ncp { /* * IMPORTANT: - * (NCP_MAQUEUE_NENTRIES * sizeof (ncs_hvdesc_t)) <= 
PAGESIZE + * NCP_MAQUEUE_NENTRIES *must* be a power-of-2. + * requirement: sizeof (ncs_hvdesc_t) == 64 */ -#define NCP_MAQUEUE_NENTRIES 64 +#define NCP_MAQUEUE_NENTRIES (1 << 9) /* 512 */ #define NCP_MAQUEUE_WRAPMASK (NCP_MAQUEUE_NENTRIES - 1) +#define NCP_MAQUEUE_SIZE (NCP_MAQUEUE_NENTRIES * sizeof (ncs_hvdesc_t)) +#define NCP_MAQUEUE_ALIGN (NCP_MAQUEUE_SIZE - 1) +#define NCP_MAQUEUE_SLOTS_AVAIL(q) \ + (((q)->nmq_head > (q)->nmq_tail) ? \ + ((q)->nmq_head > (q)->nmq_tail - 1) : \ + (NCP_MAQUEUE_NENTRIES - \ + ((q)->nmq_tail - (q)->nmq_head) - 1)) + +#define NCP_QINDEX_TO_QOFFSET(i) ((i) * sizeof (ncs_hvdesc_t)) +#define NCP_QOFFSET_TO_QINDEX(o) ((o) / sizeof (ncs_hvdesc_t)) +#define NCP_QINDEX_INCR(i) (((i) + 1) & NCP_MAQUEUE_WRAPMASK) +#define NCP_QINDEX_IS_VALID(i) (((i) >= 0) && \ + ((i) < NCP_MAQUEUE_NENTRIES)) +#define NCP_QTIMEOUT_SECONDS 15 typedef struct ncp_ma { kmutex_t nma_lock; @@ -305,24 +287,141 @@ typedef struct ncp_ma { int nma_ref; /* # of descriptor references */ } ncp_ma_t; +typedef struct ncp_desc ncp_desc_t; struct ncp_desc { ncs_hvdesc_t nd_hv; ncp_desc_t *nd_link; /* to string related descriptors */ ncp_ma_t *nd_ma; /* referenced MA buffer */ }; +typedef struct ncp_descjob { + int dj_id; + kcondvar_t dj_cv; + ncp_desc_t *dj_jobp; + struct ncp_descjob *dj_prev; + struct ncp_descjob *dj_next; +} ncp_descjob_t; + /* * nmq_head, nmq_tail = indexes into nmq_desc[]. 
*/ -struct ncp_mau_queue { - int nmq_id; +typedef struct { + uint64_t nmq_mauhandle; + uint64_t nmq_devino; + int nmq_inum; + int nmq_mauid; + int nmq_init; + int nmq_busy_wait; + kcondvar_t nmq_busy_cv; kmutex_t nmq_lock; int nmq_head; int nmq_tail; uint_t nmq_wrapmask; + ncp_descjob_t **nmq_jobs; + size_t nmq_jobs_size; ncs_hvdesc_t *nmq_desc; /* descriptor array */ - int nmq_desc_size; - uint64_t nmq_njobs; + char *nmq_mem; + size_t nmq_memsize; + ncp_descjob_t *nmq_joblist; + int nmq_joblistcnt; + struct { + uint64_t qks_njobs; + uint64_t qks_qfull; + uint64_t qks_qbusy; + uint64_t qks_qfail; + } nmq_ks; +} ncp_mau_queue_t; + +#define MAU_STATE_ERROR (-1) +#define MAU_STATE_OFFLINE 0 +#define MAU_STATE_ONLINE 1 + +typedef struct { + int mm_mauid; + int mm_cpulistsz; + int *mm_cpulist; + int mm_ncpus; + int mm_nextcpuidx; + /* + * Only protects mm_nextcpuidx field. + */ + kmutex_t mm_lock; + /* + * xxx - maybe need RW lock for mm_state? + */ + int mm_state; /* MAU_STATE_... */ + + ncp_mau_queue_t mm_queue; +} mau_entry_t; + +typedef struct { + int mc_cpuid; + int mc_mauid; + /* + * xxx - maybe need RW lock for mm_state? + * Mirrors mm_state in mau_entry_t. Duplicated + * for speed so we don't have search mau_entry + * table. Field rarely updated. + */ + int mc_state; /* MAU_STATE_... */ +} cpu_entry_t; + +typedef struct { + /* + * MAU stuff + */ + int m_maulistsz; + mau_entry_t *m_maulist; + int m_nmaus; + int m_nextmauidx; + /* + * Only protects m_nextmauidx field. 
+ */ + kmutex_t m_lock; + + /* + * CPU stuff + */ + int m_cpulistsz; + cpu_entry_t *m_cpulist; + int m_ncpus; +} ncp_mau2cpu_map_t; + +struct ncp { + uint_t n_hvapi_minor_version; + kmutex_t n_lock; + kmem_cache_t *n_ds_cache; + kmem_cache_t *n_mactl_cache; + kmem_cache_t *n_mabuf_cache; + dev_info_t *n_dip; + minor_t n_minor; + + ddi_taskq_t *n_taskq; + + unsigned n_flags; /* dev state flags */ + + kstat_t *n_ksp; + kstat_t *n_intrstats; + u_longlong_t n_stats[DS_MAX]; + + ddi_intr_handle_t *n_htable; + int n_intr_mid[NCP_MAX_NMAUS]; + int n_intr_type; + int n_intr_cnt; + size_t n_intr_size; + uint_t n_intr_pri; + + ulong_t n_pagesize; + crypto_kcf_provider_handle_t n_prov; + + kmutex_t n_freereqslock; + ncp_listnode_t n_freereqs; /* available requests */ + + kmutex_t n_ctx_list_lock; + ncp_listnode_t n_ctx_list; + + md_t *n_mdp; + ncp_mau2cpu_map_t n_maumap; }; #endif /* _KERNEL */ @@ -343,14 +442,18 @@ struct ncp_mau_queue { #define DMA_LDST 0x00000004 #define DNCS_QTAIL 0x00000008 #define DATTACH 0x00000010 -#define DMOD 0x00000040 /* _init/_fini/_info/attach/detach */ -#define DENTRY 0x00000080 /* crypto routine entry/exit points */ +#define DMD 0x00000020 +#define DHV 0x00000040 +#define DINTR 0x00000080 +#define DMOD 0x00000100 /* _init/_fini/_info/attach/detach */ +#define DENTRY 0x00000200 /* crypto routine entry/exit points */ #define DALL 0xFFFFFFFF #define DBG0 ncp_dprintf #define DBG1 ncp_dprintf #define DBG2 ncp_dprintf #define DBG3 ncp_dprintf +#define DBG4 ncp_dprintf #define DBGCALL(flag, func) { if (ncp_dflagset(flag)) (void) func; } void ncp_dprintf(ncp_t *, int, const char *, ...); @@ -363,6 +466,7 @@ int ncp_dflagset(int); #define DBG1(vca, lvl, fmt, arg1) #define DBG2(vca, lvl, fmt, arg1, arg2) #define DBG3(vca, lvl, fmt, arg1, arg2, arg3) +#define DBG4(vca, lvl, fmt, arg1, arg2, arg3, arg4) #define DBGCALL(flag, func) #endif /* !defined(DEBUG) */ @@ -404,6 +508,16 @@ int ncp_dsaatomic(crypto_provider_handle_t, crypto_session_id_t, 
crypto_data_t *, int, crypto_req_handle_t, int); /* + * ncp_md. + */ +int ncp_init_mau2cpu_map(ncp_t *); +void ncp_deinit_mau2cpu_map(ncp_t *); +int ncp_map_cpu_to_mau(ncp_t *, int); +int ncp_map_mau_to_cpu(ncp_t *, int); +int ncp_map_nextmau(ncp_t *); +mau_entry_t *ncp_map_findmau(ncp_t *, int); + +/* * ncp_kstat.c */ void ncp_ksinit(ncp_t *); diff --git a/usr/src/uts/sun4v/sys/ncs.h b/usr/src/uts/sun4v/sys/ncs.h index bb28f7dc4c..11310e0817 100644 --- a/usr/src/uts/sun4v/sys/ncs.h +++ b/usr/src/uts/sun4v/sys/ncs.h @@ -33,12 +33,24 @@ extern "C" { #endif /* - * NCS HV API versioni definitions. + * NCS HV API version definitions. */ #define NCS_MAJOR_VER 1 -#define NCS_MINOR_VER 0 +#define NCS_MINOR_VER 1 +/* + * NCS HV API v1.0 + */ #define HV_NCS_REQUEST 0x110 +/* + * NCS HV API v1.1 + */ +#define HV_NCS_QCONF 0x111 +#define HV_NCS_QINFO 0x112 +#define HV_NCS_GETHEAD 0x113 +#define HV_NCS_GETTAIL 0x114 +#define HV_NCS_SETTAIL 0x115 +#define HV_NCS_QHANDLE_TO_DEVINO 0x116 #ifndef _ASM /* Forward typedefs */ @@ -62,7 +74,7 @@ union ma_ctl { uint64_t length:6; } bits; }; -#endif /* !_ASM */ +#endif /* _ASM */ /* Values for ma_ctl operation field */ #define MA_OP_LOAD 0x0 @@ -114,7 +126,7 @@ union ma_ma { #endif /* !_ASM */ /* - * NCS API definitions + * NCS HV API v1.0 definitions (PSARC/2005/125) */ /* @@ -164,8 +176,8 @@ typedef struct ma_regs { } ma_regs_t; #define ND_TYPE_UNASSIGNED 0 -#define ND_TYPE_MA 1 -#define ND_TYPE_SPU 2 +#define ND_TYPE_MA 1 /* v1.0 only */ +#define ND_TYPE_SPU 2 /* v1.0 only */ #define ND_STATE_FREE 0 #define ND_STATE_PENDING 1 @@ -190,7 +202,50 @@ typedef struct ncs_hvdesc { extern uint64_t hv_ncs_request(int, uint64_t, size_t); -#endif /* !_ASM */ +#endif /* _ASM */ + +/* + * NCS HV API v1.1 definitions (FWARC/2006/174) + * + * Some of the structures above (v1.0) are inherited for v1.1 + */ +/* + * In v1.1, the nhd_type field has the following values + * when non-zero (unassigned). 
The nhd_type field indicates + * whether the descriptor is the beginning of a crypto job, + * the continuation, or the end/last descriptor in a job. + * A job may be comprised of multiple descriptors. + */ +#define ND_TYPE_START 0x01 +#define ND_TYPE_CONT 0x02 +#define ND_TYPE_END 0x80 + +/* + * Types of queues supported by NCS + */ +#define NCS_QTYPE_MAU 0x1 +#define NCS_QTYPE_CWQ 0x2 + +/* + * This structure is accessed with offsets in ml/hcall.s. + * Any changes to this structure will require updates to + * the hv_ncs_qinfo entrypoint in ml/hcall.s. + */ +#ifndef _ASM +typedef struct ncs_qinfo { + uint64_t qi_qtype; + uint64_t qi_baseaddr; + uint64_t qi_qsize; +} ncs_qinfo_t; + +extern uint64_t hv_ncs_qconf(uint64_t, uint64_t, uint64_t, uint64_t *); +extern uint64_t hv_ncs_qinfo(uint64_t, ncs_qinfo_t *); +extern uint64_t hv_ncs_gethead(uint64_t, uint64_t *); +extern uint64_t hv_ncs_gettail(uint64_t, uint64_t *); +extern uint64_t hv_ncs_settail(uint64_t, uint64_t); +extern uint64_t hv_ncs_qhandle_to_devino(uint64_t, uint64_t *); +extern uint64_t hv_ncs_intr_clrstate(uint64_t); +#endif /* _ASM */ #ifdef __cplusplus } diff --git a/usr/src/uts/sun4v/sys/platsvc.h b/usr/src/uts/sun4v/sys/platsvc.h new file mode 100644 index 0000000000..9b76f1548c --- /dev/null +++ b/usr/src/uts/sun4v/sys/platsvc.h @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _PLATSVC_H +#define _PLATSVC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/ds.h> + +#define MAX_REASON_SIZE 1 + +/* + * PLATSVC STATUS + */ +#define PLATSVC_SUCCESS 0x0 +#define PLATSVC_FAILURE 0x1 +#define PLATSVC_INVALID_MESG 0x2 + +#define MD_UPDATE_SUCCESS PLATSVC_SUCCESS +#define MD_UPDATE_FAILURE PLATSVC_FAILURE +#define MD_UPDATE_INVALID_MSG PLATSVC_INVALID_MESG + +#define DOMAIN_SHUTDOWN_SUCCESS PLATSVC_SUCCESS +#define DOMAIN_SHUTDOWN_FAILURE PLATSVC_FAILURE +#define DOMAIN_SHUTDOWN_INVALID_MSG PLATSVC_INVALID_MESG + +#define DOMAIN_PANIC_SUCCESS PLATSVC_SUCCESS +#define DOMAIN_PANIC_FAILURE PLATSVC_FAILURE +#define DOMAIN_PANIC_INVALID_MSG PLATSVC_INVALID_MESG + +typedef struct platsvc_md_update_req { + uint64_t req_num; +} platsvc_md_update_req_t; + +typedef struct platsvc_md_update_resp { + uint64_t req_num; + uint32_t result; +} platsvc_md_update_resp_t; + + +typedef struct platsvc_shutdown_req { + uint64_t req_num; + uint32_t delay; +} platsvc_shutdown_req_t; + +typedef struct platsvc_shutdown_resp { + uint64_t req_num; + uint32_t result; + char reason[MAX_REASON_SIZE]; +} platsvc_shutdown_resp_t; + +typedef struct platsvc_panic_req { + uint64_t req_num; +} platsvc_panic_req_t; + +typedef struct platsvc_panic_resp { + uint64_t req_num; + uint32_t result; + char reason[MAX_REASON_SIZE]; +} platsvc_panic_resp_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _PLATSVC_H */ diff --git a/usr/src/uts/sun4v/sys/promif_impl.h b/usr/src/uts/sun4v/sys/promif_impl.h new file mode 100644 index 0000000000..2f5602a5b2 --- /dev/null 
+++ b/usr/src/uts/sun4v/sys/promif_impl.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_PROMIF_IMPL_H +#define _SYS_PROMIF_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#ifdef _KERNEL +#include <sys/promimpl.h> +#endif +#include <sys/obpdefs.h> +#include <sys/cmn_err.h> +#include <sys/note.h> + +/* + * CIF handler functions + */ +typedef int (*cif_func_t) (void *); +extern int promif_getprop(void *p); +extern int promif_getproplen(void *p); +extern int promif_nextprop(void *p); +extern int promif_nextnode(void *p); +extern int promif_childnode(void *p); +extern int promif_parentnode(void *p); +extern int promif_enter_mon(void *p); +extern int promif_exit_to_mon(void *p); +extern int promif_reboot(void *p); +extern int promif_write(void *p); +extern int promif_read(void *p); +extern int promif_interpret(void *p); +extern int promif_finddevice(void *p); +extern int promif_instance_to_package(void *p); +#ifndef _KMDB +extern int promif_setprop(void *p); +extern int promif_test(void *p); +extern int promif_instance_to_path(void *p); +extern int promif_power_off(void *p); +extern int promif_asr_list_keys_len(void *p); +extern int promif_asr_list_keys(void *p); +extern int promif_asr_export_len(void *p); +extern int promif_asr_export(void *p); +extern int promif_set_security_key(void *p); +extern int promif_get_security_key(void *p); +extern int promif_start_cpu(void *p); +extern int promif_set_mmfsa_traptable(void *p); +extern int promif_set_sun4v_api_version(void *p); +extern int promif_get_sun4v_api_version(void *p); +#endif + +/* + * Shadow device tree access functions + */ +extern pnode_t promif_stree_nextnode(pnode_t nodeid); +extern pnode_t promif_stree_childnode(pnode_t nodeid); +extern pnode_t promif_stree_parentnode(pnode_t nodeid); +extern int promif_stree_getproplen(pnode_t, char *name); +extern int promif_stree_getprop(pnode_t, char *name, void *value); +extern int promif_stree_setprop(pnode_t, char *name, void *value, int len); +extern char 
*promif_stree_nextprop(pnode_t nodeid, char *name, char *next); + +/* + * Hooks for kmdb to get and set a pointer to the PROM shadow tree + */ +#ifdef _KMDB +extern void promif_stree_setroot(void *root); +extern caddr_t promif_stree_getroot(void); +#endif + +/* + * Miscellaneous functions + */ +extern cif_func_t promif_find_cif_callback(char *opname); +extern int promif_ldom_setprop(char *name, void *value, int valuelen); + +/* + * Initialization functions + */ +#ifdef _KMDB +extern void cif_init(char *, caddr_t, ihandle_t, ihandle_t, + phandle_t, phandle_t, pnode_t, pnode_t); +extern void promif_io_init(ihandle_t, ihandle_t, phandle_t, phandle_t); +extern void promif_set_nodes(pnode_t, pnode_t); +#else +extern void promif_io_init(void); +extern void promif_stree_init(void); +extern void promif_prop_init(void); +#endif + +/* + * Debugging support + */ +#ifdef DEBUG + +extern uint_t cif_debug; + +#define CIF_DBG_FLAG_NODE 0x01 +#define CIF_DBG_FLAG_REBOOT 0x02 + +#define CIF_DBG_ALL if (cif_debug) prom_printf +#define CIF_DBG_NODE if (cif_debug & CIF_DBG_FLAG_NODE) prom_printf +#define CIF_DBG_REBOOT if (cif_debug & CIF_DBG_FLAG_REBOOT) prom_printf + +#else /* DEBUG */ + +#define CIF_DBG_ALL _NOTE(CONSTCOND) if (0) prom_printf +#define CIF_DBG_NODE CIF_DBG_ALL +#define CIF_DBG_REBOOT CIF_DBG_ALL + +#endif /* DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROMIF_IMPL_H */ diff --git a/usr/src/uts/sun4v/sys/varconfig.h b/usr/src/uts/sun4v/sys/varconfig.h new file mode 100644 index 0000000000..5d01809355 --- /dev/null +++ b/usr/src/uts/sun4v/sys/varconfig.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VARCONFIG_H +#define _SYS_VARCONFIG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef enum { + VAR_CONFIG_SET_REQ, + VAR_CONFIG_DELETE_REQ, + VAR_CONFIG_SET_RESP, + VAR_CONFIG_DELETE_RESP +} var_config_cmd_t; + +typedef struct { + var_config_cmd_t cmd; +} var_config_hdr_t; + + +typedef struct { + char name_and_value[1]; +} var_config_set_req_t; + +typedef struct { + char name[1]; +} var_config_delete_req_t; + + +typedef enum { + VAR_CONFIG_SUCCESS = 0, + VAR_CONFIG_NO_SPACE, + VAR_CONFIG_INVALID_VAR, + VAR_CONFIG_INVALID_VAL, + VAR_CONFIG_VAR_NOT_PRESENT +} var_config_status_t; + +typedef struct { + var_config_status_t result; +} var_config_resp_t; + + +typedef struct { + var_config_hdr_t vc_hdr; + union { + var_config_set_req_t vc_set; + var_config_delete_req_t vc_delete; + var_config_resp_t vc_resp; + } un; +} var_config_msg_t; + +#define var_config_cmd vc_hdr.cmd +#define var_config_set un.vc_set +#define var_config_delete un.vc_delete +#define var_config_resp un.vc_resp + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VARCONFIG_H */ diff --git a/usr/src/uts/sun4v/sys/vcc.h b/usr/src/uts/sun4v/sys/vcc.h new file mode 100644 index 0000000000..378fdce8e2 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vcc.h @@ -0,0 +1,110 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VCC_H +#define _VCC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/stream.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ioctl.h> + +/* + * vcc and vntsd exchange information using ioctl commands. When vntsd starts, + * it uses VCC_NUM_CONSOLE to get number of existing ports and + * VCC_CONS_TBL to obtain the table of existing consoles. In this table, + * vcc returns information about each of the console ports using vcc_console_t + * structure. Vntsd then sleeps on polling vcc control port. + * + * When there is a change in configuration, such as addtion or deletion + * of a console port, vcc wakes up vntsd via the poll events. Subsequently, + * vntsd uses VCC_INQUIRY ioctl to determine the reason for wakeup. In + * response to the inquiry, vcc provides a vcc_response_t structure + * containing reason and port number. + * + * If a port is being added or updated (group change), vntsd uses + * VCC_CONS_INFO ioctl with port number to obtain configuration of + * the port. 
+ * + * If the port is being deleted, vntsd uses VCC_DEL_CONS_OK ioctl to notify + * vcc after its clean up is done. Vcc subsequently tears down + * its internal configuration and remove the associated TTY minor node. + * + * Only one open is allowd for each vcc port. If vntsd opens a port that is + * already open, vntsd will use VNTSD_FORCE_CLOSE to take port from other + * application + */ + +/* VCC CNTRL IOCTL */ + +#define VCC_IOCTL_CMD ('c' << 8) + + +#define VCC_NUM_CONSOLE VCC_IOCTL_CMD | 0x1 /* num of consoles */ +#define VCC_CONS_TBL VCC_IOCTL_CMD | 0x2 /* config table */ +#define VCC_INQUIRY VCC_IOCTL_CMD | 0x3 /* inquiry by vntsd */ +#define VCC_CONS_INFO VCC_IOCTL_CMD | 0x4 /* config */ +#define VCC_CONS_STATUS VCC_IOCTL_CMD | 0x5 /* console status */ +#define VCC_FORCE_CLOSE VCC_IOCTL_CMD | 0x6 /* force to close */ + +/* reasons to wake up vntsd */ +typedef enum { + VCC_CONS_ADDED, /* a port was added */ + VCC_CONS_DELETED, /* a port was removed */ + /* XXX not implemented yet */ + VCC_CONS_UPDATED /* a port configuration was changed */ +} vcc_reason_t; + +/* + * structure that vcc returns to vntsd in response to VCC_CONS_TBL and + * VCC_CONS_INFO ioctl call. 
+ */ +typedef struct vcc_console { + int cons_no; /* console port number */ + uint64_t tcp_port; /* tcp port for the group */ + char domain_name[MAXPATHLEN]; /* domain name */ + char group_name[MAXPATHLEN]; /* group name */ + char dev_name[MAXPATHLEN]; +} vcc_console_t; + +/* structure that vcc sends to vntsd in response to wake up inquiry */ +typedef struct vcc_response { + int cons_no; /* console port number */ + vcc_reason_t reason; /* wake up reason */ +} vcc_response_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VCC_H */ diff --git a/usr/src/uts/sun4v/sys/vcc_impl.h b/usr/src/uts/sun4v/sys/vcc_impl.h new file mode 100644 index 0000000000..8bb42fa15a --- /dev/null +++ b/usr/src/uts/sun4v/sys/vcc_impl.h @@ -0,0 +1,304 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VCC_IMPL_H +#define _VCC_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ioctl.h> +#include <sys/vcc.h> + +#define VCC_DEV_TO_INST(dev) (getminor(dev)) +#define VCC_INST_TO_DEV(instance) (instance) + +#define VCC_DRIVER_NAME "vcc" +#define VCC_NAME VCC_DRIVER_NAME + +/* + * VCC Port States + */ + +/* + * There is one lock in port structure to protect the states of the port. + * States of the port are: + * 1. VCC_PORT_AVAIL + * 2. VCC_PORT_OPEN + * 3. VCC_PORT_USE_READ_LDC - There is a thread doing vcc_read. + * 4. VCC_PORT_USE_WRITE_LDC - There is a thread doing vcc_write. + * 6. VCC_PORT_LDC_DATA_READY - Data is ready from ldc. + * 5. VCC_PORT_LDC_WRITE_READY - Ldc has space to receive data. + * 7. VCC_PORT_LDC_CHANNEL_READY - Ldc channel is up. + * 8. VCC_PORT_ADDED - A new port was added. + * 9. VCC_PORT_TERM_RD - Terminal read is enabled vs suspended + * 10. VCC_PORT_TERM_WR - Terminal write is enabled vc suspended + * 11. VCC_PORT_NONBLOCK - A port was opened with non blocking flag. + * 12. VCC_PORT_LDC_LINK_DOWN + * + * + * Code flow for port to transit from one state to another is as the follows: + * + * 1. VCC_PORT_AVAIL + * + * Transition from unavailable to available + * - obtain port lock + * Transit port to available and added states + * - release port lock + * - obtain softstate lock + * Increase total number of ports + * - release softsate lock + * + * after download added port to vntsd + * - obtain port lock + * Transit port to not added state + * - release port lock + * + * Transition from available to unavailable + * - obtain port lock + * - cv_wait read available + * Transit port to read unavailable + * - cv_wait write available + * Transit port to write unavailable + * Transit port to not ready. 
(close ldc channel) + * Transit port to deleted state + * Transit port to read and write available + * - cv_broadcast + * - release lock + * + * vntsd close the deleted port + * - obtained port lock + * Transit port to close and deleted state + * - release port lock + * + * after vntsd deletion of the port + * - obtain softstate lock + * - cv_wait port table unlocked + * Transit softstate to port table locked + * - release softstate lock + * - obtain port lock + * Transit port to unavailable + * destroy port lock + * - obtain softstate lock + * Transit softstate to port table unlocked + * - cv_broadcast + * - release softsate lock + * + * 2. VCC_PORT_OPEN + * + * Transition from close to open + * - obtain port lock + * transit port to open + * - release port lock + * + * Transition from open to close + * - obtain port lock + * - cv_wait read available + * Transit port to read unavailable + * - cv_wait write available + * Transit port to write unavailable + * Transit port to not ready. (close ldc channel) + * Transit port to close state + * Transit port to read and write available + * - cv_broadcast + * - release lock + * + * 3. VCC_PORT_USE_READ_LDC/VCC_PORT_USE_WRITE_LDC + * Transition from read availale/write available + * to read unavailable/write unavailable + * - obtain port lock + * - cv_wait read available + * Transit to read/write unavailable + * - release port lock + * + * Transition from read unavailale/write unavailable + * to read available/write available + * - obtain port lock + * Transit to read/write available + * - cv_broadcast + * - release port lock + * + * 4. 
VCC_PORT_LDC_CHANNEL_READY + * Transition from data not ready to data ready + * - obtain port lock + * Transit to data ready + * - cv_broadcast + * - release port lock + * + * Transition from data ready to data not ready + * - obtain port lock + * Transit to data not ready + * - release port lock + */ + +#define VCC_PORT_AVAIL 0x1 /* port is configured */ +#define VCC_PORT_OPEN 0x2 /* port is opened */ +#define VCC_PORT_LDC_CHANNEL_READY 0x4 /* ready for data transfer */ +#define VCC_PORT_USE_READ_LDC 0x8 /* read lock */ +#define VCC_PORT_USE_WRITE_LDC 0x10 /* write lock */ +#define VCC_PORT_LDC_DATA_READY 0x20 /* data ready */ +#define VCC_PORT_LDC_WRITE_READY 0x40 /* ldc ready receive data */ +#define VCC_PORT_ADDED 0x80 /* added, no ack from vntsd */ +#define VCC_PORT_UPDATED 0x100 /* updated, no ack from vntsd */ +#define VCC_PORT_TERM_RD 0x200 /* suspend write */ +#define VCC_PORT_TERM_WR 0x400 /* suspend read */ +#define VCC_PORT_NONBLOCK 0x800 /* open with non block flag */ +#define VCC_PORT_LDC_LINK_DOWN 0x1000 /* ldc link down */ + +/* Poll Flags */ +#define VCC_POLL_CONFIG 0x1 /* poll configuration change */ + +/* Poll evnets */ +#define VCC_POLL_ADD_PORT 0x10 /* add a console port */ +#define VCC_POLL_UPDATE_PORT 0x20 /* update a console port */ + +/* softstate port table state */ +#define VCC_LOCK_PORT_TBL 0x1 + +/* VCC limits */ +#define VCC_MAX_PORTS 0x800 /* number of domains */ +#define VCC_MAX_MINORS VCC_MAX_PORTS /* number of minors */ + + +#define VCC_MAX_PORT_MINORS (VCC_MAX_MINORS - 1) +#define VCC_CONTROL_MINOR_IDX (VCC_MAX_MINORS - 1) + +/* size of vcc message data */ +#define VCC_MTU_SZ 56 + + +/* Default values */ +#define VCC_HDR_SZ 8 /* header size */ +#define VCC_BUF_SZ (VCC_HDR_SZ + VCC_MTU_SZ) + +#define VCC_CONTROL_PORT 0x7ff /* port 2047 is control port */ +#define VCC_INST_SHIFT 11 +#define VCC_INVALID_CHANNEL -1 +#define VCC_NO_PID_BLOCKING -1 + +#define VCC_QUEUE_LEN 0x80 /* ldc queue size */ + +#define VCC_MINOR_NAME_PREFIX 
"ldom-" /* device name prefix */ + +/* HV message data type */ +#define LDC_CONSOLE_CTRL 0x1 /* ctrl msg */ +#define LDC_CONSOLE_DATA 0x2 /* data msg */ + +/* HV control messages */ +#define LDC_CONSOLE_BREAK -1 /* brk */ +#define LDC_CONSOLE_HUP -2 /* hup */ + +/* minor number to port number */ +#define VCCPORT(p, minor) (p->minor_tbl[(minor & \ + VCC_CONTROL_PORT)].portno) + +/* minor number to minor pointer */ +#define VCCMINORP(p, minor) (&(p->minor_tbl[(minor & \ + VCC_CONTROL_PORT)])) + +/* minor number to instance */ +#define VCCINST(minor) ((minor) >> VCC_INST_SHIFT) + + +/* hv console packet format */ +typedef struct vcc_msg { + uint8_t type; /* type - data or ctrl */ + uint8_t size; /* data size */ + uint16_t unused; /* not used */ + int32_t ctrl_msg; /* data if type is ctrl */ + uint8_t data[VCC_MTU_SZ]; /* data if type is data */ +} vcc_msg_t; + +/* + * minor node to port mapping table + */ +typedef struct vcc_minor { + uint_t portno; /* port number */ + char domain_name[MAXPATHLEN]; /* doman name */ +} vcc_minor_t; + +/* console port structure */ +typedef struct vcc_port { + + kmutex_t lock; /* protects port */ + kcondvar_t read_cv; /* cv to sleep for reads */ + kcondvar_t write_cv; /* cv to sleep for writes */ + + uint_t number; /* port number */ + uint32_t status; /* port status */ + + char group_name[MAXPATHLEN]; + uint64_t tcp_port; /* tcp port num */ + + struct termios term; /* terminal emulation */ + + vcc_minor_t *minorp; /* pointer to minor table entry */ + + uint64_t ldc_id; /* Channel number */ + ldc_handle_t ldc_handle; /* Channel handle */ + ldc_status_t ldc_status; /* Channel Status */ + + uint_t pollflag; /* indicated poll status */ + struct pollhead poll; + uint32_t pollevent; + pid_t valid_pid; /* pid that allows cb_ops */ + +} vcc_port_t; + +/* + * vcc driver's soft state structure + */ +typedef struct vcc { + + /* protects vcc_t (soft state) */ + kmutex_t lock; + + uint_t status; + + dev_info_t *dip; /* dev_info */ + + 
mdeg_node_spec_t *md_ispecp; /* mdeg prop spec */ + mdeg_handle_t mdeg_hdl; /* mdeg handle */ + + vcc_port_t port[VCC_MAX_PORTS]; /* port table */ + uint_t num_ports; /* avail ports */ + + vcc_minor_t minor_tbl[VCC_MAX_PORTS]; /* minor table */ + uint_t minors_assigned; /* assigned minors */ +} vcc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VCC_IMPL_H */ diff --git a/usr/src/uts/sun4v/sys/vdc.h b/usr/src/uts/sun4v/sys/vdc.h new file mode 100644 index 0000000000..a551d6a7f0 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vdc.h @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VDC_H +#define _VDC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Virtual disk client implementation definitions + */ + +#include <sys/sysmacros.h> +#include <sys/note.h> + +#include <sys/ldc.h> +#include <sys/vio_mailbox.h> +#include <sys/vdsk_mailbox.h> +#include <sys/vdsk_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define VDC_DRIVER_NAME "vdc" + +/* + * Bit-field values to indicate if parts of the vdc driver are initialised. 
+ */ +#define VDC_SOFT_STATE 0x0001 +#define VDC_LOCKS 0x0002 +#define VDC_MINOR 0x0004 +#define VDC_THREAD 0x0008 +#define VDC_LDC 0x0010 +#define VDC_LDC_INIT 0x0020 +#define VDC_LDC_CB 0x0040 +#define VDC_LDC_OPEN 0x0080 +#define VDC_DRING_INIT 0x0100 /* The DRing was created */ +#define VDC_DRING_BOUND 0x0200 /* The DRing was bound to an LDC channel */ +#define VDC_DRING_LOCAL 0x0400 /* The local private DRing was allocated */ +#define VDC_DRING_ENTRY 0x0800 /* At least one DRing entry was initialised */ +#define VDC_DRING (VDC_DRING_INIT | VDC_DRING_BOUND | \ + VDC_DRING_LOCAL | VDC_DRING_ENTRY) +#define VDC_HANDSHAKE 0x1000 /* Indicates if a handshake is in progress */ +#define VDC_HANDSHAKE_STOP 0x2000 /* stop further handshakes */ + +/* + * Bit-field values to indicate status of local DRing entry + * + * The lowest 8 bits are reserved for the DRing state. + */ +#define VDC_ALLOC_HANDLE 0x10 + +/* + * Definitions of strings to be used to create device node properties. + * (vdc uses the capitalised versions of these properties as they are 64-bit) + */ +#define VDC_NBLOCKS_PROP_NAME "Nblocks" +#define VDC_SIZE_PROP_NAME "Size" + +/* + * Definitions of MD nodes/properties. + */ +#define VDC_MD_CHAN_NAME "channel-endpoint" +#define VDC_MD_VDEV_NAME "virtual-device" +#define VDC_MD_DISK_NAME "disk" +#define VDC_MD_CFG_HDL "cfg-handle" +#define VDC_ID_PROP "id" + +/* + * Scheme to store the instance number and the slice number in the minor number. + * (Uses the same format and definitions as the sd(7D) driver) + */ +#define VD_MAKE_DEV(instance, minor) ((instance << SDUNIT_SHIFT) | minor) + +/* + * variables controlling how long to wait before timing out and how many + * retries to attempt before giving up when communicating with vds. 
+ */ +#define VDC_RETRIES 10 + +#define VDC_USEC_TIMEOUT_MIN (30 * MICROSEC) /* 30 sec */ + +#define VD_GET_TIMEOUT_HZ(mul) \ + (ddi_get_lbolt() + (vdc_hz_timeout * MAX(1, mul))) + +/* + * Macros to manipulate Descriptor Ring variables in the soft state + * structure. + */ +#define VDC_GET_NEXT_REQ_ID(vdc) ((vdc->req_id)++) + +#define VDC_GET_DRING_ENTRY_PTR(vdc, idx) \ + (vd_dring_entry_t *)(vdc->dring_mem_info.vaddr + \ + (idx * vdc->dring_entry_size)) + +#define VDC_MARK_DRING_ENTRY_FREE(vdc, idx) \ + { \ + vd_dring_entry_t *dep = NULL; \ + ASSERT(vdc != NULL); \ + ASSERT((idx >= 0) && (idx < VD_DRING_LEN)); \ + ASSERT(vdc->dring_mem_info.vaddr != NULL); \ + dep = (vd_dring_entry_t *)(vdc->dring_mem_info.vaddr + \ + (idx * vdc->dring_entry_size)); \ + ASSERT(dep != NULL); \ + dep->hdr.dstate = VIO_DESC_FREE; \ + } + +/* Initialise the Session ID and Sequence Num in the DRing msg */ +#define VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc) \ + ASSERT(vdc != NULL); \ + dmsg.tag.vio_sid = vdc->session_id; \ + dmsg.seq_num = ++(vdc->seq_num); + +/* + * The states the message processing thread can be in. + */ +typedef enum vdc_thr_state { + VDC_THR_RUNNING, /* thread is running & ready to process */ + VDC_THR_STOP, /* The detach func signals the thread to stop */ + VDC_THR_DONE /* Thread has exited */ +} vdc_thr_state_t; + +/* + * Local Descriptor Ring entry + * + * vdc creates a Local (private) descriptor ring the same size as the + * public descriptor ring it exports to vds. 
+ */ +typedef struct vdc_local_desc { + kmutex_t lock; /* protects all fields */ + kcondvar_t cv; /* indicate processing done */ + int flags; /* Dring entry state, etc */ + int operation; /* VD_OP_xxx to be performed */ + caddr_t addr; /* addr passed in by consumer */ + caddr_t align_addr; /* used if addr non-aligned */ + struct buf *buf; /* buf passed to strategy() */ + ldc_mem_handle_t desc_mhdl; /* Mem handle of buf */ + vd_dring_entry_t *dep; /* public Dring Entry Pointer */ +} vdc_local_desc_t; + +/* + * vdc soft state structure + */ +typedef struct vdc { + kmutex_t attach_lock; /* used by CV which waits in attach */ + kcondvar_t attach_cv; /* signal when attach can finish */ + + kmutex_t lock; /* protects next 2 sections of vars */ + kcondvar_t cv; /* signal when upper layers can send */ + + dev_info_t *dip; /* device info pointer */ + int instance; /* driver instance number */ + int initialized; /* keeps track of what's init'ed */ + int open; /* count of outstanding opens */ + int dkio_flush_pending; /* # outstanding DKIO flushes */ + + uint64_t session_id; /* common ID sent with all messages */ + uint64_t seq_num; /* most recent sequence num generated */ + uint64_t seq_num_reply; /* Last seq num ACK/NACK'ed by vds */ + uint64_t req_id; /* Most recent Request ID generated */ + vd_state_t state; /* Current handshake state */ + vd_disk_type_t vdisk_type; /* type of device/disk being imported */ + uint64_t vdisk_size; /* device size in bytes */ + uint64_t max_xfer_sz; /* maximum block size of a descriptor */ + uint64_t block_size; /* device block size used */ + struct dk_cinfo *cinfo; /* structure to store DKIOCINFO data */ + struct dk_minfo *minfo; /* structure for DKIOCGMEDIAINFO data */ + struct vtoc *vtoc; /* structure to store VTOC data */ + + /* + * The mutex 'msg_proc_lock' protects the following group of fields. 
+ * + * The callback function checks to see if LDC triggered it due to + * there being data available and the callback will signal to + * the message processing thread waiting on 'msg_proc_cv'. + */ + kmutex_t msg_proc_lock; + kcondvar_t msg_proc_cv; + boolean_t msg_pending; + vdc_thr_state_t msg_proc_thr_state; + kthread_t *msg_proc_thr_id; + + /* + * The mutex 'dring_lock' protects the following group of fields. + */ + kmutex_t dring_lock; + ldc_mem_info_t dring_mem_info; + uint_t dring_curr_idx; + uint32_t dring_len; + uint32_t dring_cookie_count; + uint32_t dring_entry_size; + ldc_mem_cookie_t *dring_cookie; + uint64_t dring_ident; + + vdc_local_desc_t *local_dring; + + uint64_t ldc_id; + ldc_status_t ldc_state; + ldc_handle_t ldc_handle; + ldc_dring_handle_t ldc_dring_hdl; +} vdc_t; + +/* + * Debugging macros + */ +#ifdef DEBUG +extern int vdc_msglevel; + +#define PR0 if (vdc_msglevel > 0) \ + vdc_msg + +#define PR1 if (vdc_msglevel > 1) \ + vdc_msg + +#define PR2 if (vdc_msglevel > 2) \ + vdc_msg + +#define VDC_DUMP_DRING_MSG(dmsgp) \ + vdc_msg("sq:%d start:%d end:%d ident:%x\n", \ + dmsgp->seq_num, dmsgp->start_idx, \ + dmsgp->end_idx, dmsgp->dring_ident); + +#else /* !DEBUG */ +#define PR0(...) +#define PR1(...) +#define PR2(...) + +#define VDC_DUMP_DRING_MSG(dmsgp) + +#endif /* !DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VDC_H */ diff --git a/usr/src/uts/sun4v/sys/vdsk_common.h b/usr/src/uts/sun4v/sys/vdsk_common.h new file mode 100644 index 0000000000..7cfffda28c --- /dev/null +++ b/usr/src/uts/sun4v/sys/vdsk_common.h @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VDSK_COMMON_H +#define _VDSK_COMMON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This header file contains the private LDoms Virtual Disk (vDisk) definitions + * common to both the server (vds) and the client (vdc) + */ + +#include <sys/machparam.h> +#include <sys/vtoc.h> + +#include <sys/ldc.h> +#include <sys/vio_common.h> +#include <sys/vio_mailbox.h> + +/* + * vDisk definitions + */ + +/* + * The number of Descriptor Ring entries + * + * Constraints: + * - overall DRing size must be greater than 8K (MMU_PAGESIZE) + * - overall DRing size should be 8K aligned (desirable but not enforced) + * - DRing entry must be 8 byte aligned + */ +#define VD_DRING_LEN 512 + +/* + * + */ +#define VD_DRING_ENTRY_SZ (sizeof (vd_dring_entry_t) + \ + (sizeof (ldc_mem_cookie_t) * (VD_MAX_COOKIES - 1))) + +/* + * The maximum block size we can transmit using one Descriptor Ring entry + * + * Currently no FS uses more than 128K and it doesn't look like they + * will either as there is no perf gain to be had by larger values. + * ( see ZFS comment at definition of SPA_MAXBLOCKSIZE ). + * + * We choose 256K to give us some headroom. 
+ */ +#define VD_MAX_BLOCK_SIZE (256 * 1024) + +#define VD_MAX_COOKIES ((VD_MAX_BLOCK_SIZE / PAGESIZE) + 1) +#define VD_USEC_TIMEOUT 20000 +#define VD_LDC_IDS_PROP "ldc-ids" +#define VD_LDC_QLEN 32 + +/* + * Flags used by ioctl routines to indicate if a copyin/copyout is needed + */ +#define VD_COPYOUT 0x1 +#define VD_COPYIN 0x2 + +/* + * vDisk operations on physical devices + */ +#define VD_OP_BREAD 0x01 /* Block Read */ +#define VD_OP_BWRITE 0x02 /* Block Write */ +#define VD_OP_FLUSH 0x03 /* Flush disk write cache contents */ +#define VD_OP_GET_WCE 0x04 /* Get disk W$ status */ +#define VD_OP_SET_WCE 0x05 /* Enable/Disable disk W$ */ +#define VD_OP_GET_VTOC 0x06 /* Get VTOC */ +#define VD_OP_SET_VTOC 0x07 /* Set VTOC */ +#define VD_OP_GET_DISKGEOM 0x08 /* Get disk geometry */ +#define VD_OP_SET_DISKGEOM 0x09 /* Set disk geometry */ +#define VD_OP_SCSICMD 0x0a /* SCSI control command */ +#define VD_OP_MASK 0xFF /* mask of all possible operations */ +#define VD_OP_COUNT 10 /* Number of operations */ + +/* + * Definitions of the various ways vds can export disk support to vdc. 
+ */ +typedef enum vd_disk_type { + VD_DISK_TYPE_UNK = 0, /* Unknown device type */ + VD_DISK_TYPE_SLICE, /* slice in block device */ + VD_DISK_TYPE_DISK /* entire disk (slice 2) */ +} vd_disk_type_t; + +/* + * vDisk Descriptor payload + */ +typedef struct vd_dring_payload { + uint64_t req_id; /* The request ID being processed */ + uint8_t operation; /* operation for server to perform */ + uint8_t slice; /* The disk slice being accessed */ + uint16_t resv1; /* padding */ + uint32_t status; /* "errno" of server operation */ + uint64_t addr; /* LP64 diskaddr_t (block I/O) */ + uint64_t nbytes; /* LP64 size_t */ + uint32_t ncookies; /* Number of cookies used */ + uint32_t resv2; /* padding */ + + ldc_mem_cookie_t cookie[1]; /* variable sized array */ +} vd_dring_payload_t; + + +/* + * vDisk Descriptor entry + */ +typedef struct vd_dring_entry { + vio_dring_entry_hdr_t hdr; /* common header */ + vd_dring_payload_t payload; /* disk specific data */ +} vd_dring_entry_t; + + +/* + * vDisk control operation structures + * + * XXX FIXME - future support - add structures for VD_OP_XXXX + */ + +/* + * VTOC message + * + * vDisk Get Volume Table of Contents (VD_OP_GET_VTOC) + * + */ +typedef struct vd_partition { + uint16_t p_tag; /* ID tag of partition */ + uint16_t p_flag; /* permision flags */ + uint32_t reserved; /* padding */ + int64_t p_start; /* start sector no of partition */ + int64_t p_size; /* # of blocks in partition */ +} vd_partition_t; + +typedef struct vd_vtoc { + uint8_t v_volume[LEN_DKL_VVOL]; /* volume name */ + uint16_t v_sectorsz; /* sector size in bytes */ + uint16_t v_nparts; /* num of partitions */ + uint32_t reserved; /* padding */ + uint8_t v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ + +} vd_vtoc_t; + + +/* + * vDisk Get Geometry (VD_OP_GET_GEOM) + */ +typedef struct vd_geom { + uint16_t dkg_ncyl; /* # of data cylinders */ + uint16_t dkg_acyl; /* # of alternate cylinders */ + uint16_t dkg_bcyl; /* cyl offset (for fixed head area) */ + 
uint16_t dkg_nhead; /* # of heads */ + uint16_t dkg_nsect; /* # of data sectors per track */ + uint16_t dkg_intrlv; /* interleave factor */ + uint16_t dkg_apc; /* alternates per cyl (SCSI only) */ + uint16_t dkg_rpm; /* revolutions per minute */ + uint16_t dkg_pcyl; /* # of physical cylinders */ + uint16_t dkg_write_reinstruct; /* # sectors to skip, writes */ + uint16_t dkg_read_reinstruct; /* # sectors to skip, reads */ +} vd_geom_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDSK_COMMON_H */ diff --git a/usr/src/uts/sun4v/sys/vdsk_mailbox.h b/usr/src/uts/sun4v/sys/vdsk_mailbox.h new file mode 100644 index 0000000000..553ac2c9b6 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vdsk_mailbox.h @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VDSK_MAILBOX_H +#define _VDSK_MAILBOX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This header file contains the private LDoms Virtual Disk (vDisk) mailbox + * definitions common to both the server (vds) and the client (vdc) + */ + +#include <sys/vio_mailbox.h> +#include <sys/vio_common.h> +#include <sys/vdsk_common.h> + +/* + * Definition of the various states the vDisk state machine can + * be in during the handshake between vdc and vds. + */ +typedef enum vd_state { + VD_STATE_INIT = 0, + VD_STATE_VER, + VD_STATE_ATTR, + VD_STATE_DRING, + VD_STATE_RDX, + VD_STATE_DATA +} vd_state_t; + +#define VD_VER_MAJOR 0x1 +#define VD_VER_MINOR 0x0 + +/* + * vDisk device attributes information message. + * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_ATTR_INFO + */ +typedef struct vd_attr_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* vdisk-attribute-specific payload */ + uint8_t xfer_mode; /* data exchange method. */ + uint8_t vdisk_type; /* disk, slice, read-only, etc. */ + uint16_t resv; /* padding */ + uint32_t vdisk_block_size; /* bytes per disk block */ + uint64_t operations; /* bit-field of server supported ops */ + uint64_t vdisk_size; /* size for Nblocks property. */ + uint64_t max_xfer_sz; /* maximum block transfer size */ + + uint64_t resv2[VIO_PAYLOAD_ELEMS - 4]; /* padding */ +} vd_attr_msg_t; + +/* + * vDisk inband descriptor message. + * + * For clients that do not use descriptor rings, the descriptor contents + * are sent as part of an inband message. 
+ */ +typedef struct vd_dring_inband_msg { + vio_inband_desc_msg_hdr_t hdr; + vd_dring_payload_t payload; +} vd_dring_inband_msg_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDSK_MAILBOX_H */ diff --git a/usr/src/uts/sun4v/sys/vio_common.h b/usr/src/uts/sun4v/sys/vio_common.h new file mode 100644 index 0000000000..f2a6d7968b --- /dev/null +++ b/usr/src/uts/sun4v/sys/vio_common.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VIO_COMMON_H +#define _SYS_VIO_COMMON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Common header for VIO descriptor ring entries + */ +typedef struct vio_dring_entry_hdr { + uint8_t dstate; /* Current state of Dring entry */ + uint8_t ack:1; /* 1 => receiver must ACK when DONE */ + + /* + * Padding. 
+ */ + uint16_t resv[3]; +} vio_dring_entry_hdr_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VIO_COMMON_H */ diff --git a/usr/src/uts/sun4v/sys/vio_mailbox.h b/usr/src/uts/sun4v/sys/vio_mailbox.h new file mode 100644 index 0000000000..eef1e1d5ef --- /dev/null +++ b/usr/src/uts/sun4v/sys/vio_mailbox.h @@ -0,0 +1,331 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_VIO_MAILBOX_H +#define _SYS_VIO_MAILBOX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/ldc.h> + +/* Message types */ +#define VIO_TYPE_CTRL 0x1 +#define VIO_TYPE_DATA 0x2 +#define VIO_TYPE_ERR 0x4 + +/* Message sub-types */ +#define VIO_SUBTYPE_INFO 0x1 +#define VIO_SUBTYPE_ACK 0x2 +#define VIO_SUBTYPE_NACK 0x4 + +/* + * VIO specific control envelopes: 0x0000 - 0x00FF + * VNET specific control envelopes: 0x0100 - 0x01FF + * VDSK specific control envelopes: 0x0200 - 0x02FF + * UNUSED envelopes: 0x0300 - 0x0FFF + */ + +/* + * Generic Control Subtype Envelopes: + * type == VIO_TYPE_CTRL + * subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * + * 0x0000 - 0x003F + */ +#define VIO_VER_INFO 0x0001 +#define VIO_ATTR_INFO 0x0002 +#define VIO_DRING_REG 0x0003 +#define VIO_DRING_UNREG 0x0004 +#define VIO_RDX 0x0005 + +/* + * Generic subtype Data envelopes + * type == VIO_TYPE_DATA + * subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * + * 0x0040 - 0x007F + */ +#define VIO_PKT_DATA 0x0040 +#define VIO_DESC_DATA 0x0041 +#define VIO_DRING_DATA 0x0042 + + +/* + * Generic subtype Error envelopes + * type == VIO_TYPE_ERR + * subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * + * 0x0080 - 0x00FF + * + * Currently unused + */ + +/* + * Supported Device Types + */ +#define VDEV_NETWORK 0x1 +#define VDEV_NETWORK_SWITCH 0x2 +#define VDEV_DISK 0x3 +#define VDEV_DISK_SERVER 0x4 + +/* addr_type */ +#define ADDR_TYPE_MAC 0x1 /* XXX move to vnet_mailbox.h ? 
*/ + +/* + * VIO data transfer mode + */ +#define VIO_PKT_MODE 0x1 +#define VIO_DESC_MODE 0x2 +#define VIO_DRING_MODE 0x3 + +/* + * VIO Descriptor Ring registration options + * (intended use for Descriptor Ring) + */ +#define VIO_TX_DRING 0x1 +#define VIO_RX_DRING 0x2 + +/* + * Size of message payload + */ +#define VIO_MSGTAG_SZ (sizeof (vio_msg_tag_t)) /* bytes */ +#define VIO_PAYLOAD_SZ (LDC_PAYLOAD_SIZE_UNRELIABLE - VIO_MSGTAG_SZ) +#define VIO_PAYLOAD_ELEMS (VIO_PAYLOAD_SZ / LDC_ELEM_SIZE) /* num words */ + +/* + * VIO device message tag. + * + * These 64 bits are used as a common header for all VIO message types. + */ +typedef union vio_msg_tag { + struct { + uint8_t _msgtype; + uint8_t _subtype; + uint16_t _subtype_env; + uint32_t _sid; /* session id */ + } _hdr; + uint64_t tagword; +} vio_msg_tag_t; + +#define vio_msgtype _hdr._msgtype +#define vio_subtype _hdr._subtype +#define vio_subtype_env _hdr._subtype_env +#define vio_sid _hdr._sid + +/* + * VIO version negotation message. + * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_VER_INFO + */ +typedef struct vio_ver_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* version specific payload */ + uint32_t ver_major:16, /* major version number */ + ver_minor:16; /* minor version number */ + + uint8_t dev_class; /* type of device */ + + /* padding */ + uint8_t resv1; + uint16_t resv2; + uint64_t resv3[VIO_PAYLOAD_ELEMS - 1]; +} vio_ver_msg_t; + +/* + * VIO Descriptor Ring Register message. 
+ * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_DRING_REG + */ +typedef struct vio_dring_reg_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* Descriptor ring information */ + uint64_t dring_ident; /* =0 for SUBTYPE_INFO msg */ + uint32_t num_descriptors; /* # of desc in the ring */ + uint32_t descriptor_size; /* size of each entry */ + uint16_t options; /* intended use */ + uint16_t resv; /* padding */ + uint32_t ncookies; /* # cookies exporting ring */ + + /* + * cookie is a variable sized array. If the number of cookies is 1, + * the message can be sent by LDC without fragmentation. + */ + ldc_mem_cookie_t cookie[1]; +} vio_dring_reg_msg_t; + +/* + * VIO Descriptor Ring Unregister message. + * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_DRING_UNREG + */ +typedef struct vio_dring_unreg_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* Descriptor ring information */ + uint64_t dring_ident; + uint64_t resv[VIO_PAYLOAD_ELEMS - 1]; +} vio_dring_unreg_msg_t; + + +/* + * Definition of a generic VIO message (with no payload) which can be cast + * to other message types. + */ +typedef struct vio_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* no payload */ + uint64_t resv[VIO_PAYLOAD_ELEMS]; +} vio_msg_t; + +/* + * VIO Ready to Receive message. + * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK} + * tag.subtype_env == VIO_RDX + */ +typedef vio_msg_t vio_rdx_msg_t; + +/* + * VIO error message. + * + * tag.msgtype == VIO_TYPE_ERR + * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == TBD + */ +typedef vio_msg_t vio_err_msg_t; + +/* + * VIO descriptor ring data message. 
+ * + * tag.msgtype == VIO_TYPE_DATA + * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_DRING_DATA + */ +typedef struct vio_dring_msg { + /* Common message tag */ + vio_msg_tag_t tag; + + /* Data dring info */ + uint64_t seq_num; + uint64_t dring_ident; /* ident of modified DRing */ + uint32_t start_idx; /* Indx of first updated elem */ + int32_t end_idx; /* Indx of last updated elem */ + + /* + * Padding. + */ + uint64_t resv[VIO_PAYLOAD_ELEMS - 3]; +} vio_dring_msg_t; + +/* + * VIO Common header for inband descriptor messages. + * + * Clients will then combine this header with a device specific payload. + */ +typedef struct vio_inband_desc_msg_hdr { + /* Common message tag */ + vio_msg_tag_t tag; + + uint64_t seq_num; /* sequence number */ + uint64_t desc_handle; /* opaque descriptor handle */ +} vio_inband_desc_msg_hdr_t; + +/* + * VIO raw data message. + * + * tag.msgtype == VIO_TYPE_DATA + * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_PKT_DATA + * + * Note the data payload is so small to keep this message + * within the size LDC can cope with without fragmentation. + * If it turns out in the future that we are not concerned + * with fragmentation then we can increase the size of this + * field. + */ +typedef struct vio_raw_data_msg { + /* Common message tag */ + vio_msg_tag_t tag; + + /* Raw data packet payload */ + uint64_t seq_num; /* sequence number */ + uint64_t data[VIO_PAYLOAD_ELEMS - 1]; +} vio_raw_data_msg_t; + +/* + * Definitions of the valid states a Descriptor can be in. 
+ */ +#define VIO_DESC_FREE 0x1 +#define VIO_DESC_READY 0x2 +#define VIO_DESC_ACCEPTED 0x3 +#define VIO_DESC_DONE 0x4 +#define VIO_DESC_MASK 0xf + +/* Macro to check that the state in variable supplied is a valid DRing state */ +#define VIO_IS_VALID_DESC_STATE(flag) \ + (((flag | VIO_DESC_MASK) == VIO_DESC_FREE) || \ + ((flag | VIO_DESC_MASK) == VIO_DESC_READY) || \ + ((flag | VIO_DESC_MASK) == VIO_DESC_ACCEPTED) || \ + ((flag | VIO_DESC_MASK) == VIO_DESC_READY)) + +#define VIO_SET_DESC_STATE(flag, state) \ + { \ + flag &= (flag | ~VIO_DESC_MASK); \ + flag |= (state & VIO_DESC_MASK); \ + } + +#define VIO_GET_DESC_STATE(flag) ((flag) & VIO_DESC_MASK) + +/* Macro to populate the generic fields of the DRing data msg */ +#define VIO_INIT_DRING_DATA_TAG(dmsg) \ + dmsg.tag.vio_msgtype = VIO_TYPE_DATA; \ + dmsg.tag.vio_subtype = VIO_SUBTYPE_INFO; \ + dmsg.tag.vio_subtype_env = VIO_DRING_DATA; + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VIO_MAILBOX_H */ diff --git a/usr/src/uts/sun4v/sys/vldc.h b/usr/src/uts/sun4v/sys/vldc.h new file mode 100644 index 0000000000..112f17c2a7 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vldc.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VLDC_H +#define _VLDC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/ioctl.h> + +/* Channel IOCTL Commands */ + +#define VLDC_IOCTL_SHIFT 8 +#define VLDC_IOCTL ('1' << VLDC_IOCTL_SHIFT) + +#define VLDC_IOCTL_OPT_OP (VLDC_IOCTL | 0x1) /* ctrl op */ +#define VLDC_IOCTL_READ_COOKIE (VLDC_IOCTL | 0x2) /* read cookie */ +#define VLDC_IOCTL_WRITE_COOKIE (VLDC_IOCTL | 0x3) /* write cookie */ + +/* supported ctrl operations */ +#define VLDC_OP_GET 0x1 /* get specified value */ +#define VLDC_OP_SET 0x2 /* set specified value */ + +/* supported ctrl operation options */ +#define VLDC_OPT_MTU_SZ 0x1 /* MTU */ +#define VLDC_OPT_STATUS 0x2 /* port status */ +#define VLDC_OPT_MODE 0x3 /* port channel mode */ + +/* values returned by VLDC_OPT_OP_STATUS */ +#define VLDC_PORT_CLOSED 0x1 /* port is closed */ +#define VLDC_PORT_OPEN 0x2 /* port is already open */ +#define VLDC_PORT_READY 0x4 /* port is open and ready */ + +/* + * Values for VLDC_OPT_MODE are defined in ldc.h. + */ + +/* + * Structure that is used by vldc driver and all its clients to communicate + * the type and nature of the option as well as for clients to get port + * status. + */ +typedef struct vldc_opt_op { + int32_t op_sel; /* operation selector(ex: GET) */ + int32_t opt_sel; /* option selector (ex: MTU) */ + uint32_t opt_val; /* option value to set or returned */ +} vldc_opt_op_t; + +/* + * Structure that is used by the LDom manager to download instruction + * sequences and read/write new machine descriptions. 
+ */ +typedef struct vldc_data { + uint64_t src_addr; /* source address */ + uint64_t dst_addr; /* destination address */ + uint64_t length; /* size of transfer */ +} vldc_data_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VLDC_H */ diff --git a/usr/src/uts/sun4v/sys/vldc_impl.h b/usr/src/uts/sun4v/sys/vldc_impl.h new file mode 100644 index 0000000000..8610344b42 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vldc_impl.h @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VLDC_IMPL_H +#define _VLDC_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/stream.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ldc.h> +#include <sys/vldc.h> + +/* default values */ +#define VLDC_DEFAULT_MTU 0x800 /* default mtu size */ + +/* VLDC limits */ +#define VLDC_MAX_COOKIE 0x40000 /* max. 
size of xfer to/from HV */ +#define VLDC_MAX_MTU 0x40000 /* 256K */ +#define VLDC_MAX_PORTS 0x800 +#define VLDC_MAX_MINORS VLDC_MAX_PORTS +#define VLDC_QUEUE_LEN 0x80 + +#define VLDC_MINOR_MASK (VLDC_MAX_PORTS - 1) +#define VLDC_INST_SHIFT 11 + +/* get port number from minor number */ +#define VLDCPORT(vldcp, minor) \ + ((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK].portno) + +/* get minor table entry from minor number */ +#define VLDCMINOR(vldcp, minor) \ + (&((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK])) + +/* get instance number from minor number */ +#define VLDCINST(minor) ((minor) >> VLDC_INST_SHIFT) + +/* indicates an invalid port number */ +#define VLDC_INVALID_PORTNO ((uint_t)-1) + +/* + * Minor node number to port number mapping table. + * + * The lock field in the vldc_minor structure is used to serialize operations + * on the port associated with the minor node. It also protects the minor node + * in_use field which is used to track the number of active users of the minor + * node. Driver ops will either hold the lock over the whole operation or + * will increment (and then decrement) the in use count if they need to + * release and re-acquire the lock, e.g. when copying data in from or out to + * userland. When the MDEG framework calls into the driver via the callback to + * remove a port, the driver must wait until the in use count for the minor + * node associated with the port drops to zero, before it can remove the + * port. 
+ */ +typedef struct vldc_minor { + kmutex_t lock; /* protects port/in_use count */ + kcondvar_t cv; /* for waiting on in use */ + uint_t in_use; /* in use counter */ + uint_t portno; /* port number */ + char sname[MAXPATHLEN]; /* service name */ +} vldc_minor_t; + +typedef struct vldc_port { + uint_t number; /* port number */ + uint32_t status; /* port status */ + vldc_minor_t *minorp; /* minor table entry pointer */ + uint32_t mtu; /* port mtu */ + caddr_t send_buf; /* send buffer */ + caddr_t recv_buf; /* receive buffer */ + + uint64_t ldc_id; /* Channel number */ + ldc_handle_t ldc_handle; /* Channel handle */ + ldc_mode_t ldc_mode; /* Channel mode */ + + boolean_t is_stream; /* streaming mode */ + boolean_t hanged_up; /* port hanged up */ + + struct pollhead poll; /* for poll */ +} vldc_port_t; + +/* + * vldc driver's soft state structure + */ +typedef struct vldc { + kmutex_t lock; /* serializes detach and MDEG */ + boolean_t detaching; /* true iff busy detaching */ + dev_info_t *dip; /* dev_info */ + mdeg_node_spec_t *inst_spec; /* vldc instance specifier */ + mdeg_handle_t mdeg_hdl; /* MD event handle */ + + uint_t num_ports; + vldc_port_t port[VLDC_MAX_PORTS]; + + /* table for assigned minors */ + vldc_minor_t minor_tbl[VLDC_MAX_MINORS]; + + /* number of minors already assigned */ + uint_t minors_assigned; +} vldc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VLDC_IMPL_H */ diff --git a/usr/src/uts/sun4v/sys/vnet.h b/usr/src/uts/sun4v/sys/vnet.h new file mode 100644 index 0000000000..b7b111eb61 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnet.h @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VNET_H +#define _VNET_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VNET_SUCCESS (0) /* successful return */ +#define VNET_FAILURE (-1) /* unsuccessful return */ + +#define KMEM_FREE(_p) kmem_free((_p), sizeof (*(_p))) + +#define VNET_NTXDS 512 /* power of 2 tx descriptors */ +#define VNET_RECLAIM_LOWAT 32 /* tx reclaim low watermark */ +#define VNET_RECLAIM_HIWAT (512 - 32) /* tx reclaim high watermark */ +#define VNET_LDCWD_INTERVAL 1000 /* watchdog freq in msec */ +#define VNET_LDCWD_TXTIMEOUT 1000 /* tx timeout in msec */ +#define VNET_LDC_QLEN 1024 /* ldc qlen */ + +/* + * vnet proxy transport layer information. There is one instance of this for + * every transport being used by a vnet device and a list of these transports + * is maintained by vnet. + */ +typedef struct vp_tl { + struct vp_tl *nextp; /* next in list */ + mac_t *macp; /* transport ops */ + char name[LIFNAMSIZ]; /* device name */ + major_t major; /* driver major # */ + uint_t instance; /* dev instance */ +} vp_tl_t; + +/* + * Forwarding database (FDB) entry, used by vnet to provide switching + * functionality. Each fdb entry corresponds to a destination vnet device + * within the ldoms which is directly reachable by invoking a transmit + * function provided by a vnet proxy transport layer. 
Currently, the generic + * transport layer adds/removes/modifies entries in fdb. + */ +typedef struct fdb { + struct fdb *nextp; /* next entry in the list */ + uint8_t macaddr[ETHERADDRL]; /* destination mac address */ + mac_tx_t m_tx; /* transmit function */ + void *txarg; /* arg to the transmit func */ +} fdb_t; + +/* FDB hash queue head */ +typedef struct fdbf_s { + fdb_t *headp; /* head of fdb entries */ + krwlock_t rwlock; /* protect the list */ +} fdb_fanout_t; + +#define VNET_NFDB_HASH 4 /* default number of hash queues in fdb */ +#define VNET_NFDB_HASH_MAX 32 /* max number of hash queues in fdb */ + +/* Hash calculation using the mac address */ +#define MACHASH(a, n) ((*(((uchar_t *)(a)) + 0) ^ \ + *(((uchar_t *)(a)) + 1) ^ \ + *(((uchar_t *)(a)) + 2) ^ \ + *(((uchar_t *)(a)) + 3) ^ \ + *(((uchar_t *)(a)) + 4) ^ \ + *(((uchar_t *)(a)) + 5)) % (uint32_t)n) + +/* rwlock macros */ +#define READ_ENTER(x) rw_enter(x, RW_READER) +#define WRITE_ENTER(x) rw_enter(x, RW_WRITER) +#define RW_EXIT(x) rw_exit(x) + +/* + * vnet instance state information + */ +typedef struct vnet { + int instance; /* instance # */ + dev_info_t *dip; /* dev_info */ + struct vnet *nextp; /* next in list */ + mac_t *macp; /* MAC - macinfo */ + uchar_t vendor_addr[ETHERADDRL]; /* orig macadr */ + uchar_t curr_macaddr[ETHERADDRL]; /* current macadr */ + vp_tl_t *tlp; /* list of vp_tl */ + krwlock_t trwlock; /* lock for vp_tl list */ + char vgen_name[MAXNAMELEN]; /* name of generic tl */ + fdb_fanout_t *fdbhp; /* fdb hash queues */ + int nfdb_hash; /* num fdb hash queues */ +} vnet_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VNET_H */ diff --git a/usr/src/uts/sun4v/sys/vnet_common.h b/usr/src/uts/sun4v/sys/vnet_common.h new file mode 100644 index 0000000000..feed7025a2 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnet_common.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the 
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VNET_COMMON_H +#define _VNET_COMMON_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/vio_common.h> +#include <sys/vio_mailbox.h> +#include <sys/ethernet.h> + +/* + * This header file contains definitions common to LDoms Virtual Network + * server (vsw) and client (vnet). + */ + +/* max # of cookies per frame size */ +#define MAX_COOKIES ((ETHERMAX >> MMU_PAGESHIFT) + 2) + +/* initial send sequence number */ +#define VNET_ISS 0x1 + +/* vnet descriptor */ +typedef struct vnet_public_desc { + vio_dring_entry_hdr_t hdr; /* descriptor header */ + uint32_t nbytes; /* data length */ + uint32_t ncookies; /* number of data cookies */ + ldc_mem_cookie_t memcookie[MAX_COOKIES]; /* data cookies */ +} vnet_public_desc_t; + +/* + * VIO in-band descriptor. Used by those vio clients + * such as OBP who do not use descriptor rings. 
+ */ +typedef struct vio_ibnd_desc { + vio_inband_desc_msg_hdr_t hdr; + + /* payload */ + uint32_t nbytes; + uint32_t ncookies; + ldc_mem_cookie_t memcookie[MAX_COOKIES]; +} vio_ibnd_desc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VNET_COMMON_H */ diff --git a/usr/src/uts/sun4v/sys/vnet_gen.h b/usr/src/uts/sun4v/sys/vnet_gen.h new file mode 100644 index 0000000000..2ce1f390d8 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnet_gen.h @@ -0,0 +1,337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VNET_GEN_H +#define _VNET_GEN_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VGEN_SUCCESS (0) /* successful return */ +#define VGEN_FAILURE (-1) /* unsuccessful return */ + +#define VGEN_NUM_VER 1 /* max # of vgen versions */ + +#define VGEN_LOCAL 1 /* local ldc end-point */ +#define VGEN_PEER 2 /* peer ldc end-point */ + +/* vgen_t flags */ +#define VGEN_STOPPED 0x0 +#define VGEN_STARTED 0x1 + +#define KMEM_FREE(_p) kmem_free((_p), sizeof (*(_p))) + +#define VGEN_INIT_MCTAB_SIZE 16 /* initial size of multicast table */ + +#define READ_ENTER(x) rw_enter(x, RW_READER) +#define WRITE_ENTER(x) rw_enter(x, RW_WRITER) +#define RW_EXIT(x) rw_exit(x) + +/* channel flags */ +#define CHANNEL_ATTACHED 0x1 +#define CHANNEL_STARTED 0x2 + +/* transmit return values */ +#define VGEN_TX_SUCCESS 0 /* transmit success */ +#define VGEN_TX_FAILURE 1 /* transmit failure */ +#define VGEN_TX_NORESOURCES 2 /* out of tbufs/txds */ + +/* private descriptor flags */ +#define VGEN_PRIV_DESC_FREE 0x0 /* desc is available */ +#define VGEN_PRIV_DESC_BUSY 0x1 /* desc in use */ + +#define LDC_TO_VNET(ldcp) ((ldcp)->portp->vgenp->vnetp) +#define LDC_TO_VGEN(ldcp) ((ldcp)->portp->vgenp) + +/* get the address of next tbuf */ +#define NEXTTBUF(ldcp, tbufp) (((tbufp) + 1) == (ldcp)->tbufendp \ + ? 
(ldcp)->tbufp : ((tbufp) + 1)) + +/* increment recv index */ +#define INCR_RXI(i, ldcp) \ + ((i) = (((i) + 1) & ((ldcp)->num_rxds - 1))) + +/* decrement recv index */ +#define DECR_RXI(i, ldcp) \ + ((i) = (((i) - 1) & ((ldcp)->num_rxds - 1))) + +/* increment tx index */ +#define INCR_TXI(i, ldcp) \ + ((i) = (((i) + 1) & ((ldcp)->num_txds - 1))) + +/* decrement tx index */ +#define DECR_TXI(i, ldcp) \ + ((i) = (((i) - 1) & ((ldcp)->num_txds - 1))) + +/* bounds check rx index */ +#define CHECK_RXI(i, ldcp) \ + (((i) >= 0) && ((i) < (ldcp)->num_rxds)) + +/* bounds check tx index */ +#define CHECK_TXI(i, ldcp) \ + (((i) >= 0) && ((i) < (ldcp)->num_txds)) + +/* private descriptor */ +typedef struct vgen_priv_desc { + uint64_t flags; /* flag bits */ + vnet_public_desc_t *descp; /* associated public desc */ + ldc_mem_handle_t memhandle; /* mem handle for data */ + mblk_t *mp; /* associated packet */ + uint64_t datap; /* mp->b_rptr */ + uint64_t datalen; /* total actual datalen */ + uint64_t seqnum; /* sequence number of pkt */ + uint64_t ncookies; /* num ldc_mem_cookies */ + ldc_mem_cookie_t memcookie[MAX_COOKIES]; /* data cookies */ +} vgen_private_desc_t; + +/* + * Handshake parameters (per vio_mailbox.h) of each ldc end point, used + * during handshake negotiation. 
+ */ +typedef struct vgen_handshake_params { + /* version specific params */ + uint32_t ver_major:16, + ver_minor:16; /* major, minor version */ + uint8_t dev_class; /* device class */ + + /* attributes specific params */ + uint64_t mtu; /* max transfer unit size */ + uint64_t addr; /* address of the device */ + uint8_t addr_type; /* type of address */ + uint8_t xfer_mode; /* SHM or PKT */ + uint16_t ack_freq; /* dring data ack freq */ + + /* descriptor ring params */ + uint32_t num_desc; /* # of descriptors in ring */ + uint32_t desc_size; /* size of descriptor */ + ldc_mem_cookie_t dring_cookie; /* desc ring cookie */ + uint32_t num_dcookies; /* # of dring cookies */ + uint64_t dring_ident; /* ident=0 for INFO msg */ + boolean_t dring_ready; /* dring ready flag */ +} vgen_hparams_t; + +/* version info */ +typedef struct vgen_ver { + uint32_t ver_major:16, + ver_minor:16; +} vgen_ver_t; + +typedef struct vgen_stats { + + /* Link Input/Output stats */ + uint64_t ipackets; + uint64_t ierrors; + uint64_t opackets; + uint64_t oerrors; +#if 0 + uint64_t collisions; +#endif + + /* MIB II variables */ + uint64_t rbytes; /* # bytes received */ + uint64_t obytes; /* # bytes transmitted */ + uint32_t multircv; /* # multicast packets received */ + uint32_t multixmt; /* # multicast packets for xmit */ + uint32_t brdcstrcv; /* # broadcast packets received */ + uint32_t brdcstxmt; /* # broadcast packets for xmit */ + uint32_t norcvbuf; /* # rcv packets discarded */ + uint32_t noxmtbuf; /* # xmit packets discarded */ + + /* Tx Statistics */ + uint32_t tx_no_desc; + uint32_t tx_allocb_fail; + + /* Rx Statistics */ + uint32_t rx_no_desc; + uint32_t rx_allocb_fail; + uint32_t rx_lost_pkts; + + /* Callback statistics */ + uint32_t callbacks; + uint32_t dring_data_acks; + +} vgen_stats_t; + +typedef struct vgen_kstats { + /* + * Link Input/Output stats + */ + kstat_named_t ipackets; + kstat_named_t ipackets64; + kstat_named_t ierrors; + kstat_named_t opackets; + kstat_named_t 
opackets64; + kstat_named_t oerrors; +#if 0 + kstat_named_t collisions; +#endif + /* + * required by kstat for MIB II objects(RFC 1213) + */ + kstat_named_t rbytes; /* MIB - ifInOctets */ + kstat_named_t rbytes64; + kstat_named_t obytes; /* MIB - ifOutOctets */ + kstat_named_t obytes64; + kstat_named_t multircv; /* MIB - ifInNUcastPkts */ + kstat_named_t multixmt; /* MIB - ifOutNUcastPkts */ + kstat_named_t brdcstrcv; /* MIB - ifInNUcastPkts */ + kstat_named_t brdcstxmt; /* MIB - ifOutNUcastPkts */ + kstat_named_t norcvbuf; /* MIB - ifInDiscards */ + kstat_named_t noxmtbuf; /* MIB - ifOutDiscards */ + + /* Tx Statistics */ + kstat_named_t tx_no_desc; + kstat_named_t tx_allocb_fail; + + /* Rx Statistics */ + kstat_named_t rx_no_desc; + kstat_named_t rx_allocb_fail; + kstat_named_t rx_lost_pkts; + + /* Callback statistics */ + kstat_named_t callbacks; + kstat_named_t dring_data_acks; + +} vgen_kstats_t; + +/* Channel information associated with a vgen-port */ +typedef struct vgen_ldc { + + struct vgen_ldc *nextp; /* next ldc in the list */ + struct vgen_port *portp; /* associated port */ + + /* + * Locks: + * locking hierarchy when more than one lock is held concurrently: + * cblock > txlock > tclock. 
+ */ + kmutex_t cblock; /* sync callback processing */ + kmutex_t txlock; /* sync transmits */ + kmutex_t tclock; /* tx reclaim lock */ + + /* channel info from ldc layer */ + uint64_t ldc_id; /* channel number */ + uint64_t ldc_handle; /* channel handle */ + ldc_status_t ldc_status; /* channel status */ + + /* handshake info */ + vgen_ver_t vgen_versions[VGEN_NUM_VER]; /* versions */ + int hphase; /* handshake phase */ + int hstate; /* handshake state bits */ + uint32_t local_sid; /* local session id */ + uint32_t peer_sid; /* session id of peer */ + vgen_hparams_t local_hparams; /* local handshake params */ + vgen_hparams_t peer_hparams; /* peer's handshake params */ + timeout_id_t htid; /* handshake wd timeout id */ + + /* transmit and receive descriptor ring info */ + ldc_dring_handle_t tx_dhandle; /* tx descriptor ring handle */ + ldc_mem_cookie_t tx_dcookie; /* tx descriptor ring cookie */ + ldc_dring_handle_t rx_dhandle; /* mapped rx dhandle */ + ldc_mem_cookie_t rx_dcookie; /* rx descriptor ring cookie */ + vnet_public_desc_t *txdp; /* transmit frame descriptors */ + vnet_public_desc_t *txdendp; /* txd ring end */ + vgen_private_desc_t *tbufp; /* associated tx resources */ + vgen_private_desc_t *tbufendp; /* tbuf ring end */ + vgen_private_desc_t *next_tbufp; /* next free tbuf */ + vgen_private_desc_t *cur_tbufp; /* next reclaim tbuf */ + uint64_t next_txseq; /* next tx sequence number */ + uint32_t num_txdcookies; /* # of tx dring cookies */ + uint32_t num_rxdcookies; /* # of rx dring cookies */ + uint32_t next_txi; /* next tx descriptor index */ + uint32_t num_txds; /* number of tx descriptors */ + uint32_t reclaim_lowat; /* lowat for tx reclaim */ + uint32_t reclaim_hiwat; /* hiwat for tx reclaim */ + clock_t reclaim_lbolt; /* time of last tx reclaim */ + timeout_id_t wd_tid; /* tx watchdog timeout id */ + vnet_public_desc_t *rxdp; /* receive frame descriptors */ + uint64_t next_rxseq; /* next expected recv seqnum */ + uint32_t next_rxi; /* next expected 
recv index */ + uint32_t num_rxds; /* number of rx descriptors */ + + /* misc */ + uint32_t flags; /* flags */ + boolean_t need_resched; /* reschedule tx */ + boolean_t need_ldc_reset; /* ldc_reset needed */ + boolean_t need_mcast_sync; /* sync mcast table with vsw */ + uint32_t hretries; /* handshake retry count */ + + /* channel statistics */ + vgen_stats_t *statsp; /* channel statistics */ + kstat_t *ksp; /* channel kstats */ + +} vgen_ldc_t; + +/* Channel list structure */ +typedef struct vgen_ldclist_s { + vgen_ldc_t *headp; /* head of the list */ + krwlock_t rwlock; /* sync access to the list */ + int num_ldcs; /* number of channels in the list */ +} vgen_ldclist_t; + +/* port information structure */ +typedef struct vgen_port { + struct vgen_port *nextp; /* next port in the list */ + struct vgen *vgenp; /* associated vgen_t */ + int port_num; /* port number */ + vgen_ldclist_t ldclist; /* list of ldcs for this port */ + struct ether_addr macaddr; /* mac address of peer */ +} vgen_port_t; + +/* port list structure */ +typedef struct vgen_portlist { + vgen_port_t *headp; /* head of ports */ + vgen_port_t *tailp; /* tail */ + krwlock_t rwlock; /* sync access to the port list */ +} vgen_portlist_t; + +/* vgen instance information */ +typedef struct vgen { + void *vnetp; /* associated vnet instance */ + dev_info_t *vnetdip; /* dip of vnet */ + void *vnetmacp; /* mac_t of vnet */ + uint8_t macaddr[ETHERADDRL]; /* mac addr of vnet */ + mac_resource_handle_t mrh; /* handle for mac_rx() */ + kmutex_t lock; /* synchornize ops */ + int flags; /* flags */ + vgen_portlist_t vgenports; /* Port List */ + mdeg_node_spec_t *mdeg_parentp; + mdeg_handle_t mdeg_hdl; + vgen_port_t *vsw_portp; /* port connected to vsw */ + mac_t vgenmac; /* vgen mac ops */ + struct ether_addr *mctab; /* multicast addr table */ + uint32_t mcsize; /* allocated size of mctab */ + uint32_t mccount; /* # of valid addrs in mctab */ +} vgen_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VNET_GEN_H */ 
diff --git a/usr/src/uts/sun4v/sys/vnet_mailbox.h b/usr/src/uts/sun4v/sys/vnet_mailbox.h new file mode 100644 index 0000000000..4812b6c6a6 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnet_mailbox.h @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VNET_MAILBOX_H +#define _SYS_VNET_MAILBOX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/vio_mailbox.h> +#include <sys/ethernet.h> + +/* + * VNET specific Control envelopes: 0x0100 - 0x01FF + * type == VIO_TYPE_CTRL + * subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + */ +#define VNET_MCAST_INFO 0x0101 + +/* + * Vnet/Vswitch device attributes information message. 
+ * + * tag.msgtype == VIO_TYPE_CTRL + * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VIO_ATTR_INFO + */ +typedef struct vnet_attr_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* attributes specific payload */ + uint8_t xfer_mode; /* data transfer mode */ + uint8_t addr_type; /* device address type */ + uint16_t ack_freq; /* ack after rcving # of pkts */ + uint32_t resv1; /* padding */ + + uint64_t addr; /* device address */ + uint64_t mtu; /* maximum data xfer unit */ + + /* padding to align things */ + uint64_t resv2[3]; + +} vnet_attr_msg_t; + +/* + * Vnet/Vswitch enable/disable multicast address msg + * + * tag.msgtype == VIO_TYPE_CTRL + * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK} + * tag.subtype_env == VNET_MCAST_INFO + */ +#define VNET_NUM_MCAST 7 /* max # of multicast addresses in the msg */ + +typedef struct vnet_mcast_msg { + /* Common tag */ + vio_msg_tag_t tag; + + /* multicast address information */ + uint8_t set; /* add if set to 1, else remove */ + uint8_t count; /* number of addrs in the msg */ + struct ether_addr mca[VNET_NUM_MCAST]; /* mcast addrs */ + uint32_t resv1; /* padding */ +} vnet_mcast_msg_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VNET_MAILBOX_H */ diff --git a/usr/src/uts/sun4v/sys/vnet_proxy.h b/usr/src/uts/sun4v/sys/vnet_proxy.h new file mode 100644 index 0000000000..7d3872fb52 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnet_proxy.h @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VNET_PROXY_H +#define _VNET_PROXY_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * vnet proxy client is a low level driver which provides link specific + * functionality required by the vnet device. The vnet leaf driver and vnet + * proxy implement generic mac functionality required by the mac module as + * part of NEMO network stack. A vnet proxy provides these entry points + * as defined below in the vnet_proxy_ops structure. Note that some of the + * entry points may not be implemented by certain modules and will be + * initialized to NULL. All entry points return 0 for success and non zero + * for failure. + */ + +typedef uint64_t vp_handle_t; /* vnet proxy handle */ + +typedef struct vnet_proxy_ops { + +/* + * vp_start() enables the client to send and receive data and generate + * interrupts. In addition a client may register opaque objects to be + * passed during transmit. This is done by a client which provides links + * to specific destination mac addresses by calling vnet_add_fdb(). + * (described below: Functions exported by vnet). + * vp_stop() disables the client from generating interrupts and IO. + * The client will also unregister any opaque objects using vnet_del_fdb(). + */ + int (*vp_start)(vp_handle_t vp_handle); + int (*vp_stop)(vp_handle_t vp_handle); + +/* + * vp_tx() is invoked to transmit a packet. The first argument points + * to the client specific opaque object. 
+ * The vp_tx routine must return 0 if unable to send the packet (eg, due to + * lack of resources). + */ + int (*vp_tx)(void *arg, mblk_t *mp); + +/* + * vp_resources() is called to enable the client register its receive + * resources. + */ + int (*vp_resources)(vp_handle_t vp_handle); + +/* + * vp_multicast() is used to add/remove addresses to and from the set of + * multicast addresses for which the client will receive packets. + * If the second argument is B_TRUE then the address pointed to by the + * third argument should be added to the set. If the second argument is + * B_FALSE then the address pointed to by the third argument should be + * removed. + */ + int (*vp_multicast)(vp_handle_t vp_handle, boolean_t add, + const uint8_t *mca); + +/* + * vp_promisc() is used to set the promiscuity of the client. + * If the second argument is B_TRUE then the client should receive all + * packets. If it is set to B_FALSE then only packets destined for the + * vnet device's unicast address and broadcast address should be received. 
+ */ + int (*vp_promisc)(vp_handle_t vp_handle, boolean_t on); + +/* vp_unicast() is used to set a new unicast address for the vnet device */ + int (*vp_unicast)(vp_handle_t vp_handle, const uint8_t *mca); + +/* TBD: vp_statistics */ + uint64_t (*vp_statistics)(vp_handle_t vp_handle, enum mac_stat); + +/* TBD: vp_ctl is used to to support client specific control commands */ + int (*vp_ctl)(vp_handle_t vp_handle, mblk_t *mp); + +} vnet_proxy_ops_t; + +/* vnet_proxy entry point types */ + +typedef int (*vp_start_t)(vp_handle_t); +typedef int (*vp_stop_t)(vp_handle_t); +typedef int (*vp_tx_t)(void *, mblk_t *); +typedef int (*vp_resources_t)(vp_handle_t); +typedef int (*vp_multicast_t)(vp_handle_t, boolean_t, + const uint8_t *); +typedef int (*vp_promisc_t)(vp_handle_t, boolean_t); +typedef int (*vp_unicast_t)(vp_handle_t, const uint8_t *); +typedef uint64_t (*vp_statistics_t)(vp_handle_t, enum mac_stat); +typedef int (*vp_ctl_t)(vp_handle_t, mblk_t *); + +/* + * The client calls this function to add/remove an entry into vnet's FBD. + */ +void vnet_add_fdb(void *arg, uint8_t *macaddr, vp_tx_t vp_tx, void *txarg); +void vnet_del_fdb(void *arg, uint8_t *macaddr); +void vnet_modify_fdb(void *arg, uint8_t *macaddr, vp_tx_t vp_tx, void *txarg); +void vnet_add_def_rte(void *arg, vp_tx_t vp_tx, void *txarg); +void vnet_del_def_rte(void *arg); + +#ifdef __cplusplus +} +#endif + +#endif /* _VNET_PROXY_H */ diff --git a/usr/src/uts/sun4v/sys/vnetmsg.h b/usr/src/uts/sun4v/sys/vnetmsg.h new file mode 100644 index 0000000000..79ed1d4336 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vnetmsg.h @@ -0,0 +1,81 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _VNETMSG_H +#define _VNETMSG_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LM_SIGNATURE 0x564E45544C4D5347 /* "VNETLMSG" */ + +/* lm_type (below) */ +#define LM_DATA 0x1 +#define LM_ACK 0x2 + +/* + * msg protocol used for ldc_mem IO. currently, 2 cookies are supported. + * (In Unreliable mode LDC-maxpayload is 56 bytes). + */ + +typedef struct vnet_ldc_msg { + uint64_t lm_signature; /* signature: "VNETLMSG" */ + uint8_t lm_type; /* data or ack */ + uint8_t lm_ncookies; /* # of cookies in the msg */ + uint16_t lm_id; /* opaque id (sender) */ + uint16_t lm_dlen; /* actual data length */ + uint16_t lm_resv; /* reserved */ + ldc_mem_cookie_t lm_cookie[2]; /* array of cookies */ +} vnet_ldc_msg_t; + +/* + * XXX Co-ordinate these def's with Harsha, expect that these will + * come from vnet header file. 
+ */ +#define MAX_COOKIES ((ETHERMTU >> MMU_PAGESHIFT) + 2) + +#define VNET_PUB_DESC_FREE 0x0 +#define VNET_PUB_DESC_READY 0x1 +#define VNET_PUB_DESC_DONE 0x2 +#define VNET_PUB_DESC_ACK 0x4 + +#define VNET_PRIV_DESC_FREE 0x0 +#define VNET_PRIV_DESC_BUSY 0x1 + +typedef struct vnet_public_desc { + uint64_t flags; + uint64_t ncookies; + ldc_mem_cookie_t memcookie[MAX_COOKIES]; +} vnet_public_desc_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VNETMSG_H */ diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h new file mode 100644 index 0000000000..d284db9dc6 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vsw.h @@ -0,0 +1,455 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This header file contains the basic data structures which the + * virtual switch (vsw) uses to communicate with its clients and + * the outside world. + * + * The virtual switch reads the machine description (MD) to + * determine how many port_t structures to create (each port_t + * can support communications to a single network device). The + * port_t's are maintained in a linked list. 
+ * + * Each port in turn contains a number of logical domain channels + * (ldc's) which are inter domain communications channels which + * are used for passing small messages between the domains. Their + * may be an unlimited number of channels associated with each port, + * though most devices only use a single channel. + * + * The ldc is a bi-directional channel, which is divided up into + * two directional 'lanes', one outbound from the switch to the + * virtual network device, the other inbound to the switch. + * Depending on the type of device each lane may have seperate + * communication paramaters (such as mtu etc). + * + * For those network clients which use descriptor rings the + * rings are associated with the appropriate lane. I.e. rings + * which the switch exports are associated with the outbound lanes + * while those which the network clients are exporting to the switch + * are associated with the inbound lane. + * + * In diagram form the data structures look as follows: + * + * vsw instance + * | + * +----->port_t----->port_t----->port_t-----> + * | + * +--->ldc_t--->ldc_t--->ldc_t---> + * | + * +--->lane_t (inbound) + * | | + * | +--->dring--->dring---> + * | + * +--->lane_t (outbound) + * | + * +--->dring--->dring---> + * + */ + +#ifndef _VSW_H +#define _VSW_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/vio_mailbox.h> +#include <sys/vnet_common.h> +#include <sys/ethernet.h> + +/* + * Default message type. + */ +typedef struct def_msg { + uint64_t data[8]; +} def_msg_t; + +/* + * Currently only support one major/minor pair. + */ +#define VSW_NUM_VER 1 + +typedef struct ver_sup { + uint32_t ver_major:16, + ver_minor:16; +} ver_sup_t; + +/* + * Only support ETHER mtu at moment. + */ +#define VSW_MTU ETHERMAX + +/* + * Lane states. 
+ */ +#define VSW_LANE_INACTIV 0x0 /* No params set for lane */ + +#define VSW_VER_INFO_SENT 0x1 /* Version # sent to peer */ +#define VSW_VER_INFO_RECV 0x2 /* Version # recv from peer */ +#define VSW_VER_ACK_RECV 0x4 +#define VSW_VER_ACK_SENT 0x8 +#define VSW_VER_NACK_RECV 0x10 +#define VSW_VER_NACK_SENT 0x20 + +#define VSW_ATTR_INFO_SENT 0x40 /* Attributes sent to peer */ +#define VSW_ATTR_INFO_RECV 0x80 /* Peer attributes received */ +#define VSW_ATTR_ACK_SENT 0x100 +#define VSW_ATTR_ACK_RECV 0x200 +#define VSW_ATTR_NACK_SENT 0x400 +#define VSW_ATTR_NACK_RECV 0x800 + +#define VSW_DRING_INFO_SENT 0x1000 /* Dring info sent to peer */ +#define VSW_DRING_INFO_RECV 0x2000 /* Dring info received */ +#define VSW_DRING_ACK_SENT 0x4000 +#define VSW_DRING_ACK_RECV 0x8000 +#define VSW_DRING_NACK_SENT 0x10000 +#define VSW_DRING_NACK_RECV 0x20000 + +#define VSW_RDX_INFO_SENT 0x40000 /* RDX sent to peer */ +#define VSW_RDX_INFO_RECV 0x80000 /* RDX received from peer */ +#define VSW_RDX_ACK_SENT 0x100000 +#define VSW_RDX_ACK_RECV 0x200000 +#define VSW_RDX_NACK_SENT 0x400000 +#define VSW_RDX_NACK_RECV 0x800000 + +#define VSW_MCST_INFO_SENT 0x1000000 +#define VSW_MCST_INFO_RECV 0x2000000 +#define VSW_MCST_ACK_SENT 0x4000000 +#define VSW_MCST_ACK_RECV 0x8000000 +#define VSW_MCST_NACK_SENT 0x10000000 +#define VSW_MCST_NACK_RECV 0x20000000 + +#define VSW_LANE_ACTIVE 0x40000000 /* Lane open to xmit data */ + +/* Handshake milestones */ +#define VSW_MILESTONE0 0x1 /* ver info exchanged */ +#define VSW_MILESTONE1 0x2 /* attribute exchanged */ +#define VSW_MILESTONE2 0x4 /* dring info exchanged */ +#define VSW_MILESTONE3 0x8 /* rdx exchanged */ +#define VSW_MILESTONE4 0x10 /* handshake complete */ + +/* + * Lane direction (relative to ourselves). 
+ */ +#define INBOUND 0x1 +#define OUTBOUND 0x2 + +/* Peer session id received */ +#define VSW_PEER_SESSION 0x1 + +/* + * Maximum number of consecutive reads of data from channel + */ +#define VSW_MAX_CHAN_READ 50 + +/* + * LDC queue length + */ +#define VSW_LDC_QLEN 1024 + +/* + * Currently only support one ldc per port. + */ +#define VSW_PORT_MAX_LDCS 1 /* max # of ldcs per port */ + +/* + * Used for port add/deletion. + */ +#define VSW_PORT_UPDATED 0x1 + +#define LDC_TX_SUCCESS 0 /* ldc transmit success */ +#define LDC_TX_FAILURE 1 /* ldc transmit failure */ +#define LDC_TX_NORESOURCES 2 /* out of descriptors */ + +/* ID of the source of a frame being switched */ +#define VSW_PHYSDEV 1 /* physical device associated */ +#define VSW_VNETPORT 2 /* port connected to vnet (over ldc) */ +#define VSW_LOCALDEV 4 /* vsw configured as an eth interface */ + +/* + * Descriptor ring info + * + * Each descriptor element has a pre-allocated data buffer + * associated with it, into which data being transmitted is + * copied. By pre-allocating we speed up the copying process. + * The buffer is re-used once the peer has indicated that it is + * finished with the descriptor. 
+ */ +#define VSW_RING_NUM_EL 512 /* Num of entries in ring */ +#define VSW_RING_EL_DATA_SZ 2048 /* Size of data section (bytes) */ +#define VSW_PRIV_SIZE sizeof (vnet_private_desc_t) +#define VSW_PUB_SIZE sizeof (vnet_public_desc_t) + +#define VSW_MAX_COOKIES ((ETHERMTU >> MMU_PAGESHIFT) + 2) + +/* + * Private descriptor + */ +typedef struct vsw_private_desc { + uint64_t dstate; + vnet_public_desc_t *descp; + ldc_mem_handle_t memhandle; + void *datap; + uint64_t datalen; + uint64_t ncookies; + ldc_mem_cookie_t memcookie[VSW_MAX_COOKIES]; + int bound; +} vsw_private_desc_t; + +/* + * Descriptor ring structure + */ +typedef struct dring_info { + struct dring_info *next; /* next ring in chain */ + kmutex_t dlock; + uint32_t num_descriptors; + uint32_t descriptor_size; + uint32_t options; + uint32_t ncookies; + ldc_mem_cookie_t cookie[1]; + + ldc_dring_handle_t handle; + uint64_t ident; /* identifier sent to peer */ + uint64_t end_idx; /* last idx processed */ + + /* + * base address of private and public portions of the + * ring (where appropriate), and data block. + */ + void *pub_addr; /* base of public section */ + void *priv_addr; /* base of private section */ + void *data_addr; /* base of data section */ + size_t data_sz; /* size of data section */ +} dring_info_t; + +/* + * Each ldc connection is comprised of two lanes, incoming + * from a peer, and outgoing to that peer. Each lane shares + * common ldc parameters and also has private lane-specific + * parameters. 
+ */ +typedef struct lane { + uint64_t lstate; /* Lane state */ + uint32_t ver_major:16, /* Version major number */ + ver_minor:16; /* Version minor number */ + uint64_t seq_num; /* Sequence number */ + uint64_t mtu; /* ETHERMTU */ + uint64_t addr; /* Unique physical address */ + uint8_t addr_type; /* Only MAC address at moment */ + uint8_t xfer_mode; /* Dring or Pkt based */ + uint8_t ack_freq; /* Only non zero for Pkt based xfer */ + dring_info_t *dringp; /* List of drings for this lane */ +} lane_t; + +/* channel drain states */ +#define VSW_LDC_INIT 0x1 /* Initial non-drain state */ +#define VSW_LDC_DRAINING 0x2 /* Channel draining */ + +/* ldc information associated with a vsw-port */ +typedef struct vsw_ldc { + struct vsw_ldc *ldc_next; /* next ldc in the list */ + struct vsw_port *ldc_port; /* associated port */ + struct vsw *ldc_vswp; /* associated vsw */ + kmutex_t ldc_cblock; /* sync callback processing */ + kmutex_t ldc_txlock; /* sync transmits */ + uint64_t ldc_id; /* channel number */ + ldc_handle_t ldc_handle; /* channel handle */ + kmutex_t drain_cv_lock; + kcondvar_t drain_cv; /* channel draining */ + int drain_state; + uint32_t hphase; /* handshake phase */ + int hcnt; /* # handshake attempts */ + ldc_status_t ldc_status; /* channel status */ + uint64_t local_session; /* Our session id */ + uint64_t peer_session; /* Our peers session id */ + uint8_t session_status; /* Session recv'd, sent */ + kmutex_t hss_lock; + uint32_t hss_id; /* Handshake session id */ + uint64_t next_ident; /* Next dring ident # to use */ + lane_t lane_in; /* Inbound lane */ + lane_t lane_out; /* Outbound lane */ + uint8_t dev_class; /* Peer device class */ +} vsw_ldc_t; + +/* list of ldcs per port */ +typedef struct vsw_ldc_list { + vsw_ldc_t *head; /* head of the list */ + krwlock_t lockrw; /* sync access(rw) to the list */ + int num_ldcs; /* number of ldcs in the list */ +} vsw_ldc_list_t; + +/* multicast addresses port is interested in */ +typedef struct mcst_addr { + 
struct mcst_addr *nextp; + uint64_t addr; +} mcst_addr_t; + +/* Port detach states */ +#define VSW_PORT_INIT 0x1 /* Initial non-detach state */ +#define VSW_PORT_DETACHING 0x2 /* In process of being detached */ +#define VSW_PORT_DETACHABLE 0x4 /* Safe to detach */ + +/* port information associated with a vsw */ +typedef struct vsw_port { + int p_instance; /* port instance */ + struct vsw_port *p_next; /* next port in the list */ + struct vsw *p_vswp; /* associated vsw */ + vsw_ldc_list_t p_ldclist; /* list of ldcs for this port */ + + kmutex_t tx_lock; /* transmit lock */ + int (*transmit)(vsw_ldc_t *, mblk_t *); + + int state; /* port state */ + kmutex_t state_lock; + kcondvar_t state_cv; + + int ref_cnt; /* # of active references */ + kmutex_t ref_lock; + kcondvar_t ref_cv; + + kmutex_t mca_lock; /* multicast lock */ + mcst_addr_t *mcap; /* list of multicast addrs */ + + /* + * mac address of the port & connected device + */ + struct ether_addr p_macaddr; +} vsw_port_t; + +/* list of ports per vsw */ +typedef struct vsw_port_list { + vsw_port_t *head; /* head of the list */ + krwlock_t lockrw; /* sync access(rw) to the list */ + int num_ports; /* number of ports in the list */ +} vsw_port_list_t; + +/* + * Taskq control message + */ +typedef struct vsw_ctrl_task { + vsw_ldc_t *ldcp; + def_msg_t pktp; + uint32_t hss_id; +} vsw_ctrl_task_t; + +/* + * Number of hash chains in the multicast forwarding database. + */ +#define VSW_NCHAINS 8 + +/* + * State of interface if switch plumbed as network device. + */ +#define VSW_IF_UP 0x1 /* Interface UP */ +#define VSW_IF_PROMISC 0x2 /* Interface in promiscious mode */ + +#define VSW_U_P(state) \ + (state == (VSW_IF_UP | VSW_IF_PROMISC)) + +/* + * Switching modes. 
+ */ +#define VSW_LAYER2 0x1 /* Layer 2 - MAC switching */ +#define VSW_LAYER2_PROMISC 0x2 /* Layer 2 + promisc mode */ +#define VSW_LAYER3 0x4 /* Layer 3 - IP switching */ + +#define NUM_SMODES 3 /* number of switching modes */ + +/* + * Bits indicating which properties we've read from MD. + */ +#define VSW_MD_PHYSNAME 0x1 +#define VSW_MD_MACADDR 0x2 +#define VSW_MD_SMODE 0x4 + +/* + * vsw instance state information. + */ +typedef struct vsw { + int instance; /* instance # */ + dev_info_t *dip; /* associated dev_info */ + struct vsw *next; /* next in list */ + char physname[LIFNAMSIZ]; /* phys-dev */ + uint8_t smode[NUM_SMODES]; /* switching mode */ + int smode_idx; /* curr pos in smode array */ + uint8_t mdprops; /* bitmask of props found */ + vsw_port_list_t plist; /* associated ports */ + ddi_taskq_t *taskq_p; /* VIO ctrl msg taskq */ + mod_hash_t *fdb; /* forwarding database */ + + mod_hash_t *mfdb; /* multicast FDB */ + krwlock_t mfdbrw; /* rwlock for mFDB */ + + /* mac layer */ + mac_handle_t mh; + mac_rx_handle_t mrh; + mac_notify_handle_t mnh; + const mac_txinfo_t *txinfo; /* MAC tx routine */ + + /* Initial promisc setting of interface */ + boolean_t init_promisc; + + /* Machine Description updates */ + mdeg_node_spec_t *inst_spec; + mdeg_handle_t mdeg_hdl; + + /* if configured as an ethernet interface */ + mac_t *if_macp; /* MAC structure */ + mac_resource_handle_t if_mrh; + struct ether_addr if_addr; /* interface address */ + krwlock_t if_lockrw; + uint8_t if_state; /* interface state */ + + /* multicast addresses when configured as eth interface */ + kmutex_t mca_lock; /* multicast lock */ + mcst_addr_t *mcap; /* list of multicast addrs */ +} vsw_t; + + +/* + * Ethernet broadcast address definition. 
+ */
+static struct ether_addr etherbroadcastaddr = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+#define IS_BROADCAST(ehp) \
+ (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+#define IS_MULTICAST(ehp) \
+ ((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
+
+#define READ_ENTER(x) rw_enter(x, RW_READER)
+#define WRITE_ENTER(x) rw_enter(x, RW_WRITER)
+#define RW_EXIT(x) rw_exit(x)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VSW_H */ diff --git a/usr/src/uts/sun4v/sys/vsw_fdb.h b/usr/src/uts/sun4v/sys/vsw_fdb.h new file mode 100644 index 0000000000..7f155cc6f7 --- /dev/null +++ b/usr/src/uts/sun4v/sys/vsw_fdb.h @@ -0,0 +1,64 @@ +/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VSW_FDB_H
+#define _VSW_FDB_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Convert ethernet (mac) address to hash table key. 
+ */ +#define KEY_HASH(key, addr) \ + (key = ((((uint64_t)addr.ether_addr_octet[0]) << 40) | \ + (((uint64_t)addr.ether_addr_octet[1]) << 32) | \ + (((uint64_t)addr.ether_addr_octet[2]) << 24) | \ + (((uint64_t)addr.ether_addr_octet[3]) << 16) | \ + (((uint64_t)addr.ether_addr_octet[4]) << 8) | \ + ((uint64_t)addr.ether_addr_octet[5]))); + +/* + * Multicast forwarding database (mFDB) is a hashtable + * keyed off the mac address, with the value being a linked + * list of mfdb_ent_t structures, each of which is a destination + * (either a vsw_port or the vsw instance itself when plumbed as + * a network device) to which the multicast pkt should be forwarded. + */ +typedef struct mfdb_ent { + struct mfdb_ent *nextp; /* next entry in list */ + void *d_addr; /* address of dest */ + uint8_t d_type; /* destination type */ +} mfdb_ent_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _VSW_FDB_H */ diff --git a/usr/src/uts/sun4v/vcc/Makefile b/usr/src/uts/sun4v/vcc/Makefile new file mode 100644 index 0000000000..96f860045c --- /dev/null +++ b/usr/src/uts/sun4v/vcc/Makefile @@ -0,0 +1,106 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# uts/sun4v/vcc/Makefile +# +# Copyright 2006 Sun Microsystems, Inc. 
All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+# This makefile drives the production of the vcc driver kernel module.
+#
+# sun4v implementation architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = vcc
+OBJECTS = $(VCC_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(VCC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+# Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR = $(OBJS_DIR)
+
+CLEANFILES += $(MODSTUBS_O)
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# compiler fails with not reached statements
+#
+CERRWARN += -erroff=E_STATEMENT_NOT_REACHED
+
+#
+# module dependencies
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/vdc/Makefile b/usr/src/uts/sun4v/vdc/Makefile new file mode 100644 index 0000000000..b20f06c368 --- /dev/null +++ b/usr/src/uts/sun4v/vdc/Makefile @@ -0,0 +1,108 @@ +#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# uts/sun4v/vdc/Makefile +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the vdc driver module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vdc +OBJECTS = $(VDC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VDC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) +CFLAGS += -errwarn=%all + +LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc + +# +# Re-enable C99 compilation to use stack allocation of variable-sized arrays. +# According to usr/src/uts/Makefile.uts, C99 is disabled until a problem seen +# on x86 machines can be fully diagnosed; presumably a sun4v (i.e., SPARC) +# module should be "safe". Furthermore, only the variable-sized array +# extension is needed/used. +# +C99MODE = $(C99_ENABLE) + +# +# Default build targets. 
+# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/vds/Makefile b/usr/src/uts/sun4v/vds/Makefile new file mode 100644 index 0000000000..397ebc4309 --- /dev/null +++ b/usr/src/uts/sun4v/vds/Makefile @@ -0,0 +1,106 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# uts/sun4v/vds/Makefile +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of the vds driver module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vds +OBJECTS = $(VDS_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VDS_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. 
+# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# module dependencies +# +LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc + +# +# Manually turn on C99 compilation until the sync with snv_38 re-enables it +# +C99MODE = $(C99_ENABLE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/vldc/Makefile b/usr/src/uts/sun4v/vldc/Makefile new file mode 100644 index 0000000000..c36cf42690 --- /dev/null +++ b/usr/src/uts/sun4v/vldc/Makefile @@ -0,0 +1,101 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" +# +# uts/sun4v/vldc/Makefile +# +# This makefile drives the production of the vldc driver module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vldc +OBJECTS = $(VLDC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VLDC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) + +# +# module dependencies +# +LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/vnet/Makefile b/usr/src/uts/sun4v/vnet/Makefile new file mode 100644 index 0000000000..2eed19f4bc --- /dev/null +++ b/usr/src/uts/sun4v/vnet/Makefile @@ -0,0 +1,105 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# uts/sun4v/vnet/Makefile +# +# This makefile drives the production of the vnet driver module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vnet +OBJECTS = $(VNET_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VNET_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. +# +MODSTUBS_DIR = $(OBJS_DIR) + +CLEANFILES += $(MODSTUBS_O) + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement +# +CFLAGS += $(CCVERBOSE) +CFLAGS += -DVGEN_HANDLE_LOST_PKTS +#CFLAGS += -DVGEN_USE_MAC_TX_UPDATE +#CFLAGS += -DVGEN_REXMIT + + +# +# Driver depends on MAC & IP +# +LDFLAGS += -dy -N misc/mac -N drv/ip -N misc/ldc -N misc/platsvc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/$(PLATFORM)/Makefile.targ diff --git a/usr/src/uts/sun4v/vsw/Makefile b/usr/src/uts/sun4v/vsw/Makefile new file mode 100644 index 0000000000..88fdda49e6 --- /dev/null +++ b/usr/src/uts/sun4v/vsw/Makefile @@ -0,0 +1,112 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# uts/sun4v/vsw/Makefile +# +# This makefile drives the production of the vsw driver module. +# +# sun4v implementation architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = vsw +OBJECTS = $(VSW_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(VSW_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_PSM_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sun4v/Makefile.sun4v + +# +# Override defaults to build a unique, local modstubs.o. 
+#
+MODSTUBS_DIR = $(OBJS_DIR)
+
+CLEANFILES += $(MODSTUBS_O)
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Module dependencies
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/mac -Nmisc/platsvc
+
+#
+# Re-enable C99 compilation to use stack allocation of variable-sized arrays.
+# According to usr/src/uts/Makefile.uts, C99 is disabled until a problem seen
+# on x86 machines can be fully diagnosed; presumably a sun4v (i.e., SPARC)
+# module should be "safe". Furthermore, only the variable-sized array
+# extension is needed/used.
+#
+# C99 mode also gives us macros such as __func__
+#
+C99MODE = $(C99_ENABLE)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ |