Diffstat (limited to 'usr/src/lib/libvmm/libvmm.c')
 -rw-r--r--   usr/src/lib/libvmm/libvmm.c | 860 ++++++++++++++++++++++++++++++
 1 file changed, 860 insertions, 0 deletions
diff --git a/usr/src/lib/libvmm/libvmm.c b/usr/src/lib/libvmm/libvmm.c
new file mode 100644
index 0000000000..dc552a8de0
--- /dev/null
+++ b/usr/src/lib/libvmm/libvmm.c
@@ -0,0 +1,860 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+/*
+ * Library for native code to access bhyve VMs, without the need to use
+ * FreeBSD compat headers
+ */
+
+#include <sys/param.h>
+#include <sys/list.h>
+#include <sys/stddef.h>
+#include <sys/mman.h>
+#include <sys/kdi_regs.h>
+#include <sys/sysmacros.h>
+#include <sys/controlregs.h>
+#include <sys/note.h>
+#include <sys/debug.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <libvmm.h>
+
+typedef struct vmm_memseg vmm_memseg_t;
+
+#define	VMM_MEMSEG_DEVMEM	0x1
+
+struct vmm_memseg {
+	list_node_t vms_list;
+	int vms_segid;
+	int vms_prot;
+	int vms_flags;
+	uintptr_t vms_gpa;
+	off_t vms_segoff;
+	size_t vms_seglen;
+	size_t vms_maplen;
+	char vms_name[64];
+};
+
+struct vmm {
+	struct vmctx *vmm_ctx;
+	list_t vmm_memlist;
+	char *vmm_mem;
+	size_t vmm_memsize;
+	size_t vmm_ncpu;
+};
+
+
+/*
+ * This code relies on two assumptions:
+ * - CPUs are never removed from the "active set", not even when suspended.
+ *   A CPU being active just means that it has been used by the guest OS.
+ * - The CPU numbering is consecutive.
+ */
+static void
+vmm_update_ncpu(vmm_t *vmm)
+{
+	cpuset_t cpuset;
+
+	assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);
+
+	for (vmm->vmm_ncpu = 0;
+	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
+	    vmm->vmm_ncpu++)
+		;
+}
+
+vmm_t *
+vmm_open_vm(const char *name)
+{
+	vmm_t *vmm = NULL;
+
+	vmm = malloc(sizeof (vmm_t));
+	if (vmm == NULL)
+		return (NULL);
+
+	bzero(vmm, sizeof (vmm_t));
+	vmm->vmm_mem = MAP_FAILED;
+
+	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
+	    offsetof(vmm_memseg_t, vms_list));
+
+	vmm->vmm_ctx = vm_open(name);
+	if (vmm->vmm_ctx == NULL) {
+		free(vmm);
+		return (NULL);
+	}
+
+	vmm_update_ncpu(vmm);
+
+	/*
+	 * If we open a VM that has just been created we may see a state
+	 * where it has no CPUs configured yet. We'll just wait for 10ms
+	 * and retry until we get a non-zero CPU count.
+	 */
+	if (vmm->vmm_ncpu == 0) {
+		do {
+			(void) usleep(10000);
+			vmm_update_ncpu(vmm);
+		} while (vmm->vmm_ncpu == 0);
+	}
+
+	return (vmm);
+}
+
+void
+vmm_close_vm(vmm_t *vmm)
+{
+	vmm_unmap(vmm);
+
+	list_destroy(&vmm->vmm_memlist);
+
+	if (vmm->vmm_ctx != NULL)
+		vm_close(vmm->vmm_ctx);
+
+	free(vmm);
+}
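For a sense of how these entry points are used, here is a minimal consumer (an editorial sketch, not part of this change; the program and its argument handling are hypothetical, and it assumes the declarations in libvmm.h plus linking against libvmm):

#include <stdio.h>
#include <libvmm.h>

int
main(int argc, char *argv[])
{
	vmm_t *vmm;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s <vmname>\n", argv[0]);
		return (1);
	}

	/*
	 * vmm_open_vm() retries internally until a freshly created VM
	 * reports a non-zero CPU count, so no polling is needed here.
	 */
	if ((vmm = vmm_open_vm(argv[1])) == NULL) {
		perror("vmm_open_vm");
		return (1);
	}

	(void) printf("%s: %zu vCPU(s)\n", argv[1], vmm_ncpu(vmm));

	vmm_close_vm(vmm);
	return (0);
}

Note that vmm_close_vm() also tears down any mapping established by vmm_map(), so no separate cleanup call is needed.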
+
+static vmm_memseg_t *
+vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
+{
+	vmm_memseg_t ms, *ret;
+	int error, flags;
+
+	bzero(&ms, sizeof (vmm_memseg_t));
+	ms.vms_gpa = gpa;
+	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
+	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
+	if (error)
+		return (NULL);
+
+	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
+	    ms.vms_name, sizeof (ms.vms_name));
+	if (error)
+		return (NULL);
+
+	/*
+	 * Regular memory segments don't have a name, but devmem segments do.
+	 * We can use that information to set the DEVMEM flag if necessary.
+	 */
+	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;
+
+	ret = malloc(sizeof (vmm_memseg_t));
+	if (ret == NULL)
+		return (NULL);
+
+	*ret = ms;
+
+	return (ret);
+}
+
+int
+vmm_map(vmm_t *vmm, boolean_t writable)
+{
+	uintptr_t last_gpa = 0;
+	vmm_memseg_t *ms;
+	int prot_write = writable ? PROT_WRITE : 0;
+
+	if (vmm->vmm_mem != MAP_FAILED) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	assert(list_is_empty(&vmm->vmm_memlist));
+
+	for (;;) {
+		ms = vmm_get_memseg(vmm, last_gpa);
+
+		if (ms == NULL)
+			break;
+
+		last_gpa = ms->vms_gpa + ms->vms_maplen;
+		list_insert_tail(&vmm->vmm_memlist, ms);
+	}
+
+	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
+	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);
+
+	if (vmm->vmm_mem == MAP_FAILED)
+		goto fail;
+
+	for (ms = list_head(&vmm->vmm_memlist);
+	    ms != NULL;
+	    ms = list_next(&vmm->vmm_memlist, ms)) {
+		off_t mapoff = ms->vms_gpa;
+
+		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) &&
+		    vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid, &mapoff)
+		    != 0)
+			goto fail;
+
+		vmm->vmm_memsize += ms->vms_maplen;
+
+		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
+		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
+		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
+			goto fail;
+	}
+
+	return (0);
+
+fail:
+	vmm_unmap(vmm);
+
+	return (-1);
+}
+
+void
+vmm_unmap(vmm_t *vmm)
+{
+	while (!list_is_empty(&vmm->vmm_memlist)) {
+		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);
+
+		if (vmm->vmm_mem != MAP_FAILED) {
+			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
+			    ms->vms_maplen);
+		}
+
+		free(ms);
+	}
+
+	if (vmm->vmm_mem != MAP_FAILED)
+		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);
+
+	vmm->vmm_mem = MAP_FAILED;
+	vmm->vmm_memsize = 0;
+}
+
+ssize_t
+vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
+{
+	ssize_t count = 0;
+	vmm_memseg_t *ms;
+	ssize_t res = len;
+
+	for (ms = list_head(&vmm->vmm_memlist);
+	    ms != NULL && len != 0;
+	    ms = list_next(&vmm->vmm_memlist, ms)) {
+
+		if (addr >= ms->vms_gpa &&
+		    addr < ms->vms_gpa + ms->vms_maplen) {
+			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
+
+			if (res < 0)
+				res = 0;
+
+			bcopy(vmm->vmm_mem + addr, buf, len - res);
+			count += len - res;
+			addr += len - res;
+			len = res;
+		}
+	}
+
+	if (res)
+		errno = EFAULT;
+	else
+		errno = 0;
+
+	return (count);
+}
+
+ssize_t
+vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
+{
+	ssize_t count = 0;
+	vmm_memseg_t *ms;
+	ssize_t res = len;
+
+	for (ms = list_head(&vmm->vmm_memlist);
+	    ms != NULL;
+	    ms = list_next(&vmm->vmm_memlist, ms)) {
+		if (addr >= ms->vms_gpa &&
+		    addr < ms->vms_gpa + ms->vms_maplen) {
+			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
+
+			if (res < 0)
+				res = 0;
+
+			bcopy(buf, vmm->vmm_mem + addr, len - res);
+			count += len - res;
+			addr += len - res;
+			len = res;
+		}
+	}
+
+	if (res)
+		errno = EFAULT;
+	else
+		errno = 0;
+
+	return (count);
+}
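The guest-physical accessors above deliberately return a short count rather than failing outright; a caller that wants all-or-nothing semantics has to check the count itself. A small wrapper illustrating that contract (editorial sketch; the helper name is hypothetical, and a prior successful vmm_map() is assumed):

#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <libvmm.h>

/*
 * Read exactly `len` bytes of guest-physical memory at `gpa`.
 * vmm_pread() copies what it can, returns the byte count, and sets
 * errno to EFAULT if any part of the range fell outside a mapped
 * segment.
 */
static int
read_gpa_exact(vmm_t *vmm, uintptr_t gpa, void *buf, size_t len)
{
	if (vmm_pread(vmm, buf, len, gpa) != (ssize_t)len) {
		(void) fprintf(stderr, "read of %zu bytes at %#lx: %s\n",
		    len, (unsigned long)gpa, strerror(errno));
		return (-1);
	}

	return (0);
}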
+
+size_t
+vmm_ncpu(vmm_t *vmm)
+{
+	return (vmm->vmm_ncpu);
+}
+
+size_t
+vmm_memsize(vmm_t *vmm)
+{
+	return (vmm->vmm_memsize);
+}
+
+int
+vmm_cont(vmm_t *vmm)
+{
+	return (vm_resume_cpu(vmm->vmm_ctx, -1));
+}
+
+int
+vmm_step(vmm_t *vmm, int vcpu)
+{
+	cpuset_t cpuset;
+	int ret;
+
+	if (vcpu >= vmm->vmm_ncpu) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
+	if (ret != 0)
+		return (-1);
+
+	assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0);
+
+	do {
+		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
+	} while (!CPU_ISSET(vcpu, &cpuset));
+
+	(void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);
+
+	return (ret);
+}
+
+int
+vmm_stop(vmm_t *vmm)
+{
+	int ret = vm_suspend_cpu(vmm->vmm_ctx, -1);
+
+	if (ret == 0)
+		vmm_update_ncpu(vmm);
+
+	return (ret);
+}
+
+/*
+ * Mapping of KDI-defined registers to vmmapi-defined registers.
+ * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
+ * causes an error in vm_{get,set}_register_set().
+ *
+ * This array must be kept in sync with the definitions in kdi_regs.h.
+ */
+static int vmm_kdi_regmap[] = {
+	VM_REG_LAST,		/* KDIREG_SAVFP */
+	VM_REG_LAST,		/* KDIREG_SAVPC */
+	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
+	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
+	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
+	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
+	VM_REG_GUEST_R8,	/* KDIREG_R8 */
+	VM_REG_GUEST_R9,	/* KDIREG_R9 */
+	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
+	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
+	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
+	VM_REG_GUEST_R10,	/* KDIREG_R10 */
+	VM_REG_GUEST_R11,	/* KDIREG_R11 */
+	VM_REG_GUEST_R12,	/* KDIREG_R12 */
+	VM_REG_GUEST_R13,	/* KDIREG_R13 */
+	VM_REG_GUEST_R14,	/* KDIREG_R14 */
+	VM_REG_GUEST_R15,	/* KDIREG_R15 */
+	VM_REG_LAST,		/* KDIREG_FSBASE */
+	VM_REG_LAST,		/* KDIREG_GSBASE */
+	VM_REG_LAST,		/* KDIREG_KGSBASE */
+	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
+	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
+	VM_REG_GUEST_DS,	/* KDIREG_DS */
+	VM_REG_GUEST_ES,	/* KDIREG_ES */
+	VM_REG_GUEST_FS,	/* KDIREG_FS */
+	VM_REG_GUEST_GS,	/* KDIREG_GS */
+	VM_REG_LAST,		/* KDIREG_TRAPNO */
+	VM_REG_LAST,		/* KDIREG_ERR */
+	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
+	VM_REG_GUEST_CS,	/* KDIREG_CS */
+	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
+	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
+	VM_REG_GUEST_SS		/* KDIREG_SS */
+};
+CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
+
+/*
+ * Mapping of libvmm-defined registers to vmmapi-defined registers.
+ *
+ * This array must be kept in sync with the definitions in libvmm.h
+ */
+static int vmm_sys_regmap[] = {
+	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
+	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
+	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
+	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
+	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
+	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
+	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
+	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
+	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
+	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
+	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
+	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
+	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
+	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
+	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
+	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
+};
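vmm_stop(), vmm_step() and the register maps combine into the usual debugger loop: freeze the guest, step one instruction, then inspect state via the KDI indices. A sketch of that loop (editorial; the helper name is hypothetical, KDIREG_RIP comes from <sys/kdi_regs.h>, and vmm_getreg() is defined further down in this file):

#include <stdio.h>
#include <stdint.h>
#include <sys/kdi_regs.h>
#include <libvmm.h>

/*
 * Freeze the guest, execute a single instruction on vCPU 0 and report
 * the resulting instruction pointer. vmm_step() arms VM_CAP_MTRAP_EXIT,
 * resumes the vCPU and waits for it to drop back into the debug set.
 */
static int
step_once(vmm_t *vmm)
{
	uint64_t rip;

	if (vmm_stop(vmm) != 0)
		return (-1);

	if (vmm_step(vmm, 0) != 0)
		return (-1);

	if (vmm_getreg(vmm, 0, KDIREG_RIP, &rip) != 0)
		return (-1);

	(void) printf("vCPU 0 stopped at %#lx\n", (unsigned long)rip);
	return (0);
}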
+
+/*
+ * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
+ *
+ * This array must be kept in sync with the definitions in libvmm.h
+ */
+static int vmm_descmap[] = {
+	VM_REG_GUEST_GDTR,
+	VM_REG_GUEST_LDTR,
+	VM_REG_GUEST_IDTR,
+	VM_REG_GUEST_TR,
+	VM_REG_GUEST_CS,
+	VM_REG_GUEST_DS,
+	VM_REG_GUEST_ES,
+	VM_REG_GUEST_FS,
+	VM_REG_GUEST_GS,
+	VM_REG_GUEST_SS
+};
+
+static int
+vmm_mapreg(int reg)
+{
+	errno = 0;
+
+	if (reg < 0)
+		goto fail;
+
+	if (reg < KDIREG_NGREG)
+		return (vmm_kdi_regmap[reg]);
+
+	if (reg >= VMM_REG_OFFSET &&
+	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
+		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
+
+fail:
+	errno = EINVAL;
+	return (VM_REG_LAST);
+}
+
+static int
+vmm_mapdesc(int desc)
+{
+	errno = 0;
+
+	if (desc >= VMM_DESC_OFFSET &&
+	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
+		return (vmm_descmap[desc - VMM_DESC_OFFSET]);
+
+	errno = EINVAL;
+	return (VM_REG_LAST);
+}
+
+int
+vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
+{
+	reg = vmm_mapreg(reg);
+
+	if (reg == VM_REG_LAST)
+		return (-1);
+
+	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
+}
+
+int
+vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
+{
+	reg = vmm_mapreg(reg);
+
+	if (reg == VM_REG_LAST)
+		return (-1);
+
+	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
+}
+
+int
+vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
+    uint64_t *regvals)
+{
+	int *vm_regnums;
+	int i;
+	int ret = -1;
+
+	vm_regnums = malloc(sizeof (int) * nregs);
+	if (vm_regnums == NULL)
+		return (ret);
+
+	for (i = 0; i != nregs; i++) {
+		vm_regnums[i] = vmm_mapreg(regnums[i]);
+		if (vm_regnums[i] == VM_REG_LAST)
+			goto fail;
+	}
+
+	ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
+	    regvals);
+
+fail:
+	free(vm_regnums);
+	return (ret);
+}
+
+int
+vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
+    uint64_t *regvals)
+{
+	int *vm_regnums;
+	int i;
+	int ret = -1;
+
+	vm_regnums = malloc(sizeof (int) * nregs);
+	if (vm_regnums == NULL)
+		return (ret);
+
+	for (i = 0; i != nregs; i++) {
+		vm_regnums[i] = vmm_mapreg(regnums[i]);
+		if (vm_regnums[i] == VM_REG_LAST)
+			goto fail;
+	}
+
+	ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
+	    regvals);
+
+fail:
+	free(vm_regnums);
+	return (ret);
+}
+
+int
+vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
+{
+	desc = vmm_mapdesc(desc);
+	if (desc == VM_REG_LAST)
+		return (-1);
+
+	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim,
+	    &vd->vd_acc));
+}
+
+int
+vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
+{
+	desc = vmm_mapdesc(desc);
+	if (desc == VM_REG_LAST)
+		return (-1);
+
+	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, vd->vd_lim,
+	    vd->vd_acc));
+}
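Since vmm_mapreg() accepts both KDI indices and the VMM_REG_* range, a single vmm_get_regset() call can mix the two index spaces. For example (editorial sketch, hypothetical helper name):

#include <stdio.h>
#include <stdint.h>
#include <sys/kdi_regs.h>
#include <libvmm.h>

/*
 * Fetch RIP, RSP and CR3 for one vCPU in a single call. KDI indices
 * and VMM_REG_* indices may be mixed freely; both are translated by
 * vmm_mapreg() before vm_get_register_set() is invoked.
 */
static int
show_cpu_state(vmm_t *vmm, int vcpu)
{
	static const int regnums[] = { KDIREG_RIP, KDIREG_RSP, VMM_REG_CR3 };
	uint64_t regvals[3];

	if (vmm_get_regset(vmm, vcpu, 3, regnums, regvals) != 0)
		return (-1);

	(void) printf("vcpu %d: rip=%#lx rsp=%#lx cr3=%#lx\n", vcpu,
	    (unsigned long)regvals[0], (unsigned long)regvals[1],
	    (unsigned long)regvals[2]);
	return (0);
}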
+
+/*
+ * Structure to hold MMU state during address translation.
+ * The contents of vmm_mmu_regnum[] must be kept in sync with this.
+ */
+typedef struct vmm_mmu {
+	uint64_t vm_cr0;
+	uint64_t vm_cr3;
+	uint64_t vm_cr4;
+	uint64_t vm_efer;
+} vmm_mmu_t;
+
+static const int vmm_mmu_regnum[] = {
+	VMM_REG_CR0,
+	VMM_REG_CR3,
+	VMM_REG_CR4,
+	VMM_REG_EFER
+};
+
+#define	X86_PTE_P		0x001ULL
+#define	X86_PTE_PS		0x080ULL
+
+#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
+#define	X86_PAGE_SHIFT		12
+#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)
+
+#define	X86_SEG_CODE_DATA	(1ULL << 4)
+#define	X86_SEG_PRESENT		(1ULL << 7)
+#define	X86_SEG_LONG		(1ULL << 13)
+#define	X86_SEG_BIG		(1ULL << 14)
+#define	X86_SEG_GRANULARITY	(1ULL << 15)
+#define	X86_SEG_UNUSABLE	(1ULL << 16)
+
+#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
+#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
+
+/*
+ * vmm_pte2paddr:
+ *
+ * Recursively calculate the physical address from a virtual address,
+ * starting at the given PTE level using the given PTE.
+ */
+static int
+vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
+    uint64_t vaddr, uint64_t *paddr)
+{
+	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
+	int off_bits = ia32 ? 10 : 9;
+	boolean_t hugepage = B_FALSE;
+	uint64_t offset;
+	uint64_t off_mask, off_shift;
+
+	if (level < 4 && (pte & X86_PTE_P) == 0) {
+		errno = EFAULT;
+		return (-1);
+	}
+
+	off_shift = X86_PAGE_SHIFT + off_bits * level;
+	off_mask = (1ULL << off_shift) - 1;
+
+	offset = vaddr & off_mask;
+
+	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
+		hugepage = B_TRUE;
+	} else {
+		if (level > 0) {
+			offset >>= off_shift - off_bits;
+			offset <<= X86_PAGE_SHIFT - off_bits;
+		}
+		off_mask = 0xfff;
+	}
+
+	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;
+
+	if (level == 0 || hugepage)
+		return (0);
+
+	pte = 0;
+	if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size)
+		return (-1);
+	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
+}
+
+static vmm_mode_t
+vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
+{
+	if ((mmu->vm_cr0 & CR0_PE) == 0)
+		return (VMM_MODE_REAL);
+	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
+		return (VMM_MODE_PROT);
+	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
+		return (VMM_MODE_PAE);
+	else
+		return (VMM_MODE_LONG);
+}
+
+vmm_mode_t
+vmm_vcpu_mode(vmm_t *vmm, int vcpu)
+{
+	vmm_mmu_t mmu = { 0 };
+
+	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
+	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
+		return (VMM_MODE_UNKNOWN);
+
+	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
+}
+
+vmm_isa_t
+vmm_vcpu_isa(vmm_t *vmm, int vcpu)
+{
+	vmm_desc_t cs;
+
+	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
+		return (VMM_ISA_UNKNOWN);
+
+	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
+	case 0x0:		/* 16b code segment */
+		return (VMM_ISA_16);
+	case X86_SEG_LONG:	/* 64b code segment */
+		return (VMM_ISA_64);
+	case X86_SEG_BIG:	/* 32b code segment */
+		return (VMM_ISA_32);
+	}
+
+	return (VMM_ISA_UNKNOWN);
+}
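The recursion in vmm_pte2paddr() is easier to follow with the long-mode bit layout spelled out: each table level above the 12-bit page offset consumes 9 index bits (10 for 32-bit non-PAE tables), and the level-N index selects the PTE that the next recursion step reads. A standalone illustration of that split (editorial only, not used by the library):

#include <stdio.h>
#include <stdint.h>

/*
 * Print the long-mode decomposition of a virtual address: four 9-bit
 * table indices above a 12-bit page offset. This mirrors the offset
 * arithmetic in vmm_pte2paddr() for the ia32 == B_FALSE case.
 */
static void
show_long_mode_split(uint64_t vaddr)
{
	int level;

	for (level = 4; level >= 1; level--) {
		int shift = 12 + 9 * (level - 1);

		(void) printf("level %d index: %#llx\n", level,
		    (unsigned long long)((vaddr >> shift) & 0x1ff));
	}
	(void) printf("page offset:   %#llx\n",
	    (unsigned long long)(vaddr & 0xfff));
}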
+
+/*
+ * vmm_vtol:
+ *
+ * Translate a virtual address to a linear address on a certain vCPU,
+ * using the specified segment register or descriptor according to the mode.
+ *
+ */
+int
+vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
+{
+	vmm_desc_t desc;
+	uint64_t limit;
+
+	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
+		return (-1);
+
+	switch (vmm_vcpu_mode(vmm, vcpu)) {
+	case VMM_MODE_REAL:
+		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
+			goto fault;
+		/* FALLTHRU */
+	case VMM_MODE_PROT:
+	case VMM_MODE_PAE:
+		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
+			/* unusable, system segment, or not present */
+			goto fault;
+
+		limit = desc.vd_lim;
+		if (desc.vd_acc & X86_SEG_GRANULARITY)
+			limit *= 4096;
+
+		if (vaddr > limit)
+			goto fault;
+		/* FALLTHRU */
+	case VMM_MODE_LONG:
+		*laddr = desc.vd_base + vaddr;
+		return (0);
+
+	default:
+	fault:
+		errno = EFAULT;
+		return (-1);
+	}
+
+}
+
+/*
+ * vmm_vtop:
+ *
+ * Translate a virtual address to a guest physical address on a certain vCPU,
+ * according to the mode the vCPU is in.
+ */
+int
+vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
+{
+	vmm_mmu_t mmu = { 0 };
+	int ret = 0;
+
+	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
+		return (-1);
+
+	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
+	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
+		return (-1);
+
+	if ((mmu.vm_cr0 & CR0_PG) == 0) {
+		/* no paging, physical equals virtual */
+		*paddr = vaddr;
+		return (0);
+	}
+
+	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
+	case VMM_MODE_PROT:
+		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
+		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
+		break;
+	case VMM_MODE_PAE:
+		/* protected mode with PAE: 3-level paging, 64bit PTEs */
+		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
+		break;
+	case VMM_MODE_LONG:
+		/* long mode: 4-level paging, 64bit PTEs */
+		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
+		break;
+	default:
+		ret = -1;
+	}
+
+	return (ret);
+}
+
+ssize_t
+vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
+{
+	ssize_t res = 0;
+	uint64_t paddr;
+	size_t plen;
+	uint64_t boundary;
+
+	while (len != 0) {
+		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
+			errno = EFAULT;
+			return (0);
+		}
+
+		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
+		if (addr + len > boundary)
+			plen = boundary - addr;
+		else
+			plen = len;
+
+		if (vmm_pread(vmm, buf, plen, paddr) != plen)
+			return (0);
+		len -= plen;
+		addr += plen;
+		buf += plen;
+		res += plen;
+	}
+
+	return (res);
+}
+
+ssize_t
+vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
+    uintptr_t addr)
+{
+	ssize_t res = 0;
+	uint64_t paddr;
+	size_t plen;
+	uint64_t boundary;
+
+	while (len != 0) {
+		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
+			errno = EFAULT;
+			return (0);
+		}
+
+		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
+		if (addr + len > boundary)
+			plen = boundary - addr;
+		else
+			plen = len;
+
+		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
+			return (0);
+		len -= plen;
+		addr += plen;
+		buf += plen;
+		res += plen;
+	}
+
+	return (res);
+}
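Putting the pieces together, reading guest memory through a virtual address takes an open handle, a mapping, and a stopped vCPU whose registers define the translation. A final sketch (editorial; the helper name is hypothetical, vCPU 0 is assumed to exist, and error handling is trimmed to the essentials):

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/kdi_regs.h>
#include <libvmm.h>

/*
 * Dump the 16 bytes of guest code at the current instruction pointer
 * of vCPU 0. vmm_vread() goes through vmm_vtol()/vmm_vtop(), so the
 * CS descriptor and the vCPU's paging mode are honored automatically.
 */
static int
dump_code_at_rip(const char *vmname)
{
	vmm_t *vmm;
	uint64_t rip;
	uint8_t code[16];
	int i, ret = -1;

	if ((vmm = vmm_open_vm(vmname)) == NULL)
		return (-1);

	if (vmm_map(vmm, B_FALSE) != 0)		/* read-only mapping */
		goto out;
	if (vmm_stop(vmm) != 0)			/* freeze all vCPUs */
		goto out;

	if (vmm_getreg(vmm, 0, KDIREG_RIP, &rip) != 0)
		goto cont;
	if (vmm_vread(vmm, 0, VMM_DESC_CS, code, sizeof (code), rip) !=
	    sizeof (code))
		goto cont;

	(void) printf("%%rip %#lx:", (unsigned long)rip);
	for (i = 0; i < 16; i++)
		(void) printf(" %02x", code[i]);
	(void) printf("\n");
	ret = 0;
cont:
	(void) vmm_cont(vmm);			/* let the guest run again */
out:
	vmm_close_vm(vmm);			/* also unmaps guest memory */
	return (ret);
}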
