src/usr.sbin/vmd/virtio.c

/* $OpenBSD: virtio.c,v 1.123 2025/01/08 15:46:10 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h> /* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
#include <dev/pv/virtioreg.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/vmm/vmm.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <errno.h>
#include <event.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "atomicio.h"
#include "pci.h"
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"
extern struct vmd *env;
extern char *__progname;
struct viornd_dev viornd;
struct vioscsi_dev *vioscsi;
struct vmmci_dev vmmci;
/* Devices emulated in subprocesses are inserted into this list. */
SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
#define MAXPHYS (64 * 1024) /* max raw I/O transfer size */
#define VIRTIO_NET_F_MAC (1<<5)
#define VMMCI_F_TIMESYNC (1<<0)
#define VMMCI_F_ACK (1<<1)
#define VMMCI_F_SYNCRTC (1<<2)
#define RXQ 0
#define TXQ 1
static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
static void virtio_dispatch_dev(int, short, void *);
static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
static int virtio_dev_closefds(struct virtio_dev *);
static void vmmci_pipe_dispatch(int, short, void *);
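/*
 * Return a printable name for a legacy virtio PCI configuration register,
 * used when logging guest register accesses.
 */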
const char *
virtio_reg_name(uint8_t reg)
{
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI ... VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
return "device config 0";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
return "device config 1";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
default: return "unknown";
}
}
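/*
 * Compute the size in bytes of a legacy virtqueue with vq_size descriptors:
 * the descriptor table plus the available ring (padded to VIRTQUEUE_ALIGN),
 * followed by the used ring (also padded).
 */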
uint32_t
vring_size(uint32_t vq_size)
{
uint32_t allocsize1, allocsize2;
/* allocsize1: descriptor table + avail ring + pad */
allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
+ sizeof(uint16_t) * (2 + vq_size));
/* allocsize2: used ring + pad */
allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
+ sizeof(struct vring_used_elem) * vq_size);
return allocsize1 + allocsize2;
}
/* Update queue select */
void
viornd_update_qs(void)
{
struct virtio_vq_info *vq_info;
/* Invalid queue? */
if (viornd.cfg.queue_select > 0) {
viornd.cfg.queue_size = 0;
return;
}
vq_info = &viornd.vq[viornd.cfg.queue_select];
/* Update queue pfn/size based on queue select */
viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
viornd.cfg.queue_size = vq_info->qs;
}
/* Update queue address */
void
viornd_update_qa(void)
{
struct virtio_vq_info *vq_info;
void *hva = NULL;
/* Invalid queue? */
if (viornd.cfg.queue_select > 0)
return;
vq_info = &viornd.vq[viornd.cfg.queue_select];
vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
if (hva == NULL)
fatalx("viornd_update_qa");
vq_info->q_hva = hva;
}
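/*
 * viornd_notifyq
 *
 * Handle a queue notification for the entropy device: take the next
 * available descriptor, fill the guest buffer with data from
 * arc4random_buf(3), and place the descriptor on the used ring.
 *
 * Returns 1 if the guest should be interrupted, 0 otherwise.
 */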
int
viornd_notifyq(void)
{
size_t sz;
int dxx, ret;
uint16_t aidx, uidx;
char *vr, *rnd_data;
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
struct virtio_vq_info *vq_info;
ret = 0;
/* Invalid queue? */
if (viornd.cfg.queue_notify > 0)
return (0);
vq_info = &viornd.vq[viornd.cfg.queue_notify];
vr = vq_info->q_hva;
if (vr == NULL)
fatalx("%s: null vring", __func__);
desc = (struct vring_desc *)(vr);
avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
aidx = avail->idx & VIORND_QUEUE_MASK;
uidx = used->idx & VIORND_QUEUE_MASK;
dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
sz = desc[dxx].len;
if (sz > MAXPHYS)
fatalx("viornd descriptor size too large (%zu)", sz);
rnd_data = malloc(sz);
if (rnd_data != NULL) {
arc4random_buf(rnd_data, sz);
if (write_mem(desc[dxx].addr, rnd_data, sz)) {
log_warnx("viornd: can't write random data @ "
"0x%llx",
desc[dxx].addr);
} else {
/* ret == 1 -> interrupt needed */
/* XXX check VIRTIO_F_NO_INTR */
ret = 1;
viornd.cfg.isr_status = 1;
used->ring[uidx].id = dxx;
used->ring[uidx].len = sz;
__sync_synchronize();
used->idx++;
}
free(rnd_data);
} else
fatal("memory allocation error for viornd data");
return (ret);
}
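/*
 * PCI BAR i/o handler for the virtio entropy device. dir == 0 indicates
 * a register write, anything else a register read. Runs in the vcpu
 * thread.
 */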
int
virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *unused, uint8_t sz)
{
*intr = 0xFF;
if (dir == 0) {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
case VIRTIO_CONFIG_QUEUE_SIZE:
case VIRTIO_CONFIG_ISR_STATUS:
log_warnx("%s: illegal write %x to %s",
__progname, *data, virtio_reg_name(reg));
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
viornd.cfg.guest_feature = *data;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
viornd.cfg.queue_pfn = *data;
viornd_update_qa();
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
viornd.cfg.queue_select = *data;
viornd_update_qs();
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
viornd.cfg.queue_notify = *data;
if (viornd_notifyq())
*intr = 1;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
viornd.cfg.device_status = *data;
break;
}
} else {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
*data = viornd.cfg.device_feature;
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
*data = viornd.cfg.guest_feature;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
*data = viornd.cfg.queue_pfn;
break;
case VIRTIO_CONFIG_QUEUE_SIZE:
*data = viornd.cfg.queue_size;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
*data = viornd.cfg.queue_select;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
*data = viornd.cfg.queue_notify;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
*data = viornd.cfg.device_status;
break;
case VIRTIO_CONFIG_ISR_STATUS:
*data = viornd.cfg.isr_status;
viornd.cfg.isr_status = 0;
vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq);
break;
}
}
return (0);
}
/*
* vmmci_ctl
*
* Inject a command into the vmmci device, potentially delivering an interrupt.
*
* Called by the vm process's event(3) loop.
*/
int
vmmci_ctl(unsigned int cmd)
{
int ret = 0;
struct timeval tv = { 0, 0 };
mutex_lock(&vmmci.mutex);
if ((vmmci.cfg.device_status &
VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
ret = -1;
goto unlock;
}
if (cmd == vmmci.cmd)
goto unlock;
switch (cmd) {
case VMMCI_NONE:
break;
case VMMCI_SHUTDOWN:
case VMMCI_REBOOT:
/* Update command */
vmmci.cmd = cmd;
/*
* vmm VMs do not support powerdown, send a reboot request
* instead and turn it off after the triple fault.
*/
if (cmd == VMMCI_SHUTDOWN)
cmd = VMMCI_REBOOT;
/* Trigger interrupt */
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
/* Add ACK timeout */
tv.tv_sec = VMMCI_TIMEOUT_SHORT;
evtimer_add(&vmmci.timeout, &tv);
break;
case VMMCI_SYNCRTC:
if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
/* RTC updated, request guest VM resync of its RTC */
vmmci.cmd = cmd;
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
} else {
log_debug("%s: RTC sync skipped (guest does not "
"support RTC sync)\n", __func__);
}
break;
default:
fatalx("invalid vmmci command: %d", cmd);
}
unlock:
mutex_unlock(&vmmci.mutex);
return (ret);
}
/*
* vmmci_ack
*
* Process a write to the command register.
*
* Called by the vcpu thread. Must be called with the mutex held.
*/
void
vmmci_ack(unsigned int cmd)
{
switch (cmd) {
case VMMCI_NONE:
break;
case VMMCI_SHUTDOWN:
/*
* The shutdown was requested by the VM if we don't have
* a pending shutdown request. In this case add a short
* timeout to give the VM a chance to reboot before the
* timer is expired.
*/
if (vmmci.cmd == 0) {
log_debug("%s: vm %u requested shutdown", __func__,
vmmci.vm_id);
vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
return;
}
/* FALLTHROUGH */
case VMMCI_REBOOT:
/*
* If the VM acknowledged our shutdown request, give it
* enough time to shutdown or reboot gracefully. This
* might take a considerable amount of time (running
* rc.shutdown on the VM), so increase the timeout before
* killing it forcefully.
*/
if (cmd == vmmci.cmd) {
log_debug("%s: vm %u acknowledged shutdown request",
__func__, vmmci.vm_id);
vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
}
break;
case VMMCI_SYNCRTC:
log_debug("%s: vm %u acknowledged RTC sync request",
__func__, vmmci.vm_id);
vmmci.cmd = VMMCI_NONE;
break;
default:
log_warnx("%s: illegal request %u", __func__, cmd);
break;
}
}
void
vmmci_timeout(int fd, short type, void *arg)
{
log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
}
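/*
 * PCI BAR i/o handler for the vmm control interface (vmmci). A write to
 * the first config register acknowledges the pending command via
 * vmmci_ack(); config reads return the current command and the host
 * clock split across 32-bit registers. Runs in the vcpu thread and takes
 * vmmci.mutex for the duration of the access.
 */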
int
vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *unused, uint8_t sz)
{
*intr = 0xFF;
mutex_lock(&vmmci.mutex);
if (dir == 0) {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
case VIRTIO_CONFIG_QUEUE_SIZE:
case VIRTIO_CONFIG_ISR_STATUS:
log_warnx("%s: illegal write %x to %s",
__progname, *data, virtio_reg_name(reg));
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
vmmci.cfg.guest_feature = *data;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
vmmci.cfg.queue_pfn = *data;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
vmmci.cfg.queue_select = *data;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
vmmci.cfg.queue_notify = *data;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
vmmci.cfg.device_status = *data;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
vmmci_ack(*data);
break;
}
} else {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
*data = vmmci.cmd;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
/* Update time once when reading the first register */
gettimeofday(&vmmci.time, NULL);
*data = (uint64_t)vmmci.time.tv_sec;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
*data = (uint64_t)vmmci.time.tv_sec >> 32;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
*data = (uint64_t)vmmci.time.tv_usec;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
*data = (uint64_t)vmmci.time.tv_usec >> 32;
break;
case VIRTIO_CONFIG_DEVICE_FEATURES:
*data = vmmci.cfg.device_feature;
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
*data = vmmci.cfg.guest_feature;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
*data = vmmci.cfg.queue_pfn;
break;
case VIRTIO_CONFIG_QUEUE_SIZE:
*data = vmmci.cfg.queue_size;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
*data = vmmci.cfg.queue_select;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
*data = vmmci.cfg.queue_notify;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
*data = vmmci.cfg.device_status;
break;
case VIRTIO_CONFIG_ISR_STATUS:
*data = vmmci.cfg.isr_status;
vmmci.cfg.isr_status = 0;
vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq);
break;
}
}
mutex_unlock(&vmmci.mutex);
return (0);
}
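/*
 * Determine the base (backing) image path for a disk image. Raw images
 * have no base; qcow2 images may reference a backing file.
 */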
int
virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
{
switch (type) {
case VMDF_RAW:
return 0;
case VMDF_QCOW2:
return virtio_qcow2_get_base(fd, path, npath, dpath);
}
log_warnx("%s: invalid disk format", __func__);
return -1;
}
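/*
 * Handle messages sent over the vmmci device pipe by the vcpu thread
 * (see vmmci_ack), rearming the shutdown timeout with a short or long
 * expiry.
 */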
static void
vmmci_pipe_dispatch(int fd, short event, void *arg)
{
enum pipe_msg_type msg;
struct timeval tv = { 0, 0 };
msg = vm_pipe_recv(&vmmci.dev_pipe);
switch (msg) {
case VMMCI_SET_TIMEOUT_SHORT:
tv.tv_sec = VMMCI_TIMEOUT_SHORT;
evtimer_add(&vmmci.timeout, &tv);
break;
case VMMCI_SET_TIMEOUT_LONG:
tv.tv_sec = VMMCI_TIMEOUT_LONG;
evtimer_add(&vmmci.timeout, &tv);
break;
default:
log_warnx("%s: invalid pipe message type %d", __func__, msg);
}
}
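/*
 * virtio_init
 *
 * Create and configure the emulated virtio PCI devices for a new vm:
 * the entropy device, one vionet per nic, one vioblk per disk, an
 * optional vioscsi cdrom, and the vmm control interface. Network and
 * block devices are handed off to child processes via
 * virtio_dev_launch().
 */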
void
virtio_init(struct vmd_vm *vm, int child_cdrom,
int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct vm_create_params *vcp = &vmc->vmc_params;
struct virtio_dev *dev;
uint8_t id;
uint8_t i, j;
int ret = 0;
/* Virtio entropy device */
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
PCI_SUBCLASS_SYSTEM_MISC,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
log_warnx("%s: can't add PCI virtio rng device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
log_warnx("%s: can't add bar for virtio rng device",
__progname);
return;
}
memset(&viornd, 0, sizeof(viornd));
viornd.vq[0].qs = VIORND_QUEUE_SIZE;
viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
VIORND_QUEUE_SIZE;
viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
viornd.pci_id = id;
viornd.irq = pci_get_dev_irq(id);
viornd.vm_id = vcp->vcp_id;
SLIST_INIT(&virtio_devs);
if (vmc->vmc_nnics > 0) {
for (i = 0; i < vmc->vmc_nnics; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vionet",
__progname);
return;
}
/* Virtio network */
dev->dev_type = VMD_DEVTYPE_NET;
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
log_warnx("%s: can't add PCI virtio net device",
__progname);
return;
}
dev->pci_id = id;
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(id);
/* The vionet pci bar function is called by the vcpu. */
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
dev)) {
log_warnx("%s: can't add bar for virtio net "
"device", __progname);
return;
}
dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
dev->vionet.vq[RXQ].vq_availoffset =
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
dev->vionet.vq[RXQ].last_avail = 0;
dev->vionet.vq[RXQ].notified_avail = 0;
dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
dev->vionet.vq[TXQ].vq_availoffset =
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
dev->vionet.vq[TXQ].last_avail = 0;
dev->vionet.vq[TXQ].notified_avail = 0;
dev->vionet.data_fd = child_taps[i];
/* MAC address has been assigned by the parent */
memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
dev->vionet.lockedmac =
vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
dev->vionet.local =
vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
dev->vionet.pxeboot = 1;
memcpy(&dev->vionet.local_prefix,
&env->vmd_cfg.cfg_localprefix,
sizeof(dev->vionet.local_prefix));
log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
__func__, vcp->vcp_name, i,
ether_ntoa((void *)dev->vionet.mac),
dev->vionet.lockedmac ? ", locked" : "",
dev->vionet.local ? ", local" : "",
dev->vionet.pxeboot ? ", pxeboot" : "");
/* Add the vionet to our device list. */
dev->vionet.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
}
if (vmc->vmc_ndisks > 0) {
for (i = 0; i < vmc->vmc_ndisks; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vioblk",
__progname);
return;
}
/* One vioblk device for each disk defined in vcp */
dev->dev_type = VMD_DEVTYPE_DISK;
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_BLOCK,
PCI_CLASS_MASS_STORAGE,
PCI_SUBCLASS_MASS_STORAGE_SCSI,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
log_warnx("%s: can't add PCI virtio block "
"device", __progname);
return;
}
dev->pci_id = id;
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(id);
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
dev)) {
log_warnx("%s: can't add bar for virtio block "
"device", __progname);
return;
}
dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
dev->vioblk.vq[0].vq_availoffset =
sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
dev->vioblk.vq[0].last_avail = 0;
dev->vioblk.cfg.device_feature =
VIRTIO_BLK_F_SEG_MAX;
dev->vioblk.seg_max = VIOBLK_SEG_MAX;
/*
* Initialize disk fds to an invalid fd (-1), then
* set any child disk fds.
*/
memset(&dev->vioblk.disk_fd, -1,
sizeof(dev->vioblk.disk_fd));
dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
for (j = 0; j < dev->vioblk.ndisk_fd; j++)
dev->vioblk.disk_fd[j] = child_disks[i][j];
dev->vioblk.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
}
/*
* Launch virtio devices that support subprocess execution.
*/
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (virtio_dev_launch(vm, dev) != 0)
fatalx("failed to launch virtio device");
}
/* vioscsi cdrom */
if (strlen(vmc->vmc_cdrom)) {
vioscsi = calloc(1, sizeof(struct vioscsi_dev));
if (vioscsi == NULL) {
log_warn("%s: calloc failure allocating vioscsi",
__progname);
return;
}
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_SCSI,
PCI_CLASS_MASS_STORAGE,
PCI_SUBCLASS_MASS_STORAGE_SCSI,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
log_warnx("%s: can't add PCI vioscsi device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
log_warnx("%s: can't add bar for vioscsi device",
__progname);
return;
}
for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
vioscsi->vq[i].vq_availoffset =
sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
vioscsi->vq[i].last_avail = 0;
}
if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
1) == -1) {
log_warnx("%s: unable to determine iso format",
__func__);
return;
}
vioscsi->locked = 0;
vioscsi->lba = 0;
vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
vioscsi->pci_id = id;
vioscsi->vm_id = vcp->vcp_id;
vioscsi->irq = pci_get_dev_irq(id);
}
/* virtio control device */
if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
PCI_PRODUCT_OPENBSD_CONTROL,
PCI_CLASS_COMMUNICATIONS,
PCI_SUBCLASS_COMMUNICATIONS_MISC,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
log_warnx("%s: can't add PCI vmm control device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
log_warnx("%s: can't add bar for vmm control device",
__progname);
return;
}
memset(&vmmci, 0, sizeof(vmmci));
vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
VMMCI_F_SYNCRTC;
vmmci.vm_id = vcp->vcp_id;
vmmci.irq = pci_get_dev_irq(id);
vmmci.pci_id = id;
ret = pthread_mutex_init(&vmmci.mutex, NULL);
if (ret) {
errno = ret;
fatal("could not initialize vmmci mutex");
}
evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
event_add(&vmmci.dev_pipe.read_ev, NULL);
}
/*
* vionet_set_hostmac
*
* Sets the hardware address for the host-side tap(4) on a vionet_dev.
*
* This should only be called from the event-loop thread
*
* vm: pointer to the current vmd_vm instance
* idx: index into the array of vionet_dev's for the target vionet_dev
* addr: ethernet address to set
*/
void
vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct virtio_dev *dev;
struct vionet_dev *vionet = NULL;
int ret;
if (idx >= vmc->vmc_nnics)
fatalx("%s: invalid vionet index: %u", __func__, idx);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type == VMD_DEVTYPE_NET
&& dev->vionet.idx == idx) {
vionet = &dev->vionet;
break;
}
}
if (vionet == NULL)
fatalx("%s: dev == NULL, idx = %u", __func__, idx);
/* Set the local vm process copy. */
memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
/* Send the information to the device process. */
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
vionet->hostmac, sizeof(vionet->hostmac));
if (ret == -1) {
log_warnx("%s: failed to queue hostmac to vionet dev %u",
__func__, idx);
return;
}
}
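/*
 * virtio_shutdown
 *
 * Flush and close any vioscsi backing file, then synchronously tell each
 * subprocess-backed device to shut down and wait for it to exit.
 */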
void
virtio_shutdown(struct vmd_vm *vm)
{
int ret, status;
pid_t pid = 0;
struct virtio_dev *dev, *tmp;
struct viodev_msg msg;
struct imsgbuf *ibuf;
/* Ensure that our disks are synced. */
if (vioscsi != NULL)
vioscsi->file.close(vioscsi->file.p, 0);
/*
* Broadcast shutdown to child devices. We need to do this
* synchronously as we have already stopped the async event thread.
*/
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
memset(&msg, 0, sizeof(msg));
msg.type = VIODEV_MSG_SHUTDOWN;
ibuf = &dev->sync_iev.ibuf;
ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
fatalx("%s: failed to send shutdown to device",
__func__);
if (imsgbuf_flush(ibuf) == -1)
fatalx("%s: imsgbuf_flush", __func__);
}
/*
* Wait for all children to shutdown using a simple approach of
* iterating over known child devices and waiting for them to die.
*/
SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
log_debug("%s: waiting on device pid %d", __func__,
dev->dev_pid);
do {
pid = waitpid(dev->dev_pid, &status, WNOHANG);
} while (pid == 0 || (pid == -1 && errno == EINTR));
if (pid == dev->dev_pid)
log_debug("%s: device for pid %d is stopped",
__func__, pid);
else
log_warnx("%s: unexpected pid %d", __func__, pid);
free(dev);
}
}
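/*
 * The *_restore functions below are used when resuming a received vm:
 * each reads a previously dumped device structure from fd, re-registers
 * the PCI BAR handler, and re-derives process-local state (irq, vm id,
 * virtqueue host addresses, file descriptors).
 */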
int
vmmci_restore(int fd, uint32_t vm_id)
{
log_debug("%s: receiving vmmci", __func__);
if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
log_warnx("%s: error reading vmmci from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
log_warnx("%s: can't set bar fn for vmm control device",
__progname);
return (-1);
}
vmmci.vm_id = vm_id;
vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
memset(&vmmci.timeout, 0, sizeof(struct event));
evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
return (0);
}
int
viornd_restore(int fd, struct vmd_vm *vm)
{
void *hva = NULL;
log_debug("%s: receiving viornd", __func__);
if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
log_warnx("%s: error reading viornd from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
log_warnx("%s: can't set bar fn for virtio rng device",
__progname);
return (-1);
}
viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
viornd.irq = pci_get_dev_irq(viornd.pci_id);
hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
if (hva == NULL)
fatal("failed to restore viornd virtqueue");
viornd.vq[0].q_hva = hva;
return (0);
}
int
vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct vm_create_params *vcp = &vmc->vmc_params;
struct virtio_dev *dev;
uint8_t i;
if (vmc->vmc_nnics == 0)
return (0);
for (i = 0; i < vmc->vmc_nnics; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vionet",
__progname);
return (-1);
}
log_debug("%s: receiving virtio network device", __func__);
if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
!= sizeof(struct virtio_dev)) {
log_warnx("%s: error reading vionet from fd",
__func__);
return (-1);
}
/* Virtio network */
if (dev->dev_type != VMD_DEVTYPE_NET) {
log_warnx("%s: invalid device type", __func__);
return (-1);
}
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(dev->pci_id);
if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
log_warnx("%s: can't set bar fn for virtio net "
"device", __progname);
return (-1);
}
dev->vionet.data_fd = child_taps[i];
dev->vionet.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
return (0);
}
int
vioblk_restore(int fd, struct vmd_vm *vm,
int child_disks[][VM_MAX_BASE_PER_DISK])
{
struct vmop_create_params *vmc = &vm->vm_params;
struct virtio_dev *dev;
uint8_t i, j;
if (vmc->vmc_ndisks == 0)
return (0);
for (i = 0; i < vmc->vmc_ndisks; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vioblks",
__progname);
return (-1);
}
log_debug("%s: receiving vioblk", __func__);
if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
!= sizeof(struct virtio_dev)) {
log_warnx("%s: error reading vioblk from fd", __func__);
return (-1);
}
if (dev->dev_type != VMD_DEVTYPE_DISK) {
log_warnx("%s: invalid device type", __func__);
return (-1);
}
dev->sync_fd = -1;
dev->async_fd = -1;
if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
log_warnx("%s: can't set bar fn for virtio block "
"device", __progname);
return (-1);
}
dev->vm_id = vmc->vmc_params.vcp_id;
dev->irq = pci_get_dev_irq(dev->pci_id);
memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
for (j = 0; j < dev->vioblk.ndisk_fd; j++)
dev->vioblk.disk_fd[j] = child_disks[i][j];
dev->vioblk.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
return (0);
}
int
vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
{
void *hva = NULL;
unsigned int i;
if (!strlen(vm->vm_params.vmc_cdrom))
return (0);
vioscsi = calloc(1, sizeof(struct vioscsi_dev));
if (vioscsi == NULL) {
log_warn("%s: calloc failure allocating vioscsi", __progname);
return (-1);
}
log_debug("%s: receiving vioscsi", __func__);
if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
sizeof(struct vioscsi_dev)) {
log_warnx("%s: error reading vioscsi from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
log_warnx("%s: can't set bar fn for vmm control device",
__progname);
return (-1);
}
vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
/* vioscsi uses 3 virtqueues. */
for (i = 0; i < 3; i++) {
hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
vring_size(VIOSCSI_QUEUE_SIZE));
if (hva == NULL)
fatal("failed to restore vioscsi virtqueue");
vioscsi->vq[i].q_hva = hva;
}
return (0);
}
int
virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
struct virtio_dev *dev;
int ret;
SLIST_INIT(&virtio_devs);
if ((ret = viornd_restore(fd, vm)) == -1)
return (ret);
if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
return (ret);
if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
return (ret);
if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
return (ret);
if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
return (ret);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (virtio_dev_launch(vm, dev) != 0)
fatalx("%s: failed to restore virtio dev", __func__);
}
return (0);
}
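/*
 * The *_dump functions serialize device state to fd when sending a vm.
 * Host virtual addresses, file descriptors, and imsg state are cleared
 * first as they are only valid in this process and are rebuilt on
 * restore. vionet and vioblk state is fetched from the device child
 * processes with a VIODEV_MSG_DUMP request.
 */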
int
viornd_dump(int fd)
{
log_debug("%s: sending viornd", __func__);
viornd.vq[0].q_hva = NULL;
if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
log_warnx("%s: error writing viornd to fd", __func__);
return (-1);
}
return (0);
}
int
vmmci_dump(int fd)
{
log_debug("%s: sending vmmci", __func__);
if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
log_warnx("%s: error writing vmmci to fd", __func__);
return (-1);
}
return (0);
}
int
vionet_dump(int fd)
{
struct virtio_dev *dev, temp;
struct viodev_msg msg;
struct imsg imsg;
struct imsgbuf *ibuf = NULL;
size_t sz;
int ret;
log_debug("%s: dumping vionet", __func__);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type != VMD_DEVTYPE_NET)
continue;
memset(&msg, 0, sizeof(msg));
memset(&imsg, 0, sizeof(imsg));
ibuf = &dev->sync_iev.ibuf;
msg.type = VIODEV_MSG_DUMP;
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed requesting dump of vionet[%d]",
__func__, dev->vionet.idx);
return (-1);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush", __func__);
return (-1);
}
sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
if (sz != sizeof(temp)) {
log_warnx("%s: failed to dump vionet[%d]", __func__,
dev->vionet.idx);
return (-1);
}
/* Clear volatile state. Will reinitialize on restore. */
temp.vionet.vq[RXQ].q_hva = NULL;
temp.vionet.vq[TXQ].q_hva = NULL;
temp.async_fd = -1;
temp.sync_fd = -1;
memset(&temp.async_iev, 0, sizeof(temp.async_iev));
memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
log_warnx("%s: error writing vionet to fd", __func__);
return (-1);
}
}
return (0);
}
int
vioblk_dump(int fd)
{
struct virtio_dev *dev, temp;
struct viodev_msg msg;
struct imsg imsg;
struct imsgbuf *ibuf = NULL;
size_t sz;
int ret;
log_debug("%s: dumping vioblk", __func__);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type != VMD_DEVTYPE_DISK)
continue;
memset(&msg, 0, sizeof(msg));
memset(&imsg, 0, sizeof(imsg));
ibuf = &dev->sync_iev.ibuf;
msg.type = VIODEV_MSG_DUMP;
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed requesting dump of vioblk[%d]",
__func__, dev->vioblk.idx);
return (-1);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush", __func__);
return (-1);
}
sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
if (sz != sizeof(temp)) {
log_warnx("%s: failed to dump vioblk[%d]", __func__,
dev->vioblk.idx);
return (-1);
}
/* Clear volatile state. Will reinitialize on restore. */
temp.vioblk.vq[0].q_hva = NULL;
temp.async_fd = -1;
temp.sync_fd = -1;
memset(&temp.async_iev, 0, sizeof(temp.async_iev));
memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
log_warnx("%s: error writing vioblk to fd", __func__);
return (-1);
}
}
return (0);
}
int
vioscsi_dump(int fd)
{
unsigned int i;
if (vioscsi == NULL)
return (0);
log_debug("%s: sending vioscsi", __func__);
for (i = 0; i < 3; i++)
vioscsi->vq[i].q_hva = NULL;
if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
sizeof(struct vioscsi_dev)) {
log_warnx("%s: error writing vioscsi to fd", __func__);
return (-1);
}
return (0);
}
int
virtio_dump(int fd)
{
int ret;
if ((ret = viornd_dump(fd)) == -1)
return ret;
if ((ret = vioblk_dump(fd)) == -1)
return ret;
if ((ret = vioscsi_dump(fd)) == -1)
return ret;
if ((ret = vionet_dump(fd)) == -1)
return ret;
if ((ret = vmmci_dump(fd)) == -1)
return ret;
return (0);
}
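/*
 * Queue an imsg of the given type to every subprocess-backed virtio
 * device over its async channel.
 */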
void
virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
uint16_t datalen)
{
struct virtio_dev *dev;
int ret;
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
datalen);
if (ret == -1) {
log_warnx("%s: failed to broadcast imsg type %u",
__func__, type);
}
}
}
void
virtio_stop(struct vmd_vm *vm)
{
return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
}
void
virtio_start(struct vmd_vm *vm)
{
return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
}
/*
* Fork+exec a child virtio device. Returns 0 on success.
*/
static int
virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
{
char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
pid_t dev_pid;
int sync_fds[2], async_fds[2], ret = 0;
size_t i, sz = 0;
struct viodev_msg msg;
struct virtio_dev *dev_entry;
struct imsg imsg;
struct imsgev *iev = &dev->sync_iev;
switch (dev->dev_type) {
case VMD_DEVTYPE_NET:
log_debug("%s: launching vionet%d",
vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
break;
case VMD_DEVTYPE_DISK:
log_debug("%s: launching vioblk%d",
vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
break;
default:
log_warnx("%s: invalid device type", __func__);
return (EINVAL);
/* NOTREACHED */
}
/* We need two channels: one synchronous (IO reads) and one async. */
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
sync_fds) == -1) {
log_warn("failed to create socketpair");
return (errno);
}
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
async_fds) == -1) {
log_warn("failed to create async socketpair");
return (errno);
}
/* Fork... */
dev_pid = fork();
if (dev_pid == -1) {
ret = errno;
log_warn("%s: fork failed", __func__);
goto err;
}
if (dev_pid > 0) {
/* Parent */
close_fd(sync_fds[1]);
close_fd(async_fds[1]);
/* Save the child's pid to help with cleanup. */
dev->dev_pid = dev_pid;
/* Set the channel fds to the child's before sending. */
dev->sync_fd = sync_fds[1];
dev->async_fd = async_fds[1];
/* 1. Send over our configured device. */
log_debug("%s: sending '%c' type device struct", __func__,
dev->dev_type);
sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
if (sz != sizeof(*dev)) {
log_warnx("%s: failed to send device", __func__);
ret = EIO;
goto err;
}
/* Close data fds. Only the child device needs them now. */
if (virtio_dev_closefds(dev) == -1) {
log_warnx("%s: failed to close device data fds",
__func__);
goto err;
}
/* 2. Send over details on the VM (including memory fds). */
log_debug("%s: sending vm message for '%s'", __func__,
vm->vm_params.vmc_params.vcp_name);
sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
if (sz != sizeof(*vm)) {
log_warnx("%s: failed to send vm details", __func__);
ret = EIO;
goto err;
}
/*
* Initialize our imsg channel to the child device. The initial
* communication will be synchronous. We expect the child to
* report itself "ready" to confirm the launch was a success.
*/
if (imsgbuf_init(&iev->ibuf, sync_fds[0]) == -1) {
log_warn("%s: failed to init imsgbuf", __func__);
goto err;
}
imsgbuf_allow_fdpass(&iev->ibuf);
ret = imsgbuf_read_one(&iev->ibuf, &imsg);
if (ret == 0 || ret == -1) {
log_warnx("%s: failed to receive ready message from "
"'%c' type device", __func__, dev->dev_type);
ret = EIO;
goto err;
}
ret = 0;
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
imsg_free(&imsg);
if (msg.type != VIODEV_MSG_READY) {
log_warnx("%s: expected ready message, got type %d",
__func__, msg.type);
ret = EINVAL;
goto err;
}
log_debug("%s: device reports ready via sync channel",
__func__);
/*
* Wire in the async event handling, but after reverting back
* to the parent's fd's.
*/
dev->sync_fd = sync_fds[0];
dev->async_fd = async_fds[0];
vm_device_pipe(dev, virtio_dispatch_dev, NULL);
} else {
/* Child */
close_fd(async_fds[0]);
close_fd(sync_fds[0]);
/* Close pty. Virtio devices do not need it. */
close_fd(vm->vm_tty);
vm->vm_tty = -1;
if (vm->vm_cdrom != -1) {
close_fd(vm->vm_cdrom);
vm->vm_cdrom = -1;
}
/* Keep data file descriptors open after exec. */
SLIST_FOREACH(dev_entry, &virtio_devs, dev_next) {
if (dev_entry == dev)
continue;
if (virtio_dev_closefds(dev_entry) == -1)
fatalx("unable to close other virtio devs");
}
memset(num, 0, sizeof(num));
snprintf(num, sizeof(num), "%d", sync_fds[1]);
memset(vmm_fd, 0, sizeof(vmm_fd));
snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
memset(vm_name, 0, sizeof(vm_name));
snprintf(vm_name, sizeof(vm_name), "%s",
vm->vm_params.vmc_params.vcp_name);
t[0] = dev->dev_type;
t[1] = '\0';
i = 0;
nargv[i++] = env->argv0;
nargv[i++] = "-X";
nargv[i++] = num;
nargv[i++] = "-t";
nargv[i++] = t;
nargv[i++] = "-i";
nargv[i++] = vmm_fd;
nargv[i++] = "-p";
nargv[i++] = vm_name;
if (env->vmd_debug)
nargv[i++] = "-d";
if (env->vmd_verbose == 1)
nargv[i++] = "-v";
else if (env->vmd_verbose > 1)
nargv[i++] = "-vv";
nargv[i++] = NULL;
if (i > sizeof(nargv) / sizeof(nargv[0]))
fatalx("%s: nargv overflow", __func__);
/* Control resumes in vmd.c:main(). */
execvp(nargv[0], nargv);
ret = errno;
log_warn("%s: failed to exec device", __func__);
_exit(ret);
/* NOTREACHED */
}
return (ret);
err:
close_fd(sync_fds[0]);
close_fd(sync_fds[1]);
close_fd(async_fds[0]);
close_fd(async_fds[1]);
return (ret);
}
/*
* Initialize an async imsg channel for a virtio device.
*/
int
vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *),
struct event_base *ev_base)
{
struct imsgev *iev = &dev->async_iev;
int fd = dev->async_fd;
log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
dev->dev_type, fd);
if (imsgbuf_init(&iev->ibuf, fd) == -1)
fatal("imsgbuf_init");
imsgbuf_allow_fdpass(&iev->ibuf);
iev->handler = cb;
iev->data = dev;
iev->events = EV_READ;
imsg_event_add2(iev, ev_base);
return (0);
}
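/*
 * Event handler for the async channel of a device subprocess: flush
 * pending writes, read incoming imsgs, and hand device messages to
 * handle_dev_msg(). If the pipe dies, tear down the event handler and
 * exit the event loop.
 */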
void
virtio_dispatch_dev(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev*)arg;
struct imsgev *iev = &dev->async_iev;
struct imsgbuf *ibuf = &iev->ibuf;
struct imsg imsg;
struct viodev_msg msg;
ssize_t n = 0;
if (event & EV_READ) {
if ((n = imsgbuf_read(ibuf)) == -1)
fatal("%s: imsgbuf_read", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_READ)", __func__);
event_del(&iev->ev);
event_loopexit(NULL);
return;
}
}
if (event & EV_WRITE) {
if (imsgbuf_write(ibuf) == -1) {
if (errno == EPIPE) {
/* this pipe is dead, remove the handler */
log_debug("%s: pipe dead (EV_WRITE)", __func__);
event_del(&iev->ev);
event_loopexit(NULL);
return;
}
fatal("%s: imsgbuf_write", __func__);
}
}
for (;;) {
if ((n = imsg_get(ibuf, &imsg)) == -1)
fatal("%s: imsg_get", __func__);
if (n == 0)
break;
switch (imsg.hdr.type) {
case IMSG_DEVOP_MSG:
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
handle_dev_msg(&msg, dev);
break;
default:
log_warnx("%s: got non devop imsg %d", __func__,
imsg.hdr.type);
break;
}
imsg_free(&imsg);
}
imsg_event_add(iev);
}
static int
handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
{
uint32_t vm_id = gdev->vm_id;
int irq = gdev->irq;
switch (msg->type) {
case VIODEV_MSG_KICK:
if (msg->state == INTR_STATE_ASSERT)
vcpu_assert_irq(vm_id, msg->vcpu, irq);
else if (msg->state == INTR_STATE_DEASSERT)
vcpu_deassert_irq(vm_id, msg->vcpu, irq);
break;
case VIODEV_MSG_READY:
log_debug("%s: device reports ready", __func__);
break;
case VIODEV_MSG_ERROR:
log_warnx("%s: device reported error", __func__);
break;
case VIODEV_MSG_INVALID:
case VIODEV_MSG_IO_READ:
case VIODEV_MSG_IO_WRITE:
/* FALLTHROUGH */
default:
log_warnx("%s: unsupported device message type %d", __func__,
msg->type);
return (1);
}
return (0);
}
/*
* Called by the VM process while processing IO from the VCPU thread.
*
* N.b. Since the VCPU thread calls this function, we cannot mutate the event
* system. All ipc messages must be sent manually and cannot be queued for
* the event loop to push them. (We need to perform a synchronous read, so
* this isn't really a big deal.)
*/
int
virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *cookie, uint8_t sz)
{
struct virtio_dev *dev = (struct virtio_dev *)cookie;
struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
struct imsg imsg;
struct viodev_msg msg;
int ret = 0;
memset(&msg, 0, sizeof(msg));
msg.reg = reg;
msg.io_sz = sz;
if (dir == 0) {
msg.type = VIODEV_MSG_IO_WRITE;
msg.data = *data;
msg.data_valid = 1;
} else
msg.type = VIODEV_MSG_IO_READ;
if (msg.type == VIODEV_MSG_IO_WRITE) {
/*
* Write request. No reply expected.
*/
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warn("%s: failed to send async io event to virtio"
" device", __func__);
return (ret);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush (write)", __func__);
return (-1);
}
} else {
/*
* Read request. Requires waiting for a reply.
*/
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed to send sync io event to virtio"
" device", __func__);
return (ret);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush (read)", __func__);
return (-1);
}
/* Read our reply. */
ret = imsgbuf_read_one(ibuf, &imsg);
if (ret == 0 || ret == -1) {
log_warn("%s: imsgbuf_read (n=%d)", __func__, ret);
return (-1);
}
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
imsg_free(&imsg);
if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
#if DEBUG
log_debug("%s: got sync read response (reg=%s)",
__func__, virtio_reg_name(msg.reg));
#endif /* DEBUG */
*data = msg.data;
/*
* It's possible we're asked to {de,}assert after the
* device performs a register read.
*/
if (msg.state == INTR_STATE_ASSERT)
vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq);
else if (msg.state == INTR_STATE_DEASSERT)
vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq);
} else {
log_warnx("%s: expected IO_READ, got %d", __func__,
msg.type);
return (-1);
}
}
return (0);
}
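/*
 * Queue an interrupt assert request for the given vcpu on the device's
 * async imsg channel.
 */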
void
virtio_assert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = vcpu;
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_ASSERT;
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}
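/* As above, but request that the irq be deasserted. */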
void
virtio_deassert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = vcpu;
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_DEASSERT;
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
}
/*
* Close all underlying file descriptors for a given virtio device.
*/
static int
virtio_dev_closefds(struct virtio_dev *dev)
{
size_t i;
switch (dev->dev_type) {
case VMD_DEVTYPE_DISK:
for (i = 0; i < dev->vioblk.ndisk_fd; i++) {
close_fd(dev->vioblk.disk_fd[i]);
dev->vioblk.disk_fd[i] = -1;
}
break;
case VMD_DEVTYPE_NET:
close_fd(dev->vionet.data_fd);
dev->vionet.data_fd = -1;
break;
default:
log_warnx("%s: invalid device type", __func__);
return (-1);
}
close_fd(dev->async_fd);
dev->async_fd = -1;
close_fd(dev->sync_fd);
dev->sync_fd = -1;
return (0);
}