src/usr.sbin/vmd/virtio.c

/* $OpenBSD: virtio.c,v 1.123 2025/01/08 15:46:10 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/param.h> /* PAGE_SIZE */
#include <sys/socket.h>
#include <sys/wait.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcidevs.h>
#include <dev/pv/virtioreg.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/vmm/vmm.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <errno.h>
#include <event.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "atomicio.h"
#include "pci.h"
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"
extern struct vmd *env;
extern char *__progname;
struct viornd_dev viornd;
struct vioscsi_dev *vioscsi;
struct vmmci_dev vmmci;
/* Devices emulated in subprocesses are inserted into this list. */
SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs;
#define MAXPHYS (64 * 1024) /* max raw I/O transfer size */
#define VIRTIO_NET_F_MAC (1<<5)
#define VMMCI_F_TIMESYNC (1<<0)
#define VMMCI_F_ACK (1<<1)
#define VMMCI_F_SYNCRTC (1<<2)
#define RXQ 0
#define TXQ 1
static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *);
static void virtio_dispatch_dev(int, short, void *);
static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *);
static int virtio_dev_closefds(struct virtio_dev *);
static void vmmci_pipe_dispatch(int, short, void *);
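/*
 * Return a printable name for a legacy virtio PCI configuration register,
 * used when logging guest register accesses.
 */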
const char *
virtio_reg_name(uint8_t reg)
{
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES: return "device feature";
case VIRTIO_CONFIG_GUEST_FEATURES: return "guest feature";
case VIRTIO_CONFIG_QUEUE_PFN: return "queue address";
case VIRTIO_CONFIG_QUEUE_SIZE: return "queue size";
case VIRTIO_CONFIG_QUEUE_SELECT: return "queue select";
case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify";
case VIRTIO_CONFIG_DEVICE_STATUS: return "device status";
case VIRTIO_CONFIG_ISR_STATUS: return "isr status";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI ... VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
return "device config 0";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
return "device config 1";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3";
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4";
default: return "unknown";
}
}
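/*
 * Compute the size in bytes of a legacy virtqueue with vq_size descriptors:
 * the descriptor table plus the available ring (padded to VIRTQUEUE_ALIGN),
 * followed by the used ring (also padded).
 */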
uint32_t
vring_size(uint32_t vq_size)
{
uint32_t allocsize1, allocsize2;
/* allocsize1: descriptor table + avail ring + pad */
allocsize1 = VIRTQUEUE_ALIGN(sizeof(struct vring_desc) * vq_size
+ sizeof(uint16_t) * (2 + vq_size));
/* allocsize2: used ring + pad */
allocsize2 = VIRTQUEUE_ALIGN(sizeof(uint16_t) * 2
+ sizeof(struct vring_used_elem) * vq_size);
return allocsize1 + allocsize2;
}
/* Update queue select */
void
viornd_update_qs(void)
{
struct virtio_vq_info *vq_info;
/* Invalid queue? */
if (viornd.cfg.queue_select > 0) {
viornd.cfg.queue_size = 0;
return;
}
vq_info = &viornd.vq[viornd.cfg.queue_select];
/* Update queue pfn/size based on queue select */
viornd.cfg.queue_pfn = vq_info->q_gpa >> 12;
viornd.cfg.queue_size = vq_info->qs;
}
/* Update queue address */
void
viornd_update_qa(void)
{
struct virtio_vq_info *vq_info;
void *hva = NULL;
/* Invalid queue? */
if (viornd.cfg.queue_select > 0)
return;
vq_info = &viornd.vq[viornd.cfg.queue_select];
vq_info->q_gpa = (uint64_t)viornd.cfg.queue_pfn * VIRTIO_PAGE_SIZE;
hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE));
if (hva == NULL)
fatalx("viornd_update_qa");
vq_info->q_hva = hva;
}
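/*
 * viornd_notifyq
 *
 * Handle a queue notification for the entropy device: take the next
 * available descriptor, fill the guest buffer with data from
 * arc4random_buf(3), and place the descriptor on the used ring.
 *
 * Returns 1 if the guest should be interrupted, 0 otherwise.
 */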
int
viornd_notifyq(void)
{
size_t sz;
int dxx, ret;
uint16_t aidx, uidx;
char *vr, *rnd_data;
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
struct virtio_vq_info *vq_info;
ret = 0;
/* Invalid queue? */
if (viornd.cfg.queue_notify > 0)
return (0);
vq_info = &viornd.vq[viornd.cfg.queue_notify];
vr = vq_info->q_hva;
if (vr == NULL)
fatalx("%s: null vring", __func__);
desc = (struct vring_desc *)(vr);
avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
aidx = avail->idx & VIORND_QUEUE_MASK;
uidx = used->idx & VIORND_QUEUE_MASK;
dxx = avail->ring[aidx] & VIORND_QUEUE_MASK;
sz = desc[dxx].len;
if (sz > MAXPHYS)
fatalx("viornd descriptor size too large (%zu)", sz);
rnd_data = malloc(sz);
if (rnd_data != NULL) {
arc4random_buf(rnd_data, sz);
if (write_mem(desc[dxx].addr, rnd_data, sz)) {
log_warnx("viornd: can't write random data @ "
"0x%llx",
desc[dxx].addr);
} else {
/* ret == 1 -> interrupt needed */
/* XXX check VIRTIO_F_NO_INTR */
ret = 1;
viornd.cfg.isr_status = 1;
used->ring[uidx].id = dxx;
used->ring[uidx].len = sz;
__sync_synchronize();
used->idx++;
}
free(rnd_data);
} else
fatal("memory allocation error for viornd data");
return (ret);
}
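/*
 * PCI BAR i/o handler for the virtio entropy device. dir == 0 indicates
 * a register write, anything else a register read. Runs in the vcpu
 * thread.
 */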
int
virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *unused, uint8_t sz)
{
*intr = 0xFF;
if (dir == 0) {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
case VIRTIO_CONFIG_QUEUE_SIZE:
case VIRTIO_CONFIG_ISR_STATUS:
log_warnx("%s: illegal write %x to %s",
__progname, *data, virtio_reg_name(reg));
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
viornd.cfg.guest_feature = *data;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
viornd.cfg.queue_pfn = *data;
viornd_update_qa();
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
viornd.cfg.queue_select = *data;
viornd_update_qs();
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
viornd.cfg.queue_notify = *data;
if (viornd_notifyq())
*intr = 1;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
viornd.cfg.device_status = *data;
break;
}
} else {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
*data = viornd.cfg.device_feature;
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
*data = viornd.cfg.guest_feature;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
*data = viornd.cfg.queue_pfn;
break;
case VIRTIO_CONFIG_QUEUE_SIZE:
*data = viornd.cfg.queue_size;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
*data = viornd.cfg.queue_select;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
*data = viornd.cfg.queue_notify;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
*data = viornd.cfg.device_status;
break;
case VIRTIO_CONFIG_ISR_STATUS:
*data = viornd.cfg.isr_status;
viornd.cfg.isr_status = 0;
vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq);
break;
}
}
return (0);
}
/*
* vmmci_ctl
*
* Inject a command into the vmmci device, potentially delivering an interrupt.
*
* Called by the vm process's event(3) loop.
*/
int
vmmci_ctl(unsigned int cmd)
{
int ret = 0;
struct timeval tv = { 0, 0 };
mutex_lock(&vmmci.mutex);
if ((vmmci.cfg.device_status &
VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) == 0) {
ret = -1;
goto unlock;
}
if (cmd == vmmci.cmd)
goto unlock;
switch (cmd) {
case VMMCI_NONE:
break;
case VMMCI_SHUTDOWN:
case VMMCI_REBOOT:
/* Update command */
vmmci.cmd = cmd;
/*
* vmm VMs do not support powerdown, send a reboot request
* instead and turn it off after the triple fault.
*/
if (cmd == VMMCI_SHUTDOWN)
cmd = VMMCI_REBOOT;
/* Trigger interrupt */
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
/* Add ACK timeout */
tv.tv_sec = VMMCI_TIMEOUT_SHORT;
evtimer_add(&vmmci.timeout, &tv);
break;
case VMMCI_SYNCRTC:
if (vmmci.cfg.guest_feature & VMMCI_F_SYNCRTC) {
/* RTC updated, request guest VM resync of its RTC */
vmmci.cmd = cmd;
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
} else {
log_debug("%s: RTC sync skipped (guest does not "
"support RTC sync)\n", __func__);
}
break;
default:
fatalx("invalid vmmci command: %d", cmd);
}
unlock:
mutex_unlock(&vmmci.mutex);
return (ret);
}
/*
* vmmci_ack
*
* Process a write to the command register.
*
* Called by the vcpu thread. Must be called with the mutex held.
*/
void
vmmci_ack(unsigned int cmd)
{
switch (cmd) {
case VMMCI_NONE:
break;
case VMMCI_SHUTDOWN:
/*
* The shutdown was requested by the VM if we don't have
* a pending shutdown request. In this case add a short
* timeout to give the VM a chance to reboot before the
* timer is expired.
*/
if (vmmci.cmd == 0) {
log_debug("%s: vm %u requested shutdown", __func__,
vmmci.vm_id);
vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_SHORT);
return;
}
/* FALLTHROUGH */
case VMMCI_REBOOT:
/*
* If the VM acknowledged our shutdown request, give it
* enough time to shutdown or reboot gracefully. This
* might take a considerable amount of time (running
* rc.shutdown on the VM), so increase the timeout before
* killing it forcefully.
*/
if (cmd == vmmci.cmd) {
log_debug("%s: vm %u acknowledged shutdown request",
__func__, vmmci.vm_id);
vm_pipe_send(&vmmci.dev_pipe, VMMCI_SET_TIMEOUT_LONG);
}
break;
case VMMCI_SYNCRTC:
log_debug("%s: vm %u acknowledged RTC sync request",
__func__, vmmci.vm_id);
vmmci.cmd = VMMCI_NONE;
break;
default:
log_warnx("%s: illegal request %u", __func__, cmd);
break;
}
}
void
vmmci_timeout(int fd, short type, void *arg)
{
log_debug("%s: vm %u shutdown", __progname, vmmci.vm_id);
vm_shutdown(vmmci.cmd == VMMCI_REBOOT ? VMMCI_REBOOT : VMMCI_SHUTDOWN);
}
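/*
 * PCI BAR i/o handler for the vmm control interface (vmmci). A write to
 * the first config register acknowledges the pending command via
 * vmmci_ack(); config reads return the current command and the host
 * clock split across 32-bit registers. Runs in the vcpu thread and takes
 * vmmci.mutex for the duration of the access.
 */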
int
vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *unused, uint8_t sz)
{
*intr = 0xFF;
mutex_lock(&vmmci.mutex);
if (dir == 0) {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
case VIRTIO_CONFIG_QUEUE_SIZE:
case VIRTIO_CONFIG_ISR_STATUS:
log_warnx("%s: illegal write %x to %s",
__progname, *data, virtio_reg_name(reg));
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
vmmci.cfg.guest_feature = *data;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
vmmci.cfg.queue_pfn = *data;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
vmmci.cfg.queue_select = *data;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
vmmci.cfg.queue_notify = *data;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
vmmci.cfg.device_status = *data;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
vmmci_ack(*data);
break;
}
} else {
switch (reg) {
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
*data = vmmci.cmd;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
/* Update time once when reading the first register */
gettimeofday(&vmmci.time, NULL);
*data = (uint64_t)vmmci.time.tv_sec;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8:
*data = (uint64_t)vmmci.time.tv_sec >> 32;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12:
*data = (uint64_t)vmmci.time.tv_usec;
break;
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16:
*data = (uint64_t)vmmci.time.tv_usec >> 32;
break;
case VIRTIO_CONFIG_DEVICE_FEATURES:
*data = vmmci.cfg.device_feature;
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
*data = vmmci.cfg.guest_feature;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
*data = vmmci.cfg.queue_pfn;
break;
case VIRTIO_CONFIG_QUEUE_SIZE:
*data = vmmci.cfg.queue_size;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
*data = vmmci.cfg.queue_select;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
*data = vmmci.cfg.queue_notify;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
*data = vmmci.cfg.device_status;
break;
case VIRTIO_CONFIG_ISR_STATUS:
*data = vmmci.cfg.isr_status;
vmmci.cfg.isr_status = 0;
vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq);
break;
}
}
mutex_unlock(&vmmci.mutex);
return (0);
}
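/*
 * Determine the base (backing) image path for a disk image. Raw images
 * have no base; qcow2 images may reference a backing file.
 */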
int
virtio_get_base(int fd, char *path, size_t npath, int type, const char *dpath)
{
switch (type) {
case VMDF_RAW:
return 0;
case VMDF_QCOW2:
return virtio_qcow2_get_base(fd, path, npath, dpath);
}
log_warnx("%s: invalid disk format", __func__);
return -1;
}
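/*
 * Handle messages sent over the vmmci device pipe by the vcpu thread
 * (see vmmci_ack), rearming the shutdown timeout with a short or long
 * expiry.
 */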
static void
vmmci_pipe_dispatch(int fd, short event, void *arg)
{
enum pipe_msg_type msg;
struct timeval tv = { 0, 0 };
msg = vm_pipe_recv(&vmmci.dev_pipe);
switch (msg) {
case VMMCI_SET_TIMEOUT_SHORT:
tv.tv_sec = VMMCI_TIMEOUT_SHORT;
evtimer_add(&vmmci.timeout, &tv);
break;
case VMMCI_SET_TIMEOUT_LONG:
tv.tv_sec = VMMCI_TIMEOUT_LONG;
evtimer_add(&vmmci.timeout, &tv);
break;
default:
log_warnx("%s: invalid pipe message type %d", __func__, msg);
}
}
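/*
 * virtio_init
 *
 * Create and configure the emulated virtio PCI devices for a new vm:
 * the entropy device, one vionet per nic, one vioblk per disk, an
 * optional vioscsi cdrom, and the vmm control interface. Network and
 * block devices are handed off to child processes via
 * virtio_dev_launch().
 */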
void
virtio_init(struct vmd_vm *vm, int child_cdrom,
int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct vm_create_params *vcp = &vmc->vmc_params;
struct virtio_dev *dev;
uint8_t id;
uint8_t i, j;
int ret = 0;
/* Virtio entropy device */
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_RNG, PCI_CLASS_SYSTEM,
PCI_SUBCLASS_SYSTEM_MISC,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_ENTROPY, 1, NULL)) {
log_warnx("%s: can't add PCI virtio rng device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_rnd_io, NULL)) {
log_warnx("%s: can't add bar for virtio rng device",
__progname);
return;
}
memset(&viornd, 0, sizeof(viornd));
viornd.vq[0].qs = VIORND_QUEUE_SIZE;
viornd.vq[0].vq_availoffset = sizeof(struct vring_desc) *
VIORND_QUEUE_SIZE;
viornd.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIORND_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIORND_QUEUE_SIZE));
viornd.pci_id = id;
viornd.irq = pci_get_dev_irq(id);
viornd.vm_id = vcp->vcp_id;
SLIST_INIT(&virtio_devs);
if (vmc->vmc_nnics > 0) {
for (i = 0; i < vmc->vmc_nnics; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vionet",
__progname);
return;
}
/* Virtio network */
dev->dev_type = VMD_DEVTYPE_NET;
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM,
PCI_SUBCLASS_SYSTEM_MISC, PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_NETWORK, 1, NULL)) {
log_warnx("%s: can't add PCI virtio net device",
__progname);
return;
}
dev->pci_id = id;
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(id);
/* The vionet pci bar function is called by the vcpu. */
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
dev)) {
log_warnx("%s: can't add bar for virtio net "
"device", __progname);
return;
}
dev->vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE;
dev->vionet.vq[RXQ].vq_availoffset =
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
dev->vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
dev->vionet.vq[RXQ].last_avail = 0;
dev->vionet.vq[RXQ].notified_avail = 0;
dev->vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE;
dev->vionet.vq[TXQ].vq_availoffset =
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE;
dev->vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIONET_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE));
dev->vionet.vq[TXQ].last_avail = 0;
dev->vionet.vq[TXQ].notified_avail = 0;
dev->vionet.data_fd = child_taps[i];
/* MAC address has been assigned by the parent */
memcpy(&dev->vionet.mac, &vmc->vmc_macs[i], 6);
dev->vionet.cfg.device_feature = VIRTIO_NET_F_MAC;
dev->vionet.lockedmac =
vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0;
dev->vionet.local =
vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0;
if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET)
dev->vionet.pxeboot = 1;
memcpy(&dev->vionet.local_prefix,
&env->vmd_cfg.cfg_localprefix,
sizeof(dev->vionet.local_prefix));
log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s",
__func__, vcp->vcp_name, i,
ether_ntoa((void *)dev->vionet.mac),
dev->vionet.lockedmac ? ", locked" : "",
dev->vionet.local ? ", local" : "",
dev->vionet.pxeboot ? ", pxeboot" : "");
/* Add the vionet to our device list. */
dev->vionet.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
}
if (vmc->vmc_ndisks > 0) {
for (i = 0; i < vmc->vmc_ndisks; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vioblk",
__progname);
return;
}
/* One vioblk device for each disk defined in vcp */
dev->dev_type = VMD_DEVTYPE_DISK;
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_BLOCK,
PCI_CLASS_MASS_STORAGE,
PCI_SUBCLASS_MASS_STORAGE_SCSI,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_BLOCK, 1, NULL)) {
log_warnx("%s: can't add PCI virtio block "
"device", __progname);
return;
}
dev->pci_id = id;
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(id);
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io,
dev)) {
log_warnx("%s: can't add bar for virtio block "
"device", __progname);
return;
}
dev->vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE;
dev->vioblk.vq[0].vq_availoffset =
sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE;
dev->vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE));
dev->vioblk.vq[0].last_avail = 0;
dev->vioblk.cfg.device_feature =
VIRTIO_BLK_F_SEG_MAX;
dev->vioblk.seg_max = VIOBLK_SEG_MAX;
/*
* Initialize disk fds to an invalid fd (-1), then
* set any child disk fds.
*/
memset(&dev->vioblk.disk_fd, -1,
sizeof(dev->vioblk.disk_fd));
dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
for (j = 0; j < dev->vioblk.ndisk_fd; j++)
dev->vioblk.disk_fd[j] = child_disks[i][j];
dev->vioblk.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
}
/*
* Launch virtio devices that support subprocess execution.
*/
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (virtio_dev_launch(vm, dev) != 0)
fatalx("failed to launch virtio device");
}
/* vioscsi cdrom */
if (strlen(vmc->vmc_cdrom)) {
vioscsi = calloc(1, sizeof(struct vioscsi_dev));
if (vioscsi == NULL) {
log_warn("%s: calloc failure allocating vioscsi",
__progname);
return;
}
if (pci_add_device(&id, PCI_VENDOR_QUMRANET,
PCI_PRODUCT_QUMRANET_VIO_SCSI,
PCI_CLASS_MASS_STORAGE,
PCI_SUBCLASS_MASS_STORAGE_SCSI,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_SCSI, 1, NULL)) {
log_warnx("%s: can't add PCI vioscsi device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vioscsi_io, vioscsi)) {
log_warnx("%s: can't add bar for vioscsi device",
__progname);
return;
}
for (i = 0; i < VIRTIO_MAX_QUEUES; i++) {
vioscsi->vq[i].qs = VIOSCSI_QUEUE_SIZE;
vioscsi->vq[i].vq_availoffset =
sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE;
vioscsi->vq[i].vq_usedoffset = VIRTQUEUE_ALIGN(
sizeof(struct vring_desc) * VIOSCSI_QUEUE_SIZE
+ sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE));
vioscsi->vq[i].last_avail = 0;
}
if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom,
1) == -1) {
log_warnx("%s: unable to determine iso format",
__func__);
return;
}
vioscsi->locked = 0;
vioscsi->lba = 0;
vioscsi->n_blocks = vioscsi->sz / VIOSCSI_BLOCK_SIZE_CDROM;
vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM;
vioscsi->pci_id = id;
vioscsi->vm_id = vcp->vcp_id;
vioscsi->irq = pci_get_dev_irq(id);
}
/* virtio control device */
if (pci_add_device(&id, PCI_VENDOR_OPENBSD,
PCI_PRODUCT_OPENBSD_CONTROL,
PCI_CLASS_COMMUNICATIONS,
PCI_SUBCLASS_COMMUNICATIONS_MISC,
PCI_VENDOR_OPENBSD,
PCI_PRODUCT_VIRTIO_VMMCI, 1, NULL)) {
log_warnx("%s: can't add PCI vmm control device",
__progname);
return;
}
if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, vmmci_io, NULL)) {
log_warnx("%s: can't add bar for vmm control device",
__progname);
return;
}
memset(&vmmci, 0, sizeof(vmmci));
vmmci.cfg.device_feature = VMMCI_F_TIMESYNC | VMMCI_F_ACK |
VMMCI_F_SYNCRTC;
vmmci.vm_id = vcp->vcp_id;
vmmci.irq = pci_get_dev_irq(id);
vmmci.pci_id = id;
ret = pthread_mutex_init(&vmmci.mutex, NULL);
if (ret) {
errno = ret;
fatal("could not initialize vmmci mutex");
}
evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
vm_pipe_init(&vmmci.dev_pipe, vmmci_pipe_dispatch);
event_add(&vmmci.dev_pipe.read_ev, NULL);
}
/*
* vionet_set_hostmac
*
* Sets the hardware address for the host-side tap(4) on a vionet_dev.
*
* This should only be called from the event-loop thread
*
* vm: pointer to the current vmd_vm instance
* idx: index into the array of vionet_dev's for the target vionet_dev
* addr: ethernet address to set
*/
void
vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct virtio_dev *dev;
struct vionet_dev *vionet = NULL;
int ret;
if (idx >= vmc->vmc_nnics)
fatalx("%s: invalid vionet index: %u", __func__, idx);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type == VMD_DEVTYPE_NET
&& dev->vionet.idx == idx) {
vionet = &dev->vionet;
break;
}
}
if (vionet == NULL)
fatalx("%s: dev == NULL, idx = %u", __func__, idx);
/* Set the local vm process copy. */
memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac));
/* Send the information to the device process. */
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1,
vionet->hostmac, sizeof(vionet->hostmac));
if (ret == -1) {
log_warnx("%s: failed to queue hostmac to vionet dev %u",
__func__, idx);
return;
}
}
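/*
 * virtio_shutdown
 *
 * Flush and close any vioscsi backing file, then synchronously tell each
 * subprocess-backed device to shut down and wait for it to exit.
 */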
void
virtio_shutdown(struct vmd_vm *vm)
{
int ret, status;
pid_t pid = 0;
struct virtio_dev *dev, *tmp;
struct viodev_msg msg;
struct imsgbuf *ibuf;
/* Ensure that our disks are synced. */
if (vioscsi != NULL)
vioscsi->file.close(vioscsi->file.p, 0);
/*
* Broadcast shutdown to child devices. We need to do this
* synchronously as we have already stopped the async event thread.
*/
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
memset(&msg, 0, sizeof(msg));
msg.type = VIODEV_MSG_SHUTDOWN;
ibuf = &dev->sync_iev.ibuf;
ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
fatalx("%s: failed to send shutdown to device",
__func__);
if (imsgbuf_flush(ibuf) == -1)
fatalx("%s: imsgbuf_flush", __func__);
}
/*
* Wait for all children to shutdown using a simple approach of
* iterating over known child devices and waiting for them to die.
*/
SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) {
log_debug("%s: waiting on device pid %d", __func__,
dev->dev_pid);
do {
pid = waitpid(dev->dev_pid, &status, WNOHANG);
} while (pid == 0 || (pid == -1 && errno == EINTR));
if (pid == dev->dev_pid)
log_debug("%s: device for pid %d is stopped",
__func__, pid);
else
log_warnx("%s: unexpected pid %d", __func__, pid);
free(dev);
}
}
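/*
 * The *_restore functions below are used when resuming a received vm:
 * each reads a previously dumped device structure from fd, re-registers
 * the PCI BAR handler, and re-derives process-local state (irq, vm id,
 * virtqueue host addresses, file descriptors).
 */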
int
vmmci_restore(int fd, uint32_t vm_id)
{
log_debug("%s: receiving vmmci", __func__);
if (atomicio(read, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
log_warnx("%s: error reading vmmci from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(vmmci.pci_id, 0, vmmci_io, NULL)) {
log_warnx("%s: can't set bar fn for vmm control device",
__progname);
return (-1);
}
vmmci.vm_id = vm_id;
vmmci.irq = pci_get_dev_irq(vmmci.pci_id);
memset(&vmmci.timeout, 0, sizeof(struct event));
evtimer_set(&vmmci.timeout, vmmci_timeout, NULL);
return (0);
}
int
viornd_restore(int fd, struct vmd_vm *vm)
{
void *hva = NULL;
log_debug("%s: receiving viornd", __func__);
if (atomicio(read, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
log_warnx("%s: error reading viornd from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(viornd.pci_id, 0, virtio_rnd_io, NULL)) {
log_warnx("%s: can't set bar fn for virtio rng device",
__progname);
return (-1);
}
viornd.vm_id = vm->vm_params.vmc_params.vcp_id;
viornd.irq = pci_get_dev_irq(viornd.pci_id);
hva = hvaddr_mem(viornd.vq[0].q_gpa, vring_size(VIORND_QUEUE_SIZE));
if (hva == NULL)
fatal("failed to restore viornd virtqueue");
viornd.vq[0].q_hva = hva;
return (0);
}
int
vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
{
struct vmop_create_params *vmc = &vm->vm_params;
struct vm_create_params *vcp = &vmc->vmc_params;
struct virtio_dev *dev;
uint8_t i;
if (vmc->vmc_nnics == 0)
return (0);
for (i = 0; i < vmc->vmc_nnics; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vionet",
__progname);
return (-1);
}
log_debug("%s: receiving virtio network device", __func__);
if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
!= sizeof(struct virtio_dev)) {
log_warnx("%s: error reading vionet from fd",
__func__);
return (-1);
}
/* Virtio network */
if (dev->dev_type != VMD_DEVTYPE_NET) {
log_warnx("%s: invalid device type", __func__);
return (-1);
}
dev->sync_fd = -1;
dev->async_fd = -1;
dev->vm_id = vcp->vcp_id;
dev->vm_vmid = vm->vm_vmid;
dev->irq = pci_get_dev_irq(dev->pci_id);
if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
log_warnx("%s: can't set bar fn for virtio net "
"device", __progname);
return (-1);
}
dev->vionet.data_fd = child_taps[i];
dev->vionet.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
return (0);
}
int
vioblk_restore(int fd, struct vmd_vm *vm,
int child_disks[][VM_MAX_BASE_PER_DISK])
{
struct vmop_create_params *vmc = &vm->vm_params;
struct virtio_dev *dev;
uint8_t i, j;
if (vmc->vmc_ndisks == 0)
return (0);
for (i = 0; i < vmc->vmc_ndisks; i++) {
dev = calloc(1, sizeof(struct virtio_dev));
if (dev == NULL) {
log_warn("%s: calloc failure allocating vioblks",
__progname);
return (-1);
}
log_debug("%s: receiving vioblk", __func__);
if (atomicio(read, fd, dev, sizeof(struct virtio_dev))
!= sizeof(struct virtio_dev)) {
log_warnx("%s: error reading vioblk from fd", __func__);
return (-1);
}
if (dev->dev_type != VMD_DEVTYPE_DISK) {
log_warnx("%s: invalid device type", __func__);
return (-1);
}
dev->sync_fd = -1;
dev->async_fd = -1;
if (pci_set_bar_fn(dev->pci_id, 0, virtio_pci_io, dev)) {
log_warnx("%s: can't set bar fn for virtio block "
"device", __progname);
return (-1);
}
dev->vm_id = vmc->vmc_params.vcp_id;
dev->irq = pci_get_dev_irq(dev->pci_id);
memset(&dev->vioblk.disk_fd, -1, sizeof(dev->vioblk.disk_fd));
dev->vioblk.ndisk_fd = vmc->vmc_diskbases[i];
for (j = 0; j < dev->vioblk.ndisk_fd; j++)
dev->vioblk.disk_fd[j] = child_disks[i][j];
dev->vioblk.idx = i;
SLIST_INSERT_HEAD(&virtio_devs, dev, dev_next);
}
return (0);
}
int
vioscsi_restore(int fd, struct vmd_vm *vm, int child_cdrom)
{
void *hva = NULL;
unsigned int i;
if (!strlen(vm->vm_params.vmc_cdrom))
return (0);
vioscsi = calloc(1, sizeof(struct vioscsi_dev));
if (vioscsi == NULL) {
log_warn("%s: calloc failure allocating vioscsi", __progname);
return (-1);
}
log_debug("%s: receiving vioscsi", __func__);
if (atomicio(read, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
sizeof(struct vioscsi_dev)) {
log_warnx("%s: error reading vioscsi from fd", __func__);
return (-1);
}
if (pci_set_bar_fn(vioscsi->pci_id, 0, vioscsi_io, vioscsi)) {
log_warnx("%s: can't set bar fn for vmm control device",
__progname);
return (-1);
}
vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id;
vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id);
/* vioscsi uses 3 virtqueues. */
for (i = 0; i < 3; i++) {
hva = hvaddr_mem(vioscsi->vq[i].q_gpa,
vring_size(VIOSCSI_QUEUE_SIZE));
if (hva == NULL)
fatal("failed to restore vioscsi virtqueue");
vioscsi->vq[i].q_hva = hva;
}
return (0);
}
int
virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
struct virtio_dev *dev;
int ret;
SLIST_INIT(&virtio_devs);
if ((ret = viornd_restore(fd, vm)) == -1)
return (ret);
if ((ret = vioblk_restore(fd, vm, child_disks)) == -1)
return (ret);
if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1)
return (ret);
if ((ret = vionet_restore(fd, vm, child_taps)) == -1)
return (ret);
if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1)
return (ret);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (virtio_dev_launch(vm, dev) != 0)
fatalx("%s: failed to restore virtio dev", __func__);
}
return (0);
}
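/*
 * The *_dump functions serialize device state to fd when sending a vm.
 * Host virtual addresses, file descriptors, and imsg state are cleared
 * first as they are only valid in this process and are rebuilt on
 * restore. vionet and vioblk state is fetched from the device child
 * processes with a VIODEV_MSG_DUMP request.
 */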
int
viornd_dump(int fd)
{
log_debug("%s: sending viornd", __func__);
viornd.vq[0].q_hva = NULL;
if (atomicio(vwrite, fd, &viornd, sizeof(viornd)) != sizeof(viornd)) {
log_warnx("%s: error writing viornd to fd", __func__);
return (-1);
}
return (0);
}
int
vmmci_dump(int fd)
{
log_debug("%s: sending vmmci", __func__);
if (atomicio(vwrite, fd, &vmmci, sizeof(vmmci)) != sizeof(vmmci)) {
log_warnx("%s: error writing vmmci to fd", __func__);
return (-1);
}
return (0);
}
int
vionet_dump(int fd)
{
struct virtio_dev *dev, temp;
struct viodev_msg msg;
struct imsg imsg;
struct imsgbuf *ibuf = NULL;
size_t sz;
int ret;
log_debug("%s: dumping vionet", __func__);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type != VMD_DEVTYPE_NET)
continue;
memset(&msg, 0, sizeof(msg));
memset(&imsg, 0, sizeof(imsg));
ibuf = &dev->sync_iev.ibuf;
msg.type = VIODEV_MSG_DUMP;
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed requesting dump of vionet[%d]",
__func__, dev->vionet.idx);
return (-1);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush", __func__);
return (-1);
}
sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
if (sz != sizeof(temp)) {
log_warnx("%s: failed to dump vionet[%d]", __func__,
dev->vionet.idx);
return (-1);
}
/* Clear volatile state. Will reinitialize on restore. */
temp.vionet.vq[RXQ].q_hva = NULL;
temp.vionet.vq[TXQ].q_hva = NULL;
temp.async_fd = -1;
temp.sync_fd = -1;
memset(&temp.async_iev, 0, sizeof(temp.async_iev));
memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
log_warnx("%s: error writing vionet to fd", __func__);
return (-1);
}
}
return (0);
}
int
vioblk_dump(int fd)
{
struct virtio_dev *dev, temp;
struct viodev_msg msg;
struct imsg imsg;
struct imsgbuf *ibuf = NULL;
size_t sz;
int ret;
log_debug("%s: dumping vioblk", __func__);
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
if (dev->dev_type != VMD_DEVTYPE_DISK)
continue;
memset(&msg, 0, sizeof(msg));
memset(&imsg, 0, sizeof(imsg));
ibuf = &dev->sync_iev.ibuf;
msg.type = VIODEV_MSG_DUMP;
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed requesting dump of vioblk[%d]",
__func__, dev->vioblk.idx);
return (-1);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush", __func__);
return (-1);
}
sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp));
if (sz != sizeof(temp)) {
log_warnx("%s: failed to dump vioblk[%d]", __func__,
dev->vioblk.idx);
return (-1);
}
/* Clear volatile state. Will reinitialize on restore. */
temp.vioblk.vq[0].q_hva = NULL;
temp.async_fd = -1;
temp.sync_fd = -1;
memset(&temp.async_iev, 0, sizeof(temp.async_iev));
memset(&temp.sync_iev, 0, sizeof(temp.sync_iev));
if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) {
log_warnx("%s: error writing vioblk to fd", __func__);
return (-1);
}
}
return (0);
}
int
vioscsi_dump(int fd)
{
unsigned int i;
if (vioscsi == NULL)
return (0);
log_debug("%s: sending vioscsi", __func__);
for (i = 0; i < 3; i++)
vioscsi->vq[i].q_hva = NULL;
if (atomicio(vwrite, fd, vioscsi, sizeof(struct vioscsi_dev)) !=
sizeof(struct vioscsi_dev)) {
log_warnx("%s: error writing vioscsi to fd", __func__);
return (-1);
}
return (0);
}
int
virtio_dump(int fd)
{
int ret;
if ((ret = viornd_dump(fd)) == -1)
return ret;
if ((ret = vioblk_dump(fd)) == -1)
return ret;
if ((ret = vioscsi_dump(fd)) == -1)
return ret;
if ((ret = vionet_dump(fd)) == -1)
return ret;
if ((ret = vmmci_dump(fd)) == -1)
return ret;
return (0);
}
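/*
 * Queue an imsg of the given type to every subprocess-backed virtio
 * device over its async channel.
 */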
void
virtio_broadcast_imsg(struct vmd_vm *vm, uint16_t type, void *data,
uint16_t datalen)
{
struct virtio_dev *dev;
int ret;
SLIST_FOREACH(dev, &virtio_devs, dev_next) {
ret = imsg_compose_event(&dev->async_iev, type, 0, 0, -1, data,
datalen);
if (ret == -1) {
log_warnx("%s: failed to broadcast imsg type %u",
__func__, type);
}
}
}
void
virtio_stop(struct vmd_vm *vm)
{
return virtio_broadcast_imsg(vm, IMSG_VMDOP_PAUSE_VM, NULL, 0);
}
void
virtio_start(struct vmd_vm *vm)
{
return virtio_broadcast_imsg(vm, IMSG_VMDOP_UNPAUSE_VM, NULL, 0);
}
/*
* Fork+exec a child virtio device. Returns 0 on success.
*/
static int
virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
{
char *nargv[12], num[32], vmm_fd[32], vm_name[VM_NAME_MAX], t[2];
pid_t dev_pid;
int sync_fds[2], async_fds[2], ret = 0;
size_t i, sz = 0;
struct viodev_msg msg;
struct virtio_dev *dev_entry;
struct imsg imsg;
struct imsgev *iev = &dev->sync_iev;
switch (dev->dev_type) {
case VMD_DEVTYPE_NET:
log_debug("%s: launching vionet%d",
vm->vm_params.vmc_params.vcp_name, dev->vionet.idx);
break;
case VMD_DEVTYPE_DISK:
log_debug("%s: launching vioblk%d",
vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx);
break;
default:
log_warnx("%s: invalid device type", __func__);
return (EINVAL);
/* NOTREACHED */
}
/* We need two channels: one synchronous (IO reads) and one async. */
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
sync_fds) == -1) {
log_warn("failed to create socketpair");
return (errno);
}
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC,
async_fds) == -1) {
log_warn("failed to create async socketpair");
return (errno);
}
/* Fork... */
dev_pid = fork();
if (dev_pid == -1) {
ret = errno;
log_warn("%s: fork failed", __func__);
goto err;
}
if (dev_pid > 0) {
/* Parent */
close_fd(sync_fds[1]);
close_fd(async_fds[1]);
/* Save the child's pid to help with cleanup. */
dev->dev_pid = dev_pid;
/* Set the channel fds to the child's before sending. */
dev->sync_fd = sync_fds[1];
dev->async_fd = async_fds[1];
/* 1. Send over our configured device. */
log_debug("%s: sending '%c' type device struct", __func__,
dev->dev_type);
sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev));
if (sz != sizeof(*dev)) {
log_warnx("%s: failed to send device", __func__);
ret = EIO;
goto err;
}
/* Close data fds. Only the child device needs them now. */
if (virtio_dev_closefds(dev) == -1) {
log_warnx("%s: failed to close device data fds",
__func__);
goto err;
}
/* 2. Send over details on the VM (including memory fds). */
log_debug("%s: sending vm message for '%s'", __func__,
vm->vm_params.vmc_params.vcp_name);
sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm));
if (sz != sizeof(*vm)) {
log_warnx("%s: failed to send vm details", __func__);
ret = EIO;
goto err;
}
/*
* Initialize our imsg channel to the child device. The initial
* communication will be synchronous. We expect the child to
* report itself "ready" to confirm the launch was a success.
*/
if (imsgbuf_init(&iev->ibuf, sync_fds[0]) == -1) {
log_warn("%s: failed to init imsgbuf", __func__);
goto err;
}
imsgbuf_allow_fdpass(&iev->ibuf);
ret = imsgbuf_read_one(&iev->ibuf, &imsg);
if (ret == 0 || ret == -1) {
log_warnx("%s: failed to receive ready message from "
"'%c' type device", __func__, dev->dev_type);
ret = EIO;
goto err;
}
ret = 0;
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
imsg_free(&imsg);
if (msg.type != VIODEV_MSG_READY) {
log_warnx("%s: expected ready message, got type %d",
__func__, msg.type);
ret = EINVAL;
goto err;
}
log_debug("%s: device reports ready via sync channel",
__func__);
/*
* Wire in the async event handling, but after reverting back
* to the parent's fd's.
*/
dev->sync_fd = sync_fds[0];
dev->async_fd = async_fds[0];
vm_device_pipe(dev, virtio_dispatch_dev, NULL);
} else {
/* Child */
close_fd(async_fds[0]);
close_fd(sync_fds[0]);
/* Close pty. Virtio devices do not need it. */
close_fd(vm->vm_tty);
vm->vm_tty = -1;
if (vm->vm_cdrom != -1) {
close_fd(vm->vm_cdrom);
vm->vm_cdrom = -1;
}
/* Keep data file descriptors open after exec. */
SLIST_FOREACH(dev_entry, &virtio_devs, dev_next) {
if (dev_entry == dev)
continue;
if (virtio_dev_closefds(dev_entry) == -1)
fatalx("unable to close other virtio devs");
}
memset(num, 0, sizeof(num));
snprintf(num, sizeof(num), "%d", sync_fds[1]);
memset(vmm_fd, 0, sizeof(vmm_fd));
snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
memset(vm_name, 0, sizeof(vm_name));
snprintf(vm_name, sizeof(vm_name), "%s",
vm->vm_params.vmc_params.vcp_name);
t[0] = dev->dev_type;
t[1] = '\0';
i = 0;
nargv[i++] = env->argv0;
nargv[i++] = "-X";
nargv[i++] = num;
nargv[i++] = "-t";
nargv[i++] = t;
nargv[i++] = "-i";
nargv[i++] = vmm_fd;
nargv[i++] = "-p";
nargv[i++] = vm_name;
if (env->vmd_debug)
nargv[i++] = "-d";
if (env->vmd_verbose == 1)
nargv[i++] = "-v";
else if (env->vmd_verbose > 1)
nargv[i++] = "-vv";
nargv[i++] = NULL;
if (i > sizeof(nargv) / sizeof(nargv[0]))
fatalx("%s: nargv overflow", __func__);
/* Control resumes in vmd.c:main(). */
execvp(nargv[0], nargv);
ret = errno;
log_warn("%s: failed to exec device", __func__);
_exit(ret);
/* NOTREACHED */
}
return (ret);
err:
close_fd(sync_fds[0]);
close_fd(sync_fds[1]);
close_fd(async_fds[0]);
close_fd(async_fds[1]);
return (ret);
}
/*
* Initialize an async imsg channel for a virtio device.
*/
int
vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *),
struct event_base *ev_base)
{
struct imsgev *iev = &dev->async_iev;
int fd = dev->async_fd;
log_debug("%s: initializing '%c' device pipe (fd=%d)", __func__,
dev->dev_type, fd);
if (imsgbuf_init(&iev->ibuf, fd) == -1)
fatal("imsgbuf_init");
imsgbuf_allow_fdpass(&iev->ibuf);
iev->handler = cb;
iev->data = dev;
iev->events = EV_READ;
imsg_event_add2(iev, ev_base);
return (0);
}
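/*
 * Event handler for the async channel of a device subprocess: flush
 * pending writes, read incoming imsgs, and hand device messages to
 * handle_dev_msg(). If the pipe dies, tear down the event handler and
 * exit the event loop.
 */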
void
virtio_dispatch_dev(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev*)arg;
struct imsgev *iev = &dev->async_iev;
struct imsgbuf *ibuf = &iev->ibuf;
struct imsg imsg;
struct viodev_msg msg;
ssize_t n = 0;
if (event & EV_READ) {
if ((n = imsgbuf_read(ibuf)) == -1)
fatal("%s: imsgbuf_read", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_READ)", __func__);
event_del(&iev->ev);
event_loopexit(NULL);
return;
}
}
if (event & EV_WRITE) {
if (imsgbuf_write(ibuf) == -1) {
if (errno == EPIPE) {
/* this pipe is dead, remove the handler */
log_debug("%s: pipe dead (EV_WRITE)", __func__);
event_del(&iev->ev);
event_loopexit(NULL);
return;
}
fatal("%s: imsgbuf_write", __func__);
}
}
for (;;) {
if ((n = imsg_get(ibuf, &imsg)) == -1)
fatal("%s: imsg_get", __func__);
if (n == 0)
break;
switch (imsg.hdr.type) {
case IMSG_DEVOP_MSG:
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
handle_dev_msg(&msg, dev);
break;
default:
log_warnx("%s: got non devop imsg %d", __func__,
imsg.hdr.type);
break;
}
imsg_free(&imsg);
}
imsg_event_add(iev);
}
static int
handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
{
uint32_t vm_id = gdev->vm_id;
int irq = gdev->irq;
switch (msg->type) {
case VIODEV_MSG_KICK:
if (msg->state == INTR_STATE_ASSERT)
vcpu_assert_irq(vm_id, msg->vcpu, irq);
else if (msg->state == INTR_STATE_DEASSERT)
vcpu_deassert_irq(vm_id, msg->vcpu, irq);
break;
case VIODEV_MSG_READY:
log_debug("%s: device reports ready", __func__);
break;
case VIODEV_MSG_ERROR:
log_warnx("%s: device reported error", __func__);
break;
case VIODEV_MSG_INVALID:
case VIODEV_MSG_IO_READ:
case VIODEV_MSG_IO_WRITE:
/* FALLTHROUGH */
default:
log_warnx("%s: unsupported device message type %d", __func__,
msg->type);
return (1);
}
return (0);
}
/*
* Called by the VM process while processing IO from the VCPU thread.
*
* N.b. Since the VCPU thread calls this function, we cannot mutate the event
* system. All ipc messages must be sent manually and cannot be queued for
* the event loop to push them. (We need to perform a synchronous read, so
* this isn't really a big deal.)
*/
int
virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
void *cookie, uint8_t sz)
{
struct virtio_dev *dev = (struct virtio_dev *)cookie;
struct imsgbuf *ibuf = &dev->sync_iev.ibuf;
struct imsg imsg;
struct viodev_msg msg;
int ret = 0;
memset(&msg, 0, sizeof(msg));
msg.reg = reg;
msg.io_sz = sz;
if (dir == 0) {
msg.type = VIODEV_MSG_IO_WRITE;
msg.data = *data;
msg.data_valid = 1;
} else
msg.type = VIODEV_MSG_IO_READ;
if (msg.type == VIODEV_MSG_IO_WRITE) {
/*
* Write request. No reply expected.
*/
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warn("%s: failed to send async io event to virtio"
" device", __func__);
return (ret);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush (write)", __func__);
return (-1);
}
} else {
/*
* Read request. Requires waiting for a reply.
*/
ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
if (ret == -1) {
log_warnx("%s: failed to send sync io event to virtio"
" device", __func__);
return (ret);
}
if (imsgbuf_flush(ibuf) == -1) {
log_warnx("%s: imsgbuf_flush (read)", __func__);
return (-1);
}
/* Read our reply. */
ret = imsgbuf_read_one(ibuf, &imsg);
if (ret == 0 || ret == -1) {
log_warn("%s: imsgbuf_read (n=%d)", __func__, ret);
return (-1);
}
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
imsg_free(&imsg);
if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) {
#if DEBUG
log_debug("%s: got sync read response (reg=%s)",
__func__, virtio_reg_name(msg.reg));
#endif /* DEBUG */
*data = msg.data;
/*
* It's possible we're asked to {de,}assert after the
* device performs a register read.
*/
if (msg.state == INTR_STATE_ASSERT)
vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq);
else if (msg.state == INTR_STATE_DEASSERT)
vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq);
} else {
log_warnx("%s: expected IO_READ, got %d", __func__,
msg.type);
return (-1);
}
}
return (0);
}
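/*
 * Queue an interrupt assert request for the given vcpu on the device's
 * async imsg channel.
 */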
void
virtio_assert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = vcpu;
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_ASSERT;
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}
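/* As above, but request that the irq be deasserted. */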
void
virtio_deassert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = vcpu;
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_DEASSERT;
ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg));
if (ret == -1)
log_warnx("%s: failed to deassert irq %d", __func__, dev->irq);
}
/*
* Close all underlying file descriptors for a given virtio device.
*/
static int
virtio_dev_closefds(struct virtio_dev *dev)
{
size_t i;
switch (dev->dev_type) {
case VMD_DEVTYPE_DISK:
for (i = 0; i < dev->vioblk.ndisk_fd; i++) {
close_fd(dev->vioblk.disk_fd[i]);
dev->vioblk.disk_fd[i] = -1;
}
break;
case VMD_DEVTYPE_NET:
close_fd(dev->vionet.data_fd);
dev->vionet.data_fd = -1;
break;
default:
log_warnx("%s: invalid device type", __func__);
return (-1);
}
close_fd(dev->async_fd);
dev->async_fd = -1;
close_fd(dev->sync_fd);
dev->sync_fd = -1;
return (0);
}