src/usr.sbin/vmd/vionet.c


/* $OpenBSD: vionet.c,v 1.13 2024/02/20 21:40:37 dv Exp $ */
/*
* Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/socket.h>
#include <sys/types.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/virtioreg.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "atomicio.h"
#include "virtio.h"
#include "vmd.h"
#define VIRTIO_NET_F_MAC (1 << 5)
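/* Legacy virtio-net queue indices: queue 0 is receive, queue 1 is transmit. */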
#define RXQ 0
#define TXQ 1
extern char *__progname;
extern struct vmd_vm *current_vm;
struct packet {
uint8_t *buf;
size_t len;
};
static void *rx_run_loop(void *);
static void *tx_run_loop(void *);
static int vionet_rx(struct vionet_dev *, int);
static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *,
int, size_t);
static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int,
const struct iovec *, int);
static void vionet_rx_event(int, short, void *);
static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *,
int8_t *);
static void handle_io_write(struct viodev_msg *, struct virtio_dev *);
static int vionet_tx(struct virtio_dev *);
static void vionet_notifyq(struct virtio_dev *);
static void dev_dispatch_vm(int, short, void *);
static void handle_sync_io(int, short, void *);
static void read_pipe_main(int, short, void *);
static void read_pipe_rx(int, short, void *);
static void read_pipe_tx(int, short, void *);
static void vionet_assert_pic_irq(struct virtio_dev *);
static void vionet_deassert_pic_irq(struct virtio_dev *);
/* Device Globals */
struct event ev_tap;
struct event ev_inject;
struct event_base *ev_base_main;
struct event_base *ev_base_rx;
struct event_base *ev_base_tx;
pthread_t rx_thread;
pthread_t tx_thread;
struct vm_dev_pipe pipe_main;
struct vm_dev_pipe pipe_rx;
struct vm_dev_pipe pipe_tx;
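/* Pipe used to inject host-generated packets (e.g. dhcp replies) into the rx path. */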
int pipe_inject[2];
#define READ 0
#define WRITE 1
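/*
* Per-thread scratch iovecs used to map descriptor chains: the rx thread
* uses iov_rx, the tx thread uses iov_tx.
*/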
struct iovec iov_rx[VIONET_QUEUE_SIZE];
struct iovec iov_tx[VIONET_QUEUE_SIZE];
pthread_rwlock_t lock = NULL; /* Guards device config state. */
/* Transient reset state used by the main thread to coordinate device reset. */
int resetting = 0;
__dead void
vionet_main(int fd, int fd_vmm)
{
struct virtio_dev dev;
struct vionet_dev *vionet = NULL;
struct viodev_msg msg;
struct vmd_vm vm;
struct vm_create_params *vcp;
ssize_t sz;
int ret;
/*
* stdio - needed for read/write to the tap fd and channels to the vm.
* vmm + proc - needed to create shared vm mappings.
*/
if (pledge("stdio vmm proc", NULL) == -1)
fatal("pledge");
/* Initialize iovec arrays. */
memset(iov_rx, 0, sizeof(iov_rx));
memset(iov_tx, 0, sizeof(iov_tx));
/* Receive our vionet_dev, mostly preconfigured. */
sz = atomicio(read, fd, &dev, sizeof(dev));
if (sz != sizeof(dev)) {
ret = errno;
log_warn("failed to receive vionet");
goto fail;
}
if (dev.dev_type != VMD_DEVTYPE_NET) {
ret = EINVAL;
log_warn("received invalid device type");
goto fail;
}
dev.sync_fd = fd;
vionet = &dev.vionet;
log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
dev.async_fd, fd_vmm);
/* Receive our vm information from the vm process. */
memset(&vm, 0, sizeof(vm));
sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
if (sz != sizeof(vm)) {
ret = EIO;
log_warnx("failed to receive vm details");
goto fail;
}
vcp = &vm.vm_params.vmc_params;
current_vm = &vm;
setproctitle("%s/vionet%d", vcp->vcp_name, vionet->idx);
log_procinit("vm/%s/vionet%d", vcp->vcp_name, vionet->idx);
/* Now that we have our vm information, we can remap memory. */
ret = remap_guest_mem(&vm, fd_vmm);
if (ret) {
fatal("%s: failed to remap", __func__);
goto fail;
}
/*
* We no longer need /dev/vmm access.
*/
close_fd(fd_vmm);
if (pledge("stdio", NULL) == -1)
fatal("pledge2");
/* If we're restoring hardware, re-initialize virtqueue hva's. */
if (vm.vm_state & VM_STATE_RECEIVED) {
struct virtio_vq_info *vq_info;
void *hva = NULL;
vq_info = &dev.vionet.vq[TXQ];
if (vq_info->q_gpa != 0) {
log_debug("%s: restoring TX virtqueue for gpa 0x%llx",
__func__, vq_info->q_gpa);
hva = hvaddr_mem(vq_info->q_gpa,
vring_size(VIONET_QUEUE_SIZE));
if (hva == NULL)
fatalx("%s: hva == NULL", __func__);
vq_info->q_hva = hva;
}
vq_info = &dev.vionet.vq[RXQ];
if (vq_info->q_gpa != 0) {
log_debug("%s: restoring RX virtqueue for gpa 0x%llx",
__func__, vq_info->q_gpa);
hva = hvaddr_mem(vq_info->q_gpa,
vring_size(VIONET_QUEUE_SIZE));
if (hva == NULL)
fatalx("%s: hva == NULL", __func__);
vq_info->q_hva = hva;
}
}
/* Initialize our packet injection pipe. */
if (pipe2(pipe_inject, O_NONBLOCK) == -1) {
log_warn("%s: injection pipe", __func__);
goto fail;
}
/* Initialize inter-thread communication channels. */
vm_pipe_init2(&pipe_main, read_pipe_main, &dev);
vm_pipe_init2(&pipe_rx, read_pipe_rx, &dev);
vm_pipe_init2(&pipe_tx, read_pipe_tx, &dev);
/* Initialize the RX and TX threads. */
ret = pthread_create(&rx_thread, NULL, rx_run_loop, &dev);
if (ret) {
errno = ret;
log_warn("%s: failed to initialize rx thread", __func__);
goto fail;
}
pthread_set_name_np(rx_thread, "rx");
ret = pthread_create(&tx_thread, NULL, tx_run_loop, &dev);
if (ret) {
errno = ret;
log_warn("%s: failed to initialize tx thread", __func__);
goto fail;
}
pthread_set_name_np(tx_thread, "tx");
/* Initialize our rwlock for guarding shared device state. */
ret = pthread_rwlock_init(&lock, NULL);
if (ret) {
errno = ret;
log_warn("%s: failed to initialize rwlock", __func__);
goto fail;
}
/* Initialize libevent so we can start wiring event handlers. */
ev_base_main = event_base_new();
/* Add our handler for receiving messages from the RX/TX threads. */
event_base_set(ev_base_main, &pipe_main.read_ev);
event_add(&pipe_main.read_ev, NULL);
/* Wire up an async imsg channel. */
log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
dev.async_fd);
if (vm_device_pipe(&dev, dev_dispatch_vm, ev_base_main)) {
ret = EIO;
log_warnx("vm_device_pipe");
goto fail;
}
/* Configure our sync channel event handler. */
log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
dev.sync_fd);
imsg_init(&dev.sync_iev.ibuf, dev.sync_fd);
dev.sync_iev.handler = handle_sync_io;
dev.sync_iev.data = &dev;
dev.sync_iev.events = EV_READ;
imsg_event_add2(&dev.sync_iev, ev_base_main);
/* Send a ready message over the sync channel. */
log_debug("%s: telling vm %s device is ready", __func__, vcp->vcp_name);
memset(&msg, 0, sizeof(msg));
msg.type = VIODEV_MSG_READY;
imsg_compose_event2(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg), ev_base_main);
/* Send a ready message over the async channel. */
log_debug("%s: sending async ready message", __func__);
ret = imsg_compose_event2(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg), ev_base_main);
if (ret == -1) {
log_warnx("%s: failed to send async ready message!", __func__);
goto fail;
}
/* Engage the event loop! */
ret = event_base_dispatch(ev_base_main);
event_base_free(ev_base_main);
/* Try stopping the rx & tx threads cleanly by messaging them. */
vm_pipe_send(&pipe_rx, VIRTIO_THREAD_STOP);
vm_pipe_send(&pipe_tx, VIRTIO_THREAD_STOP);
/* Wait for threads to stop. */
pthread_join(rx_thread, NULL);
pthread_join(tx_thread, NULL);
pthread_rwlock_destroy(&lock);
/* Cleanup */
if (ret == 0) {
close_fd(dev.sync_fd);
close_fd(dev.async_fd);
close_fd(vionet->data_fd);
close_fd(pipe_main.read);
close_fd(pipe_main.write);
close_fd(pipe_rx.write);
close_fd(pipe_tx.write);
close_fd(pipe_inject[READ]);
close_fd(pipe_inject[WRITE]);
_exit(ret);
/* NOTREACHED */
}
fail:
/* Try firing off a message to the vm saying we're dying. */
memset(&msg, 0, sizeof(msg));
msg.type = VIODEV_MSG_ERROR;
msg.data = ret;
imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg));
imsg_flush(&dev.sync_iev.ibuf);
close_fd(dev.sync_fd);
close_fd(dev.async_fd);
close_fd(pipe_inject[READ]);
close_fd(pipe_inject[WRITE]);
if (vionet != NULL)
close_fd(vionet->data_fd);
if (lock != NULL)
pthread_rwlock_destroy(&lock);
_exit(ret);
}
/*
* Update the gpa and hva of the virtqueue.
*/
static void
vionet_update_qa(struct vionet_dev *dev)
{
struct virtio_vq_info *vq_info;
void *hva = NULL;
/* Invalid queue? */
if (dev->cfg.queue_select > 1)
return;
vq_info = &dev->vq[dev->cfg.queue_select];
vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE;
dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
if (vq_info->q_gpa == 0) {
vq_info->q_hva = NULL;
return;
}
hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE));
if (hva == NULL)
fatalx("%s: hva == NULL", __func__);
vq_info->q_hva = hva;
}
/*
* Update the queue size.
*/
static void
vionet_update_qs(struct vionet_dev *dev)
{
struct virtio_vq_info *vq_info;
/* Invalid queue? */
if (dev->cfg.queue_select > 1) {
log_warnx("%s: !!! invalid queue selector %d", __func__,
dev->cfg.queue_select);
dev->cfg.queue_size = 0;
return;
}
vq_info = &dev->vq[dev->cfg.queue_select];
/* Update queue pfn/size based on queue select */
dev->cfg.queue_pfn = vq_info->q_gpa >> 12;
dev->cfg.queue_size = vq_info->qs;
}
/*
* vionet_rx
*
* Pull packets from the provided fd and fill the receive-side virtqueue. We
* selectively use zero-copy approaches when possible.
*
* Returns 1 if guest notification is needed. Otherwise, returns -1 on failure
* or 0 if no notification is needed.
*/
static int
vionet_rx(struct vionet_dev *dev, int fd)
{
uint16_t idx, hdr_idx;
char *vr = NULL;
size_t chain_len = 0, iov_cnt;
struct vring_desc *desc, *table;
struct vring_avail *avail;
struct vring_used *used;
struct virtio_vq_info *vq_info;
struct iovec *iov;
int notify = 0;
ssize_t sz;
uint8_t status = 0;
status = dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
log_warnx("%s: driver not ready", __func__);
return (0);
}
vq_info = &dev->vq[RXQ];
idx = vq_info->last_avail;
vr = vq_info->q_hva;
if (vr == NULL)
fatalx("%s: vr == NULL", __func__);
/* Compute offsets in ring of descriptors, avail ring, and used ring */
table = (struct vring_desc *)(vr);
avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
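/*
* Tell the driver it need not notify us when it posts new rx buffers;
* the tap and inject events drive this loop instead.
*/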
used->flags |= VRING_USED_F_NO_NOTIFY;
while (idx != avail->idx) {
hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
desc = &table[hdr_idx & VIONET_QUEUE_MASK];
if (!DESC_WRITABLE(desc)) {
log_warnx("%s: invalid descriptor state", __func__);
goto reset;
}
iov = &iov_rx[0];
iov_cnt = 1;
/*
* First descriptor should be at least as large as the
* virtio_net_hdr. It's not technically required, but in
* legacy devices it should be safe to assume.
*/
iov->iov_len = desc->len;
if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
log_warnx("%s: invalid descriptor length", __func__);
goto reset;
}
/*
* Insert the virtio_net_hdr and adjust len/base. We do the
* pointer math here before it's a void*.
*/
iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr));
/* Tweak the iovec to account for the virtio_net_hdr. */
iov->iov_len -= sizeof(struct virtio_net_hdr);
iov->iov_base = hvaddr_mem(desc->addr +
sizeof(struct virtio_net_hdr), iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
chain_len = iov->iov_len;
/*
* Walk the remaining chain and collect remaining addresses
* and lengths.
*/
while (desc->flags & VRING_DESC_F_NEXT) {
desc = &table[desc->next & VIONET_QUEUE_MASK];
if (!DESC_WRITABLE(desc)) {
log_warnx("%s: invalid descriptor state",
__func__);
goto reset;
}
/* Collect our IO information. Translate gpa's. */
iov = &iov_rx[iov_cnt];
iov->iov_len = desc->len;
iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
chain_len += iov->iov_len;
/* Guard against infinitely looping chains. */
if (++iov_cnt >= nitems(iov_rx)) {
log_warnx("%s: infinite chain detected",
__func__);
goto reset;
}
}
/* Make sure the driver gave us the bare minimum buffers. */
if (chain_len < VIONET_MIN_TXLEN) {
log_warnx("%s: insufficient buffers provided",
__func__);
goto reset;
}
/*
* If we're enforcing hardware address or handling an injected
* packet, we need to use a copy-based approach.
*/
if (dev->lockedmac || fd != dev->data_fd)
sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt,
chain_len);
else
sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt);
if (sz == -1)
goto reset;
if (sz == 0) /* No packets, so bail out for now. */
break;
/*
* Account for the prefixed header since it wasn't included
* in the copy or zerocopy operations.
*/
sz += sizeof(struct virtio_net_hdr);
/* Mark our buffers as used. */
used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
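/* Make the used entry visible before the guest sees the new used index. */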
__sync_synchronize();
used->idx++;
idx++;
}
if (idx != vq_info->last_avail &&
!(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
notify = 1;
}
vq_info->last_avail = idx;
return (notify);
reset:
return (-1);
}
/*
* vionet_rx_copy
*
* Read a packet off the provided file descriptor, validate its
* characteristics, and copy it into the buffers of the provided iovec array.
*
* It's assumed that the provided iovec array contains validated host virtual
* address translations and not guest physical addresses.
*
* Returns number of bytes copied on success, 0 if packet is dropped, and
* -1 on an error.
*/
ssize_t
vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov,
int iov_cnt, size_t chain_len)
{
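/* A static buffer is safe here: only the rx thread calls vionet_rx_copy(). */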
static uint8_t buf[VIONET_HARD_MTU];
struct packet *pkt = NULL;
struct ether_header *eh = NULL;
uint8_t *payload = buf;
size_t i, chunk, nbytes, copied = 0;
ssize_t sz;
/* If reading from the tap(4), try to right-size the read. */
if (fd == dev->data_fd)
nbytes = MIN(chain_len, VIONET_HARD_MTU);
else if (fd == pipe_inject[READ])
nbytes = sizeof(struct packet);
else {
log_warnx("%s: invalid fd: %d", __func__, fd);
return (-1);
}
/*
* Try to pull a packet. The fd should be non-blocking and we don't
* care if we under-read (i.e. sz != nbytes) as we may not have a
* packet large enough to fill the buffer.
*/
sz = read(fd, buf, nbytes);
if (sz == -1) {
if (errno != EAGAIN) {
log_warn("%s: error reading packet", __func__);
return (-1);
}
return (0);
} else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) {
/* If reading the tap(4), we should get valid ethernet. */
log_warnx("%s: invalid packet size", __func__);
return (0);
} else if (fd == pipe_inject[READ] && sz != sizeof(struct packet)) {
log_warnx("%s: invalid injected packet object (sz=%ld)",
__func__, sz);
return (0);
}
/* Decompose an injected packet, if that's what we're working with. */
if (fd == pipe_inject[READ]) {
pkt = (struct packet *)buf;
if (pkt->buf == NULL) {
log_warnx("%s: invalid injected packet, no buffer",
__func__);
return (0);
}
if (pkt->len < VIONET_MIN_TXLEN || pkt->len > VIONET_MAX_TXLEN) {
log_warnx("%s: invalid injected packet size", __func__);
goto drop;
}
payload = pkt->buf;
sz = (ssize_t)pkt->len;
}
/* Validate the ethernet header, if required. */
if (dev->lockedmac) {
eh = (struct ether_header *)(payload);
if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
memcmp(eh->ether_dhost, dev->mac,
sizeof(eh->ether_dhost)) != 0)
goto drop;
}
/* Truncate one last time to the chain length, if shorter. */
sz = MIN(chain_len, (size_t)sz);
/*
* Copy the packet into the provided buffers. We can use memcpy(3)
* here as the gpa was validated and translated to an hva previously.
*/
for (i = 0; (int)i < iov_cnt && (size_t)sz > copied; i++) {
chunk = MIN(iov[i].iov_len, (size_t)(sz - copied));
memcpy(iov[i].iov_base, payload + copied, chunk);
copied += chunk;
}
drop:
/* Free any injected packet buffer. */
if (pkt != NULL)
free(pkt->buf);
return (copied);
}
/*
* vionet_rx_zerocopy
*
* Perform a vectorized read from the given fd into the guest physical memory
* pointed to by iovecs.
*
* Returns number of bytes read on success, -1 on error, or 0 if EAGAIN was
* returned by readv.
*
*/
static ssize_t
vionet_rx_zerocopy(struct vionet_dev *dev, int fd, const struct iovec *iov,
int iov_cnt)
{
ssize_t sz;
if (dev->lockedmac) {
log_warnx("%s: zerocopy not available for locked lladdr",
__func__);
return (-1);
}
sz = readv(fd, iov, iov_cnt);
if (sz == -1 && errno == EAGAIN)
return (0);
return (sz);
}
/*
* vionet_rx_event
*
* Called when new data can be received on the tap fd of a vionet device.
*/
static void
vionet_rx_event(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev *)arg;
struct vionet_dev *vionet = &dev->vionet;
int ret = 0;
if (!(event & EV_READ))
fatalx("%s: invalid event type", __func__);
pthread_rwlock_rdlock(&lock);
ret = vionet_rx(vionet, fd);
pthread_rwlock_unlock(&lock);
if (ret == 0) {
/* Nothing to do. */
return;
}
pthread_rwlock_wrlock(&lock);
if (ret == 1) {
/* Notify the driver. */
vionet->cfg.isr_status |= 1;
} else {
/* Need a reset. Something went wrong. */
log_warnx("%s: requesting device reset", __func__);
vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
}
pthread_rwlock_unlock(&lock);
vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}
static void
vionet_notifyq(struct virtio_dev *dev)
{
struct vionet_dev *vionet = &dev->vionet;
switch (vionet->cfg.queue_notify) {
case RXQ:
vm_pipe_send(&pipe_rx, VIRTIO_NOTIFY);
break;
case TXQ:
vm_pipe_send(&pipe_tx, VIRTIO_NOTIFY);
break;
default:
/*
* Catch the unimplemented queue ID 2 (control queue) as
* well as any bogus queue IDs.
*/
log_debug("%s: notify for unimplemented queue ID %d",
__func__, vionet->cfg.queue_notify);
break;
}
}
static int
vionet_tx(struct virtio_dev *dev)
{
uint16_t idx, hdr_idx;
size_t chain_len, iov_cnt;
ssize_t dhcpsz = 0, sz;
int notify = 0;
char *vr = NULL, *dhcppkt = NULL;
struct vionet_dev *vionet = &dev->vionet;
struct vring_desc *desc, *table;
struct vring_avail *avail;
struct vring_used *used;
struct virtio_vq_info *vq_info;
struct ether_header *eh;
struct iovec *iov;
struct packet pkt;
uint8_t status = 0;
status = vionet->cfg.device_status
& VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
if (status != VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) {
log_warnx("%s: driver not ready", __func__);
return (0);
}
vq_info = &vionet->vq[TXQ];
idx = vq_info->last_avail;
vr = vq_info->q_hva;
if (vr == NULL)
fatalx("%s: vr == NULL", __func__);
/* Compute offsets in ring of descriptors, avail ring, and used ring */
table = (struct vring_desc *)(vr);
avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
used = (struct vring_used *)(vr + vq_info->vq_usedoffset);
while (idx != avail->idx) {
hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK];
desc = &table[hdr_idx & VIONET_QUEUE_MASK];
if (DESC_WRITABLE(desc)) {
log_warnx("%s: invalid descriptor state", __func__);
goto reset;
}
iov = &iov_tx[0];
iov_cnt = 0;
chain_len = 0;
/*
* As a legacy device, we most likely will receive a lead
* descriptor sized to the virtio_net_hdr. However, the framing
* is not guaranteed, so check for packet data.
*/
iov->iov_len = desc->len;
if (iov->iov_len < sizeof(struct virtio_net_hdr)) {
log_warnx("%s: invalid descriptor length", __func__);
goto reset;
} else if (iov->iov_len > sizeof(struct virtio_net_hdr)) {
/* Chop off the virtio header, leaving packet data. */
iov->iov_len -= sizeof(struct virtio_net_hdr);
chain_len += iov->iov_len;
iov->iov_base = hvaddr_mem(desc->addr +
sizeof(struct virtio_net_hdr), iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
iov_cnt++;
}
/*
* Walk the chain and collect remaining addresses and lengths.
*/
while (desc->flags & VRING_DESC_F_NEXT) {
desc = &table[desc->next & VIONET_QUEUE_MASK];
if (DESC_WRITABLE(desc)) {
log_warnx("%s: invalid descriptor state",
__func__);
goto reset;
}
/* Collect our IO information, translating gpa's. */
iov = &iov_tx[iov_cnt];
iov->iov_len = desc->len;
iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len);
if (iov->iov_base == NULL)
goto reset;
chain_len += iov->iov_len;
/* Guard against infinitely looping chains. */
if (++iov_cnt >= nitems(iov_tx)) {
log_warnx("%s: infinite chain detected",
__func__);
goto reset;
}
}
/* Check if we've got a minimum viable amount of data. */
if (chain_len < VIONET_MIN_TXLEN) {
sz = chain_len;
goto drop;
}
/*
* Packet inspection for ethernet header (if using a "local"
* interface) for possibility of a DHCP packet or (if using
* locked lladdr) for validating ethernet header.
*
* To help preserve zero-copy semantics, we require the first
* descriptor with packet data contains a large enough buffer
* for this inspection.
*/
iov = &iov_tx[0];
if (vionet->lockedmac) {
if (iov->iov_len < ETHER_HDR_LEN) {
log_warnx("%s: insufficient header data",
__func__);
goto drop;
}
eh = (struct ether_header *)iov->iov_base;
if (memcmp(eh->ether_shost, vionet->mac,
sizeof(eh->ether_shost)) != 0) {
log_warnx("%s: bad source address %s",
__func__, ether_ntoa((struct ether_addr *)
eh->ether_shost));
sz = chain_len;
goto drop;
}
}
if (vionet->local) {
dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len,
&dhcppkt);
if (dhcpsz > 0)
log_debug("%s: detected dhcp request of %zu bytes",
__func__, dhcpsz);
}
/* Write our packet to the tap(4). */
sz = writev(vionet->data_fd, iov_tx, iov_cnt);
if (sz == -1 && errno != ENOBUFS) {
log_warn("%s", __func__);
goto reset;
}
sz += sizeof(struct virtio_net_hdr);
drop:
used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx;
used->ring[used->idx & VIONET_QUEUE_MASK].len = sz;
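/* Publish the used entry before exposing the updated used index to the guest. */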
__sync_synchronize();
used->idx++;
idx++;
/* Facilitate DHCP reply injection, if needed. */
if (dhcpsz > 0) {
pkt.buf = dhcppkt;
pkt.len = dhcpsz;
sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt));
if (sz == -1 && errno != EAGAIN) {
log_warn("%s: packet injection", __func__);
free(pkt.buf);
} else if (sz == -1 && errno == EAGAIN) {
log_debug("%s: dropping dhcp reply", __func__);
free(pkt.buf);
} else if (sz != sizeof(pkt)) {
log_warnx("%s: failed packet injection",
__func__);
free(pkt.buf);
}
log_debug("%s: injected dhcp reply with %ld bytes",
__func__, sz);
}
}
if (idx != vq_info->last_avail &&
!(avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
notify = 1;
vq_info->last_avail = idx;
return (notify);
reset:
return (-1);
}
static void
dev_dispatch_vm(int fd, short event, void *arg)
{
struct virtio_dev *dev = arg;
struct vionet_dev *vionet = &dev->vionet;
struct imsgev *iev = &dev->async_iev;
struct imsgbuf *ibuf = &iev->ibuf;
struct imsg imsg;
ssize_t n = 0;
int verbose;
uint8_t status = 0;
if (dev == NULL)
fatalx("%s: missing vionet pointer", __func__);
if (event & EV_READ) {
if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
fatal("%s: imsg_read", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_READ)", __func__);
event_del(&iev->ev);
event_base_loopexit(ev_base_main, NULL);
return;
}
}
if (event & EV_WRITE) {
if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
fatal("%s: msgbuf_write", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_WRITE)", __func__);
event_del(&iev->ev);
event_base_loopexit(ev_base_main, NULL);
return;
}
}
for (;;) {
if ((n = imsg_get(ibuf, &imsg)) == -1)
fatal("%s: imsg_get", __func__);
if (n == 0)
break;
switch (imsg.hdr.type) {
case IMSG_DEVOP_HOSTMAC:
IMSG_SIZE_CHECK(&imsg, vionet->hostmac);
memcpy(vionet->hostmac, imsg.data,
sizeof(vionet->hostmac));
log_debug("%s: set hostmac", __func__);
break;
case IMSG_VMDOP_PAUSE_VM:
log_debug("%s: pausing", __func__);
vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
break;
case IMSG_VMDOP_UNPAUSE_VM:
log_debug("%s: unpausing", __func__);
pthread_rwlock_rdlock(&lock);
status = vionet->cfg.device_status &
VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK;
pthread_rwlock_unlock(&lock);
if (status)
vm_pipe_send(&pipe_rx, VIRTIO_THREAD_START);
break;
case IMSG_CTL_VERBOSE:
IMSG_SIZE_CHECK(&imsg, &verbose);
memcpy(&verbose, imsg.data, sizeof(verbose));
log_setverbose(verbose);
break;
}
imsg_free(&imsg);
}
imsg_event_add2(iev, ev_base_main);
}
/*
* Synchronous IO handler.
*
*/
static void
handle_sync_io(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev *)arg;
struct imsgev *iev = &dev->sync_iev;
struct imsgbuf *ibuf = &iev->ibuf;
struct viodev_msg msg;
struct imsg imsg;
ssize_t n;
int8_t intr = INTR_STATE_NOOP;
if (event & EV_READ) {
if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
fatal("%s: imsg_read", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_READ)", __func__);
event_del(&iev->ev);
event_base_loopexit(ev_base_main, NULL);
return;
}
}
if (event & EV_WRITE) {
if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
fatal("%s: msgbuf_write", __func__);
if (n == 0) {
/* this pipe is dead, so remove the event handler */
log_debug("%s: pipe dead (EV_WRITE)", __func__);
event_del(&iev->ev);
event_base_loopexit(ev_base_main, NULL);
return;
}
}
for (;;) {
if ((n = imsg_get(ibuf, &imsg)) == -1)
fatalx("%s: imsg_get (n=%ld)", __func__, n);
if (n == 0)
break;
/* Unpack our message. They ALL should be dev messages! */
IMSG_SIZE_CHECK(&imsg, &msg);
memcpy(&msg, imsg.data, sizeof(msg));
imsg_free(&imsg);
switch (msg.type) {
case VIODEV_MSG_DUMP:
/* Dump device */
n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev));
if (n != sizeof(*dev)) {
log_warnx("%s: failed to dump vionet device",
__func__);
break;
}
break;
case VIODEV_MSG_IO_READ:
/* Read IO: make sure to send a reply */
msg.data = handle_io_read(&msg, dev, &intr);
msg.data_valid = 1;
msg.state = intr;
imsg_compose_event2(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
sizeof(msg), ev_base_main);
break;
case VIODEV_MSG_IO_WRITE:
/* Write IO: no reply needed */
handle_io_write(&msg, dev);
break;
case VIODEV_MSG_SHUTDOWN:
event_del(&dev->sync_iev.ev);
event_base_loopbreak(ev_base_main);
return;
default:
fatalx("%s: invalid msg type %d", __func__, msg.type);
}
}
imsg_event_add2(iev, ev_base_main);
}
static void
handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
{
struct vionet_dev *vionet = &dev->vionet;
uint32_t data = msg->data;
int pause_devices = 0;
pthread_rwlock_wrlock(&lock);
switch (msg->reg) {
case VIRTIO_CONFIG_DEVICE_FEATURES:
case VIRTIO_CONFIG_QUEUE_SIZE:
case VIRTIO_CONFIG_ISR_STATUS:
log_warnx("%s: illegal write %x to %s", __progname, data,
virtio_reg_name(msg->reg));
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
vionet->cfg.guest_feature = data;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
vionet->cfg.queue_pfn = data;
vionet_update_qa(vionet);
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
vionet->cfg.queue_select = data;
vionet_update_qs(vionet);
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
vionet->cfg.queue_notify = data;
vionet_notifyq(dev);
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
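/* A status write of 0 is the driver requesting a device reset. */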
if (data == 0) {
resetting = 2; /* Wait on two acks: rx & tx */
pause_devices = 1;
} else {
// XXX is this correct?
vionet->cfg.device_status = data;
}
break;
}
pthread_rwlock_unlock(&lock);
if (pause_devices) {
vionet_deassert_pic_irq(dev);
vm_pipe_send(&pipe_rx, VIRTIO_THREAD_PAUSE);
vm_pipe_send(&pipe_tx, VIRTIO_THREAD_PAUSE);
}
}
static uint32_t
handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr)
{
struct vionet_dev *vionet = &dev->vionet;
uint32_t data;
pthread_rwlock_rdlock(&lock);
switch (msg->reg) {
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4:
case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5:
data = vionet->mac[msg->reg -
VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI];
break;
case VIRTIO_CONFIG_DEVICE_FEATURES:
data = vionet->cfg.device_feature;
break;
case VIRTIO_CONFIG_GUEST_FEATURES:
data = vionet->cfg.guest_feature;
break;
case VIRTIO_CONFIG_QUEUE_PFN:
data = vionet->cfg.queue_pfn;
break;
case VIRTIO_CONFIG_QUEUE_SIZE:
data = vionet->cfg.queue_size;
break;
case VIRTIO_CONFIG_QUEUE_SELECT:
data = vionet->cfg.queue_select;
break;
case VIRTIO_CONFIG_QUEUE_NOTIFY:
data = vionet->cfg.queue_notify;
break;
case VIRTIO_CONFIG_DEVICE_STATUS:
data = vionet->cfg.device_status;
break;
case VIRTIO_CONFIG_ISR_STATUS:
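/* The ISR register is read-to-clear, so upgrade to the write lock. */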
pthread_rwlock_unlock(&lock);
pthread_rwlock_wrlock(&lock);
data = vionet->cfg.isr_status;
vionet->cfg.isr_status = 0;
if (intr != NULL)
*intr = INTR_STATE_DEASSERT;
break;
default:
data = 0xFFFFFFFF;
}
pthread_rwlock_unlock(&lock);
return (data);
}
/*
* Handle the rx side processing, communicating to the main thread via pipe.
*/
static void *
rx_run_loop(void *arg)
{
struct virtio_dev *dev = (struct virtio_dev *)arg;
struct vionet_dev *vionet = &dev->vionet;
int ret;
ev_base_rx = event_base_new();
/* Wire up event handling for the tap fd. */
event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST,
vionet_rx_event, dev);
event_base_set(ev_base_rx, &ev_tap);
/* Wire up event handling for the packet injection pipe. */
event_set(&ev_inject, pipe_inject[READ], EV_READ | EV_PERSIST,
vionet_rx_event, dev);
event_base_set(ev_base_rx, &ev_inject);
/* Wire up event handling for our inter-thread communication channel. */
event_base_set(ev_base_rx, &pipe_rx.read_ev);
event_add(&pipe_rx.read_ev, NULL);
/* Begin our event loop with our channel event active. */
ret = event_base_dispatch(ev_base_rx);
event_base_free(ev_base_rx);
log_debug("%s: exiting (%d)", __func__, ret);
close_fd(pipe_rx.read);
close_fd(pipe_inject[READ]);
return (NULL);
}
/*
* Handle the tx side processing, communicating to the main thread via pipe.
*/
static void *
tx_run_loop(void *arg)
{
int ret;
ev_base_tx = event_base_new();
/* Wire up event handling for our inter-thread communication channel. */
event_base_set(ev_base_tx, &pipe_tx.read_ev);
event_add(&pipe_tx.read_ev, NULL);
/* Begin our event loop with our channel event active. */
ret = event_base_dispatch(ev_base_tx);
event_base_free(ev_base_tx);
log_debug("%s: exiting (%d)", __func__, ret);
close_fd(pipe_tx.read);
return (NULL);
}
/*
* Read events sent by the main thread to the rx thread.
*/
static void
read_pipe_rx(int fd, short event, void *arg)
{
enum pipe_msg_type msg;
if (!(event & EV_READ))
fatalx("%s: invalid event type", __func__);
msg = vm_pipe_recv(&pipe_rx);
switch (msg) {
case VIRTIO_NOTIFY:
case VIRTIO_THREAD_START:
event_add(&ev_tap, NULL);
event_add(&ev_inject, NULL);
break;
case VIRTIO_THREAD_PAUSE:
event_del(&ev_tap);
event_del(&ev_inject);
vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
break;
case VIRTIO_THREAD_STOP:
event_del(&ev_tap);
event_del(&ev_inject);
event_base_loopexit(ev_base_rx, NULL);
break;
default:
fatalx("%s: invalid channel message: %d", __func__, msg);
}
}
/*
* Read events sent by the main thread to the tx thread.
*/
static void
read_pipe_tx(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev*)arg;
struct vionet_dev *vionet = &dev->vionet;
enum pipe_msg_type msg;
int ret = 0;
if (!(event & EV_READ))
fatalx("%s: invalid event type", __func__);
msg = vm_pipe_recv(&pipe_tx);
switch (msg) {
case VIRTIO_NOTIFY:
pthread_rwlock_rdlock(&lock);
ret = vionet_tx(dev);
pthread_rwlock_unlock(&lock);
break;
case VIRTIO_THREAD_START:
/* Ignore Start messages. */
break;
case VIRTIO_THREAD_PAUSE:
/*
* Nothing to do when pausing on the tx side, but ACK so main
* thread knows we're not transmitting.
*/
vm_pipe_send(&pipe_main, VIRTIO_THREAD_ACK);
break;
case VIRTIO_THREAD_STOP:
event_base_loopexit(ev_base_tx, NULL);
break;
default:
fatalx("%s: invalid channel message: %d", __func__, msg);
}
if (ret == 0) {
/* No notification needed. Return early. */
return;
}
pthread_rwlock_wrlock(&lock);
if (ret == 1) {
/* Notify the driver. */
vionet->cfg.isr_status |= 1;
} else {
/* Need a reset. Something went wrong. */
log_warnx("%s: requesting device reset", __func__);
vionet->cfg.device_status |= DEVICE_NEEDS_RESET;
vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
}
pthread_rwlock_unlock(&lock);
vm_pipe_send(&pipe_main, VIRTIO_RAISE_IRQ);
}
/*
* Read events sent by the rx/tx threads to the main thread.
*/
static void
read_pipe_main(int fd, short event, void *arg)
{
struct virtio_dev *dev = (struct virtio_dev*)arg;
struct vionet_dev *vionet = &dev->vionet;
enum pipe_msg_type msg;
if (!(event & EV_READ))
fatalx("%s: invalid event type", __func__);
msg = vm_pipe_recv(&pipe_main);
switch (msg) {
case VIRTIO_RAISE_IRQ:
vionet_assert_pic_irq(dev);
break;
case VIRTIO_THREAD_ACK:
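/*
* Each worker thread acks its pause request; once both rx and tx
* have acked, finish the device reset.
*/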
resetting--;
if (resetting == 0) {
log_debug("%s: resetting virtio network device %d",
__func__, vionet->idx);
pthread_rwlock_wrlock(&lock);
vionet->cfg.device_status = 0;
vionet->cfg.guest_feature = 0;
vionet->cfg.queue_pfn = 0;
vionet_update_qa(vionet);
vionet->cfg.queue_size = 0;
vionet_update_qs(vionet);
vionet->cfg.queue_select = 0;
vionet->cfg.queue_notify = 0;
vionet->cfg.isr_status = 0;
vionet->vq[RXQ].last_avail = 0;
vionet->vq[RXQ].notified_avail = 0;
vionet->vq[TXQ].last_avail = 0;
vionet->vq[TXQ].notified_avail = 0;
pthread_rwlock_unlock(&lock);
}
break;
default:
fatalx("%s: invalid channel msg: %d", __func__, msg);
}
}
/*
* Message the vm process asking to raise the irq. Must be called from the main
* thread.
*/
static void
vionet_assert_pic_irq(struct virtio_dev *dev)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = 0; // XXX
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_ASSERT;
ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg), ev_base_main);
if (ret == -1)
log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}
/*
* Message the vm process asking to lower the irq. Must be called from the main
* thread.
*/
static void
vionet_deassert_pic_irq(struct virtio_dev *dev)
{
struct viodev_msg msg;
int ret;
memset(&msg, 0, sizeof(msg));
msg.irq = dev->irq;
msg.vcpu = 0; // XXX
msg.type = VIODEV_MSG_KICK;
msg.state = INTR_STATE_DEASSERT;
ret = imsg_compose_event2(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
&msg, sizeof(msg), ev_base_main);
if (ret == -1)
log_warnx("%s: failed to assert irq %d", __func__, dev->irq);
}