[PATCH 0/5] High-speed tun receive and xmit - Kernel

This is a discussion on [PATCH 0/5] High-speed tun receive and xmit - Kernel ; kvm (and lguest!) want to get more speed out of the tun device. We already have an ABI for guest<->host comms, called virtio_ring; extending tun to understand this (with its async nature and batching) makes for an efficient network. ...

+ Reply to Thread
Page 1 of 2 1 2 Last
Results 1 to 20 of 27

Thread: [PATCH 0/5] High-speed tun receive and xmit

  1. [PATCH 0/5] High-speed tun receive and xmit

    kvm (and lguest!) want to get more speed out of the tun device. We already
    have an ABI for guest<->host comms, called virtio_ring; extending tun to
    understand this (with its async nature and batching) makes for an efficient
    network.

    But moreover: the same things that make virtio a good guest<->host transport
    make it appealing as a userspace<->kernel transport. So rather than do
    something just for tun, we create a /dev/vring. We could do a system call,
    but /dev/vring is sufficient. It is a nice, simple format for normal tun
    usage too.

    We do need a small (backwards-compatible) extension to the ring format if
    we're going to do this however, since we didn't previously expose the
    consumer's view of the producer (and vice versa) in the ring because the
    other end doesn't need to see it. However, it means that a middleman (as the
    kvm or lguest launcher process now becomes) cannot save and restore all the
    ring state.

    Cheers,
    Rusty.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  2. [PATCH 1/5] virtio: put last_used and last_avail index into ring itself.

    Generally, the other end of the virtio ring doesn't need to see where
    you're up to in consuming the ring. However, in order for an external
    entity to understand it, it must be exposed. For example, if you want
    to save and restore a virtio_ring, but you're not the consumer because
    the kernel is using it directly.

    Fortunately, we have room to expand: the ring is always a whole number
    of pages and there's hundreds of bytes of padding after the avail ring
    and the used ring, whatever the number of descriptors (which must be a
    power of 2).

    We add a feature bit so the guest can tell the host that it's writing
    out the current value there, if it wants to use that.

    Signed-off-by: Rusty Russell
    ---
    drivers/virtio/virtio_ring.c | 20 ++++++++++++--------
    include/linux/virtio_config.h | 3 +++
    include/linux/virtio_ring.h | 12 +++++++++++-
    3 files changed, 26 insertions(+), 9 deletions(-)

    diff -r 1349b6a821c7 drivers/virtio/virtio_ring.c
    --- a/drivers/virtio/virtio_ring.c Thu Apr 17 05:58:09 2008 +1000
    +++ b/drivers/virtio/virtio_ring.c Thu Apr 17 06:15:26 2008 +1000
    @@ -18,6 +18,7 @@
    */
    #include
    #include
    +#include
    #include

    #ifdef DEBUG
    @@ -51,9 +52,6 @@ struct vring_virtqueue
    unsigned int free_head;
    /* Number we've added since last sync. */
    unsigned int num_added;
    -
    - /* Last used index we've seen. */
    - u16 last_used_idx;

    /* How to notify other side. FIXME: commonalize hcalls! */
    void (*notify)(struct virtqueue *vq);
    @@ -173,12 +171,13 @@ static void detach_buf(struct vring_virt

    static inline bool more_used(const struct vring_virtqueue *vq)
    {
    - return vq->last_used_idx != vq->vring.used->idx;
    + return vring_last_used(&vq->vring) != vq->vring.used->idx;
    }

    static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
    {
    struct vring_virtqueue *vq = to_vvq(_vq);
    + struct vring_used_elem *u;
    void *ret;
    unsigned int i;

    @@ -195,8 +194,11 @@ static void *vring_get_buf(struct virtqu
    return NULL;
    }

    - i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
    - *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
    + u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
    + i = u->id;
    + *len = u->len;
    + /* Make sure we don't reload i after doing checks. */
    + rmb();

    if (unlikely(i >= vq->vring.num)) {
    BAD_RING(vq, "id %u out of range\n", i);
    @@ -210,7 +212,7 @@ static void *vring_get_buf(struct virtqu
    /* detach_buf clears data, so grab it now. */
    ret = vq->data[i];
    detach_buf(vq, i);
    - vq->last_used_idx++;
    + vring_last_used(&vq->vring)++;
    END_USE(vq);
    return ret;
    }
    @@ -302,7 +304,6 @@ struct virtqueue *vring_new_virtqueue(un
    vq->vq.vq_ops = &vring_vq_ops;
    vq->notify = notify;
    vq->broken = false;
    - vq->last_used_idx = 0;
    vq->num_added = 0;
    #ifdef DEBUG
    vq->in_use = false;
    @@ -318,6 +319,9 @@ struct virtqueue *vring_new_virtqueue(un
    for (i = 0; i < num-1; i++)
    vq->vring.desc[i].next = i+1;

    + /* We now publish our last_used index in the rings. */
    + vdev->config->feature(vdev, VIRTIO_RING_F_PUBLISH_INDICES);
    +
    return &vq->vq;
    }
    EXPORT_SYMBOL_GPL(vring_new_virtqueue);
    diff -r 1349b6a821c7 include/linux/virtio_config.h
    --- a/include/linux/virtio_config.h Thu Apr 17 05:58:09 2008 +1000
    +++ b/include/linux/virtio_config.h Thu Apr 17 06:15:26 2008 +1000
    @@ -14,6 +14,9 @@
    #define VIRTIO_CONFIG_S_DRIVER_OK 4
    /* We've given up on this device. */
    #define VIRTIO_CONFIG_S_FAILED 0x80
    +
    +/* Transport virtio feature bits (currently bits 24 to 32 are reserved
    + * for this), the rest are per-device feature bits. */

    #ifdef __KERNEL__
    struct virtio_device;
    diff -r 1349b6a821c7 include/linux/virtio_ring.h
    --- a/include/linux/virtio_ring.h Thu Apr 17 05:58:09 2008 +1000
    +++ b/include/linux/virtio_ring.h Thu Apr 17 06:15:26 2008 +1000
    @@ -23,6 +23,9 @@
    * when you consume a buffer. It's unreliable, so it's simply an
    * optimization. */
    #define VRING_AVAIL_F_NO_INTERRUPT 1
    +
    +/* We publish our last-seen used index at the end of the avail ring. */
    +#define VIRTIO_RING_F_PUBLISH_INDICES 24

    /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
    struct vring_desc
    @@ -82,6 +85,7 @@ struct vring {
    * __u16 avail_flags;
    * __u16 avail_idx;
    * __u16 available[num];
    + * __u16 last_used_idx;
    *
    * // Padding to the next page boundary.
    * char pad[];
    @@ -90,6 +94,7 @@ struct vring {
    * __u16 used_flags;
    * __u16 used_idx;
    * struct vring_used_elem used[num];
    + * __u16 last_avail_idx;
    * };
    */
    static inline void vring_init(struct vring *vr, unsigned int num, void *p,
    @@ -106,8 +111,13 @@ static inline unsigned vring_size(unsign
    {
    return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
    + pagesize - 1) & ~(pagesize - 1))
    - + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
    + + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num + 2;
    }
    +
    +/* We publish the last-seen used index at the end of the available ring, and
    + * vice-versa. These are at the end for backwards compatibility. */
    +#define vring_last_used(vr) ((vr)->avail->ring[(vr)->num])
    +#define vring_last_avail(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])

    #ifdef __KERNEL__
    #include
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  3. [PATCH 5/5] tun: vringfd xmit support.

    This patch modifies tun to allow a vringfd to specify the send
    buffer. The user does a write to push out packets from the buffer.

    Again we use the 'struct virtio_net_hdr' to allow userspace to send
    GSO packets. In this case, it can hint how much to copy, and the
    other pages will be made into skb fragments.

    Signed-off-by: Rusty Russell
    ---
    drivers/net/tun.c | 410 +++++++++++++++++++++++++++++++++++++++++--------
    include/linux/if_tun.h | 1
    2 files changed, 351 insertions(+), 60 deletions(-)

    diff -r f797ec115d1b drivers/net/tun.c
    --- a/drivers/net/tun.c Fri Apr 18 05:58:40 2008 +1000
    +++ b/drivers/net/tun.c Fri Apr 18 06:07:21 2008 +1000
    @@ -65,6 +65,8 @@
    #include
    #include
    #include
    +#include
    +#include
    #include

    #include
    @@ -102,8 +104,8 @@ struct tun_struct {
    u32 chr_filter[2];
    u32 net_filter[2];

    - struct vring_info *inring;
    - struct file *infile;
    + struct vring_info *inring, *outring;
    + struct file *infile, *outfile;

    #ifdef TUN_DEBUG
    int debug;
    @@ -258,6 +261,169 @@ static void tun_net_init(struct net_devi
    dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */
    break;
    }
    +}
    +
    +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
    + * Users will learn not to do that. */
    +static int get_user_skb_frags(const struct iovec *iv, size_t len,
    + struct skb_frag_struct *f)
    +{
    + unsigned int i, j, num_pg = 0;
    + int err;
    + struct page *pages[MAX_SKB_FRAGS];
    +
    + down_read(&current->mm->mmap_sem);
    + while (len) {
    + int n, npages;
    + unsigned long base, len;
    + base = (unsigned long)iv->iov_base;
    + len = (unsigned long)iv->iov_len;
    +
    + if (len == 0) {
    + iv++;
    + continue;
    + }
    +
    + /* How many pages will this take? */
    + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
    + if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
    + err = -ENOSPC;
    + goto fail;
    + }
    + n = get_user_pages(current, current->mm, base, npages,
    + 0, 0, pages, NULL);
    + if (unlikely(n < 0)) {
    + err = n;
    + goto fail;
    + }
    +
    + /* Transfer pages to the frag array */
    + for (j = 0; j < n; j++) {
    + f[num_pg].page = pages[j];
    + if (j == 0) {
    + f[num_pg].page_offset = offset_in_page(base);
    + f[num_pg].size = min(len, PAGE_SIZE -
    + f[num_pg].page_offset);
    + } else {
    + f[num_pg].page_offset = 0;
    + f[num_pg].size = min(len, PAGE_SIZE);
    + }
    + len -= f[num_pg].size;
    + base += f[num_pg].size;
    + num_pg++;
    + }
    +
    + if (unlikely(n != npages)) {
    + err = -EFAULT;
    + goto fail;
    + }
    + }
    + up_read(&current->mm->mmap_sem);
    + return num_pg;
    +
    +fail:
    + for (i = 0; i < num_pg; i++)
    + put_page(f[i].page);
    + up_read(&current->mm->mmap_sem);
    + return err;
    +}
    +
    +/* We actually store this at the head of the skb. */
    +struct skb_tun_hdr {
    + struct list_head list;
    + struct tun_struct *tun;
    + unsigned int id;
    + unsigned int len;
    +};
    +
    +/* Get packet from user space buffer. copylen is a hint as to how
    + * much to copy (rest is pinned). */
    +static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
    + size_t copylen, size_t len)
    +{
    + struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
    + struct sk_buff *skb;
    + size_t align = 0, extra = 0;
    + int err;
    +
    + if (!(tun->flags & TUN_NO_PI)) {
    + if (len < sizeof(pi)) {
    + err = -EINVAL;
    + goto fail;
    + }
    + len -= sizeof(pi);
    +
    + if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
    + err = -EFAULT;
    + goto fail;
    + }
    + if (copylen > len)
    + copylen = len;
    + }
    +
    + if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
    + align = NET_IP_ALIGN;
    + if (unlikely(copylen < ETH_HLEN)) {
    + if (len < ETH_HLEN) {
    + err = -EINVAL;
    + goto fail;
    + }
    + copylen = ETH_HLEN;
    + }
    + }
    +
    + /* Allocate extra header if we need */
    + if (copylen != len)
    + extra = sizeof(struct skb_tun_hdr);
    +
    + skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
    + if (!skb) {
    + err = -ENOMEM;
    + goto fail;
    + }
    +
    + if (extra + align)
    + skb_reserve(skb, extra + align);
    +
    + if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
    + err = -EFAULT;
    + goto free_skb;
    + }
    +
    + switch (tun->flags & TUN_TYPE_MASK) {
    + case TUN_TUN_DEV:
    + skb_reset_mac_header(skb);
    + skb->protocol = pi.proto;
    + skb->dev = tun->dev;
    + break;
    + case TUN_TAP_DEV:
    + skb->protocol = eth_type_trans(skb, tun->dev);
    + break;
    + };
    +
    + if (tun->flags & TUN_NOCHECKSUM)
    + skb->ip_summed = CHECKSUM_UNNECESSARY;
    +
    + /* Anything left gets put into frags. */
    + if (extra) {
    + struct skb_shared_info *sinfo = skb_shinfo(skb);
    + int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
    + if (err < 0)
    + goto free_skb;
    + sinfo->nr_frags = err;
    + }
    + tun->dev->last_rx = jiffies;
    +
    + tun->dev->stats.rx_packets++;
    + tun->dev->stats.rx_bytes += len;
    +
    + return skb;
    +
    +free_skb:
    + kfree_skb(skb);
    +fail:
    + tun->dev->stats.rx_dropped++;
    + return ERR_PTR(err);
    }

    #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
    @@ -355,6 +521,132 @@ static struct vring_ops recvops = {
    .pull = pull_recv_skbs,
    };

    +static DEFINE_SPINLOCK(finished_lock);
    +static LIST_HEAD(shinfo_finished_list);
    +static struct task_struct *shinfo_finisher;
    +
    +static void used_buffer(struct skb_tun_hdr *tunh)
    +{
    + /* Woot, something happened. */
    + vring_wake(tunh->tun->outring);
    +
    + /* Release device. Keeping this reference blocks file close. */
    + dev_put(tunh->tun->dev);
    +
    + /* tunh == skb->head. */
    + kfree(tunh);
    +}
    +
    +static int do_shinfo_finisher(void *unused)
    +{
    + LIST_HEAD(list);
    + struct skb_tun_hdr *i;
    +
    + while (!kthread_should_stop()) {
    + set_current_state(TASK_INTERRUPTIBLE);
    +
    + spin_lock_irq(&finished_lock);
    + list_splice_init(&list, &shinfo_finished_list);
    + spin_unlock_irq(&finished_lock);
    +
    + if (list_empty(&list)) {
    + schedule();
    + continue;
    + }
    +
    + list_for_each_entry(i, &list, list) {
    + vring_used_buffer(i->tun->outring, i->id, i->len);
    + used_buffer(i);
    + }
    + }
    + return 0;
    +}
    +
    +/* We are done with this skb data: put it in the used pile. */
    +static void shinfo_finished(struct skb_shared_info *sinfo)
    +{
    + struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
    + unsigned long flags;
    +
    + spin_lock_irqsave(&finished_lock, flags);
    + list_add(&tunh->list, &shinfo_finished_list);
    + spin_unlock_irqrestore(&finished_lock, flags);
    +
    + wake_up_process(shinfo_finisher);
    +}
    +
    +static int xmit_packets(void *_tun)
    +{
    + struct tun_struct *tun = _tun;
    + struct iovec iov[1+MAX_SKB_FRAGS];
    + unsigned int iovnum = ARRAY_SIZE(iov);
    + int id, err, wake = 0;
    + unsigned long len;
    +
    + while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
    + iov, &iovnum, &len)) > 0) {
    + struct virtio_net_hdr h;
    + struct sk_buff *skb;
    + struct skb_shared_info *shinfo;
    +
    + if (unlikely(len < sizeof(h)))
    + return -EINVAL;
    +
    + err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
    + if (unlikely(err))
    + return -EFAULT;
    +
    + len -= sizeof(h);
    + if (h.hdr_len > len)
    + return -EINVAL;
    +
    + /* Without GSO, we copy entire packet. */
    + if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
    + h.hdr_len = len;
    +
    + skb = get_user_skb(tun, iov, h.hdr_len, len);
    + if (IS_ERR(skb))
    + return PTR_ERR(skb);
    +
    + if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
    + !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
    + kfree_skb(skb);
    + return -EINVAL;
    + }
    +
    + /* If it has fragments, set up destructor for later. */
    + shinfo = skb_shinfo(skb);
    + if (skb_shinfo(skb)->nr_frags) {
    + struct skb_tun_hdr *tunh = (void *)skb->head;
    + shinfo->destructor = shinfo_finished;
    + tunh->id = id;
    + tunh->len = sizeof(h) + skb->len;
    + } else {
    + vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
    + wake = 1;
    + }
    + netif_rx_ni(skb);
    + }
    +
    + if (wake)
    + vring_wake(tun->outring);
    +
    + /* 0 or error. */
    + return id;
    +}
    +
    +static struct vring_ops xmitops = {
    + .push = xmit_packets,
    +};
    +
    +static int init_vring(void)
    +{
    + shinfo_finisher = kthread_run(do_shinfo_finisher, NULL, "tun");
    + if (IS_ERR(shinfo_finisher))
    + return PTR_ERR(shinfo_finisher);
    + return 0;
    +}
    +
    static int set_recv_vring(struct tun_struct *tun, int fd)
    {
    int err;
    @@ -391,9 +685,47 @@ static void unset_vrings(struct tun_stru
    vring_unset_ops(tun->inring);
    fput(tun->infile);
    }
    + if (tun->outring) {
    + vring_unset_ops(tun->outring);
    + fput(tun->outfile);
    + }
    +}
    +
    +static int set_xmit_vring(struct tun_struct *tun, int fd)
    +{
    + int err;
    +
    + if (tun->outring)
    + return -EBUSY;
    +
    + tun->outfile = fget(fd);
    + if (!tun->outfile)
    + return -EBADF;
    +
    + tun->outring = vring_get(tun->outfile);
    + if (!tun->outring) {
    + err = -EBADF;
    + goto put;
    + }
    +
    + err = vring_set_ops(tun->outring, &xmitops, tun);
    + if (err) {
    + tun->outring = NULL;
    + goto put;
    + }
    + return 0;
    +
    +put:
    + fput(tun->outfile);
    + tun->outfile = NULL;
    + return err;
    }
    #else /* ... !CONFIG_VRING */
    static int set_recv_vring(struct tun_struct *tun, int fd)
    +{
    + return -ENOTTY;
    +}
    +static int set_xmit_vring(struct tun_struct *tun, int fd)
    {
    return -ENOTTY;
    }
    @@ -424,74 +756,26 @@ static unsigned int tun_chr_poll(struct
    return mask;
    }

    -/* Get packet from user space buffer */
    -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
    -{
    - struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
    - struct sk_buff *skb;
    - size_t len = count, align = 0;
    -
    - if (!(tun->flags & TUN_NO_PI)) {
    - if ((len -= sizeof(pi)) > count)
    - return -EINVAL;
    -
    - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
    - return -EFAULT;
    - }
    -
    - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
    - align = NET_IP_ALIGN;
    - if (unlikely(len < ETH_HLEN))
    - return -EINVAL;
    - }
    -
    - if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
    - tun->dev->stats.rx_dropped++;
    - return -ENOMEM;
    - }
    -
    - if (align)
    - skb_reserve(skb, align);
    - if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
    - tun->dev->stats.rx_dropped++;
    - kfree_skb(skb);
    - return -EFAULT;
    - }
    -
    - switch (tun->flags & TUN_TYPE_MASK) {
    - case TUN_TUN_DEV:
    - skb_reset_mac_header(skb);
    - skb->protocol = pi.proto;
    - skb->dev = tun->dev;
    - break;
    - case TUN_TAP_DEV:
    - skb->protocol = eth_type_trans(skb, tun->dev);
    - break;
    - };
    -
    - if (tun->flags & TUN_NOCHECKSUM)
    - skb->ip_summed = CHECKSUM_UNNECESSARY;
    -
    - netif_rx_ni(skb);
    - tun->dev->last_rx = jiffies;
    -
    - tun->dev->stats.rx_packets++;
    - tun->dev->stats.rx_bytes += len;
    -
    - return count;
    -}
    -
    static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
    unsigned long count, loff_t pos)
    {
    struct tun_struct *tun = iocb->ki_filp->private_data;
    + size_t len;
    + struct sk_buff *skb;

    if (!tun)
    return -EBADFD;

    DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);

    - return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
    + len = iov_length(iv, count);
    +
    + skb = get_user_skb(tun, (struct iovec *)iv, len, len);
    + if (IS_ERR(skb))
    + return PTR_ERR(skb);
    +
    + netif_rx_ni(skb);
    + return len;
    }

    /* Put packet to the user space buffer */
    @@ -831,6 +1115,9 @@ static int tun_chr_ioctl(struct inode *i
    case TUNSETRECVVRING:
    return set_recv_vring(tun, arg);

    + case TUNSETXMITVRING:
    + return set_xmit_vring(tun, arg);
    +
    case SIOCGIFFLAGS:
    ifr.ifr_flags = tun->if_flags;
    if (copy_to_user( argp, &ifr, sizeof ifr))
    @@ -1078,6 +1365,12 @@ static int __init tun_init(void)
    ret = misc_register(&tun_miscdev);
    if (ret)
    printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
    + else {
    + ret = init_vring();
    + if (ret)
    + misc_deregister(&tun_miscdev);
    + }
    +
    return ret;
    }

    diff -r f797ec115d1b include/linux/if_tun.h
    --- a/include/linux/if_tun.h Fri Apr 18 05:58:40 2008 +1000
    +++ b/include/linux/if_tun.h Fri Apr 18 06:07:21 2008 +1000
    @@ -43,6 +43,7 @@
    #define TUNSETLINK _IOW('T', 205, int)
    #define TUNSETGROUP _IOW('T', 206, int)
    #define TUNSETRECVVRING _IOW('T', 207, int)
    +#define TUNSETXMITVRING _IOW('T', 208, int)

    /* TUNSETIFF ifr flags */
    #define IFF_TUN 0x0001
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  4. [PATCH 3/5] /dev/vring limit and base ioctls

    It turns out the lguest (and possibly kvm) want the addresses in the
    ring buffer to only cover a certain part of memory, and be offset.

    It makes sense that this be an ioctl.

    Signed-off-by: Rusty Russell
    ---
    Documentation/ioctl-number.txt | 3 +--
    drivers/char/vring.c | 37 +++++++++++++++++++++++++++++++++++--
    include/linux/vring.h | 9 +++++++++
    3 files changed, 45 insertions(+), 4 deletions(-)

    diff -r caeeb48d478a Documentation/ioctl-number.txt
    --- a/Documentation/ioctl-number.txt Fri Apr 18 13:35:17 2008 +1000
    +++ b/Documentation/ioctl-number.txt Fri Apr 18 13:36:21 2008 +1000
    @@ -181,8 +181,7 @@ 0xA3 90-9F linux/dtlk.h
    0xA3 90-9F linux/dtlk.h
    0xAB 00-1F linux/nbd.h
    0xAC 00-1F linux/raw.h
    -0xAD 00 Netfilter device in development:
    -
    +0xAD 00-01 linux/vring.h
    0xB0 all RATIO devices in development:

    0xB1 00-1F PPPoX
    diff -r caeeb48d478a drivers/char/vring.c
    --- a/drivers/char/vring.c Fri Apr 18 13:35:17 2008 +1000
    +++ b/drivers/char/vring.c Fri Apr 18 13:36:21 2008 +1000
    @@ -32,6 +32,8 @@ struct vring_info {
    struct vring ring;
    u16 mask;
    u16 last_used;
    +
    + unsigned long base, limit;

    const struct vring_ops *ops;
    void *ops_data;
    @@ -163,6 +165,26 @@ static int vring_open(struct inode *in,

    init_waitqueue_head(&vr->poll_wait);
    mutex_init(&vr->lock);
    + vr->limit = -1UL;
    + vr->base = 0;
    + return 0;
    +}
    +
    +static int vring_ioctl(struct inode *in, struct file *filp,
    + unsigned int cmd, unsigned long arg)
    +{
    + struct vring_info *vr = filp->private_data;
    +
    + switch (cmd) {
    + case VRINGSETBASE:
    + vr->base = arg;
    + break;
    + case VRINGSETLIMIT:
    + vr->limit = arg;
    + break;
    + default:
    + return -ENOTTY;
    + }
    return 0;
    }

    @@ -173,6 +195,7 @@ static const struct file_operations vrin
    .read = vring_read,
    .write = vring_write,
    .poll = vring_poll,
    + .ioctl = vring_ioctl,
    };

    /**
    @@ -220,6 +243,8 @@ int vring_get_buffer(struct vring_info *

    i = head;
    do {
    + void __user *base;
    +
    if (unlikely(i >= vr->ring.num)) {
    pr_debug("vring: bad index: %u\n", i);
    return -EINVAL;
    @@ -227,6 +252,14 @@ int vring_get_buffer(struct vring_info *

    if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
    return -EFAULT;
    +
    + if (d.addr + d.len > vr->limit || (d.addr + d.len < d.addr)) {
    + pr_debug("vring: bad addr/len: %u@%p\n",
    + d.len, (void *)(unsigned long)d.addr);
    + return -EINVAL;
    + }
    +
    + base = (void __user *)(unsigned long)d.addr + vr->base;

    if (d.flags & VRING_DESC_F_WRITE) {
    /* Check for length and iovec overflows */
    @@ -239,7 +272,7 @@ int vring_get_buffer(struct vring_info *
    return -E2BIG;
    in_iov[in].iov_len = d.len;
    *in_len += d.len;
    - in_iov[in].iov_base = (void __user *)(long)d.addr;
    + in_iov[in].iov_base = base;
    in++;
    } else {
    if (!num_out) {
    @@ -251,7 +284,7 @@ int vring_get_buffer(struct vring_info *
    return -E2BIG;
    out_iov[out].iov_len = d.len;
    *out_len += d.len;
    - out_iov[out].iov_base = (void __user *)(long)d.addr;
    + out_iov[out].iov_base = base;
    out++;
    }

    diff -r caeeb48d478a include/linux/vring.h
    --- a/include/linux/vring.h Fri Apr 18 13:35:17 2008 +1000
    +++ b/include/linux/vring.h Fri Apr 18 13:36:21 2008 +1000
    @@ -18,6 +18,13 @@
    */
    #ifndef _LINUX_VRING_H
    #define _LINUX_VRING_H
    +#include
    +
    +/* Ioctl defines. */
    +#define VRINGSETBASE _IO(0xAD, 0)
    +#define VRINGSETLIMIT _IO(0xAD, 1)
    +
    +#ifdef __KERNEL__

    /**
    * vring_ops - operations for a vring fd.
    @@ -55,4 +62,6 @@ void vring_used_buffer(struct vring_info
    void vring_used_buffer(struct vring_info *vr, int id, u32 len);

    void vring_wake(struct vring_info *vr);
    +#endif /* __KERNEL__ */
    +
    #endif /* _LINUX_VRING_H */
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  5. [PATCH 4/5] tun: vringfd receive support.

    This patch modifies tun to allow a vringfd to specify the receive
    buffer. Because we can't copy to userspace in bh context, we queue
    like normal then use the "pull" hook to actually do the copy.

    We use struct virtio_net_hdr prepended to packets in the ring to allow
    userspace to receive GSO packets in future (at the moment, the tun
    driver doesn't tell the stack it can handle them, so these cases are
    never taken). This will need to be something that userspace tells us
    it can handle.

    Signed-off-by: Rusty Russell
    ---
    drivers/net/Kconfig | 2
    drivers/net/tun.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++
    include/linux/if_tun.h | 1
    3 files changed, 162 insertions(+)

    diff -r 9bafcef88e1b drivers/net/Kconfig
    --- a/drivers/net/Kconfig Fri Apr 18 05:54:45 2008 +1000
    +++ b/drivers/net/Kconfig Fri Apr 18 05:58:40 2008 +1000
    @@ -120,6 +120,8 @@ config TUN
    config TUN
    tristate "Universal TUN/TAP device driver support"
    select CRC32
    +# If no VRING at all, that's fine, but if it's a module, we must be, too.
    + depends on !VRING || VRING
    ---help---
    TUN/TAP provides packet reception and transmission for user space
    programs. It can be viewed as a simple Point-to-Point or Ethernet
    diff -r 9bafcef88e1b drivers/net/tun.c
    --- a/drivers/net/tun.c Fri Apr 18 05:54:45 2008 +1000
    +++ b/drivers/net/tun.c Fri Apr 18 05:58:40 2008 +1000
    @@ -62,6 +62,9 @@
    #include
    #include
    #include
    +#include
    +#include
    +#include
    #include

    #include
    @@ -98,6 +101,9 @@ struct tun_struct {
    u8 dev_addr[ETH_ALEN];
    u32 chr_filter[2];
    u32 net_filter[2];
    +
    + struct vring_info *inring;
    + struct file *infile;

    #ifdef TUN_DEBUG
    int debug;
    @@ -158,6 +164,10 @@ static int tun_net_xmit(struct sk_buff *
    /* Notify and wake up reader process */
    if (tun->flags & TUN_FASYNC)
    kill_fasync(&tun->fasync, SIGIO, POLL_IN);
    +
    + if (tun->inring)
    + vring_wake(tun->inring);
    +
    wake_up_interruptible(&tun->read_wait);
    return 0;

    @@ -249,6 +259,149 @@ static void tun_net_init(struct net_devi
    break;
    }
    }
    +
    +#if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
    +/* Returns whether there are queued buffers */
    +static bool pending_recv_skbs(void *_tun)
    +{
    + struct tun_struct *tun = _tun;
    +
    + return !skb_queue_empty(&tun->readq);
    +}
    +
    +/* Returns 0, or negative errno. */
    +static int pull_recv_skbs(void *_tun)
    +{
    + struct tun_struct *tun = _tun;
    + int err = 0, num_copied = 0;
    + struct sk_buff *skb;
    +
    + while ((skb = skb_dequeue(&tun->readq)) != NULL) {
    + struct iovec iov[1+MAX_SKB_FRAGS];
    + struct virtio_net_hdr gso = { 0 }; /* no info leak */
    + unsigned int iovnum = ARRAY_SIZE(iov);
    + unsigned long len;
    + int id;
    +
    + id = vring_get_buffer(tun->inring, iov, &iovnum, &len,
    + NULL, NULL, NULL);
    + if (id <= 0) {
    + err = id;
    + break;
    + }
    +
    + /* FIXME: we could stash this descriptor and go looking for a
    + * better-sized one. That would allow them to mix different
    + * buffer sizes for efficiency. */
    + if (unlikely(len < sizeof(gso) + skb->len)) {
    + tun->dev->stats.tx_aborted_errors++;
    + err = -ENOBUFS; /* PS. You suck! */
    + break;
    + }
    +
    + if (skb_is_gso(skb)) {
    + struct skb_shared_info *sinfo = skb_shinfo(skb);
    +
    + /* This is a hint as to how much should be linear. */
    + gso.hdr_len = skb_headlen(skb);
    + gso.gso_size = sinfo->gso_size;
    + if (sinfo->gso_type & SKB_GSO_TCPV4)
    + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
    + else if (sinfo->gso_type & SKB_GSO_TCPV6)
    + gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
    + else if (sinfo->gso_type & SKB_GSO_UDP)
    + gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
    + else
    + BUG();
    + if (sinfo->gso_type & SKB_GSO_TCP_ECN)
    + gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
    + } else
    + gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
    +
    + if (skb->ip_summed == CHECKSUM_PARTIAL) {
    + gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
    + gso.csum_start = skb->csum_start - skb_headroom(skb);
    + gso.csum_offset = skb->csum_offset;
    + } /* else everything is zero */
    +
    + err = memcpy_toiovec(iov, (void *)&gso, sizeof(gso));
    + if (unlikely(err)) {
    + tun->dev->stats.tx_fifo_errors++;
    + break;
    + }
    +
    + err = skb_copy_datagram_iovec(skb, 0, iov, skb->len);
    + if (unlikely(err)) {
    + tun->dev->stats.tx_fifo_errors++;
    + break;
    + }
    +
    + vring_used_buffer(tun->inring, id, sizeof(gso) + skb->len);
    + num_copied++;
    + }
    +
    + /* We took an skb, but ring isn't ready for it. Put it back */
    + if (skb)
    + skb_queue_head(&tun->readq, skb);
    +
    + if (num_copied)
    + netif_wake_queue(tun->dev);
    +
    + return err;
    +}
    +
    +static struct vring_ops recvops = {
    + .needs_pull = pending_recv_skbs,
    + .pull = pull_recv_skbs,
    +};
    +
    +static int set_recv_vring(struct tun_struct *tun, int fd)
    +{
    + int err;
    +
    + if (tun->inring)
    + return -EBUSY;
    +
    + tun->infile = fget(fd);
    + if (!tun->infile)
    + return -EBADF;
    +
    + tun->inring = vring_get(tun->infile);
    + if (!tun->inring) {
    + err = -EBADF;
    + goto put;
    + }
    +
    + err = vring_set_ops(tun->inring, &recvops, tun);
    + if (err) {
    + tun->inring = NULL;
    + goto put;
    + }
    + return 0;
    +
    +put:
    + fput(tun->infile);
    + tun->infile = NULL;
    + return err;
    +}
    +
    +static void unset_vrings(struct tun_struct *tun)
    +{
    + if (tun->inring) {
    + vring_unset_ops(tun->inring);
    + fput(tun->infile);
    + }
    +}
    +#else /* ... !CONFIG_VRING */
    +static int set_recv_vring(struct tun_struct *tun, int fd)
    +{
    + return -ENOTTY;
    +}
    +
    +static void unset_vrings(struct tun_struct *tun)
    +{
    +}
    +#endif

    /* Character device part */

    @@ -465,6 +618,7 @@ static void tun_setup(struct net_device

    tun->owner = -1;
    tun->group = -1;
    + tun->inring = NULL;

    dev->open = tun_net_open;
    dev->hard_start_xmit = tun_net_xmit;
    @@ -674,6 +828,9 @@ static int tun_chr_ioctl(struct inode *i
    break;
    #endif

    + case TUNSETRECVVRING:
    + return set_recv_vring(tun, arg);
    +
    case SIOCGIFFLAGS:
    ifr.ifr_flags = tun->if_flags;
    if (copy_to_user( argp, &ifr, sizeof ifr))
    @@ -784,6 +941,8 @@ static int tun_chr_close(struct inode *i
    DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);

    tun_chr_fasync(-1, file, 0);
    +
    + unset_vrings(tun);

    rtnl_lock();

    diff -r 9bafcef88e1b include/linux/if_tun.h
    --- a/include/linux/if_tun.h Fri Apr 18 05:54:45 2008 +1000
    +++ b/include/linux/if_tun.h Fri Apr 18 05:58:40 2008 +1000
    @@ -42,6 +42,7 @@
    #define TUNSETOWNER _IOW('T', 204, int)
    #define TUNSETLINK _IOW('T', 205, int)
    #define TUNSETGROUP _IOW('T', 206, int)
    +#define TUNSETRECVVRING _IOW('T', 207, int)

    /* TUNSETIFF ifr flags */
    #define IFF_TUN 0x0001
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  6. [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    virtio introduced a ring structure ABI for guest-host communications
    (currently used by lguest and kvm). Using this same ABI, we can
    create a nice fd version.

    This is useful for efficiently passing packets to and from the tun,
    for example.

    Signed-off-by: Rusty Russell
    ---
    drivers/char/Kconfig | 9 +
    drivers/char/Makefile | 2
    drivers/char/vring.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++++
    include/linux/vring.h | 58 +++++++
    4 files changed, 469 insertions(+)

    diff -r b2d9869d338f drivers/char/Kconfig
    --- a/drivers/char/Kconfig Fri Apr 18 10:33:58 2008 +1000
    +++ b/drivers/char/Kconfig Fri Apr 18 13:35:16 2008 +1000
    @@ -1049,5 +1049,14 @@ config DEVPORT

    source "drivers/s390/char/Kconfig"

    +config VRING
    + tristate "/dev/vring support (EXPERIMENTAL)"
    + depends on EXPERIMENTAL
    + help
    + vring is a ringbuffer implementation for efficient I/O. It is
    + currently used by virtualization hosts (lguest, kvm) for efficient
    + networking using the tun driver.
    +
    + If unsure, say N, but there's a part of you that wants to say M.
    endmenu

    diff -r b2d9869d338f drivers/char/Makefile
    --- a/drivers/char/Makefile Fri Apr 18 10:33:58 2008 +1000
    +++ b/drivers/char/Makefile Fri Apr 18 13:35:16 2008 +1000
    @@ -112,6 +112,8 @@ obj-$(CONFIG_JS_RTC) += js-rtc.o
    obj-$(CONFIG_JS_RTC) += js-rtc.o
    js-rtc-y = rtc.o

    +obj-$(CONFIG_VRING) += vring.o
    +
    # Files generated that shall be removed upon make clean
    clean-files := consolemap_deftbl.c defkeymap.c

    diff -r b2d9869d338f drivers/char/vring.c
    --- /dev/null Thu Jan 01 00:00:00 1970 +0000
    +++ b/drivers/char/vring.c Fri Apr 18 13:35:16 2008 +1000
    @@ -0,0 +1,400 @@
    +/* Ring-buffer device implementation.
    + *
    + * Copyright 2008 Rusty Russell IBM Corporation
    + *
    + * This program is free software; you can redistribute it and/or modify
    + * it under the terms of the GNU General Public License as published by
    + * the Free Software Foundation; either version 2 of the License, or
    + * (at your option) any later version.
    + *
    + * This program is distributed in the hope that it will be useful,
    + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    + * GNU General Public License for more details.
    + *
    + * You should have received a copy of the GNU General Public License
    + * along with this program; if not, write to the Free Software
    + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
    + */
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +#include
    +
    +struct vring_info {
    + struct mutex lock;
    +
    + struct vring ring;
    + u16 mask;
    + u16 last_used;
    +
    + const struct vring_ops *ops;
    + void *ops_data;
    +
    + /* Waitqueue for poll() */
    + wait_queue_head_t poll_wait;
    +};
    +
    +static unsigned int vring_poll(struct file *filp,
    + struct poll_table_struct *poll)
    +{
    + struct vring_info *vr = filp->private_data;
    + unsigned int mask;
    + u16 used = 0;
    +
    + /* Poll can't error, so let's not go silly here. */
    + get_user(used, &vr->ring.used->idx);
    +
    + /* More buffers have been used? It's 'readable'. */
    + if (used != vr->last_used)
    + mask = POLLIN | POLLRDNORM;
    + else {
    + mask = 0;
    + /* If we need to pull, it's also readable. */
    + mutex_lock(&vr->lock);
    + if (vr->ops && vr->ops->needs_pull) {
    + if (vr->ops->needs_pull(vr->ops_data))
    + mask = POLLIN | POLLRDNORM;
    + }
    + mutex_unlock(&vr->lock);
    + }
    +
    + poll_wait(filp, &vr->poll_wait, poll);
    +
    + return mask;
    +}
    +
    +/* Read may not be necessary for all use cases, in fact. */
    +static ssize_t vring_read(struct file *filp, char __user *buf,
    + size_t size, loff_t *off)
    +{
    + struct vring_info *vr = filp->private_data;
    + int err;
    +
    + /* Some uses of vrings require updating in user context. This
    + * is best done close to the caller, ie. here. */
    + mutex_lock(&vr->lock);
    + if (vr->ops && vr->ops->pull)
    + err = vr->ops->pull(vr->ops_data);
    + else
    + err = 0;
    + mutex_unlock(&vr->lock);
    +
    + /* Update our last_used value to clear the poll. */
    + if (!err)
    + err = get_user(vr->last_used, &vr->ring.used->idx);
    +
    + return err;
    +}
    +
    +/* Write kicks the other end to say we have buffers. */
    +static ssize_t vring_write(struct file *filp, const char __user *buf,
    + size_t size, loff_t *off)
    +{
    + struct vring_info *vr = filp->private_data;
    + int err;
    +
    + mutex_lock(&vr->lock);
    + if (vr->ops && vr->ops->push)
    + err = vr->ops->push(vr->ops_data);
    + else
    + err = 0;
    + mutex_unlock(&vr->lock);
    +
    + return err;
    +}
    +
    +/* We assume anyone attached holds a reference, so this won't mess them up */
    +static int vring_release(struct inode *inode, struct file *filp)
    +{
    + struct vring_info *vr = filp->private_data;
    +
    + kfree(vr);
    + return 0;
    +}
    +
    +static int vring_mmap(struct file *filp, struct vm_area_struct *vma)
    +{
    + unsigned long size, num_descs;
    + struct vring_info *vr = filp->private_data;
    + int err;
    +
    + /* We overload mmap's offset to hold the ring number. */
    + num_descs = vma->vm_pgoff;
    +
    + /* Must be a power of two, and limit indices to a u16. */
    + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)
    + return -EINVAL;
    +
    + /* mmap size must be what we expect for such a ring. */
    + size = vma->vm_end - vma->vm_start;
    + if (size != ALIGN(vring_size(num_descs, PAGE_SIZE), PAGE_SIZE))
    + return -EINVAL;
    +
    + /* We only let them map this in one place. */
    + mutex_lock(&vr->lock);
    + if (vr->ring.num != 0) {
    + err = -EBUSY;
    + goto unlock;
    + }
    +
    + vring_init(&vr->ring, num_descs, (void *)vma->vm_start, PAGE_SIZE);
    +
    + vr->mask = num_descs - 1;
    + err = 0;
    +
    +unlock:
    + mutex_unlock(&vr->lock);
    + return err;
    +}
    +
    +static int vring_open(struct inode *in, struct file *filp)
    +{
    + struct vring_info *vr;
    +
    + filp->private_data = vr = kzalloc(sizeof(*vr), GFP_KERNEL);
    + if (!vr)
    + return -ENOMEM;
    +
    + init_waitqueue_head(&vr->poll_wait);
    + mutex_init(&vr->lock);
    + return 0;
    +}
    +
    +static const struct file_operations vring_fops = {
    + .open = vring_open,
    + .release = vring_release,
    + .mmap = vring_mmap,
    + .read = vring_read,
    + .write = vring_write,
    + .poll = vring_poll,
    +};
    +
    +/**
    + * vring_get_buffer - get a buffer from the vring
    + * @vr: the vring
    + * @in_iov: the iovec array for input buffers
    + * @num_in: the size of the in_iov array, updated by this function.
    + * @in_len: the total length of in_iov after this function.
    + * @out_iov: the iovec array for output buffers
    + * @num_out: the size of the out_iov array, updated by this function.
    + * @out_len: the total length of out_iov after this function.
    + *
    + * A vring buffer is an array of input and output parts. This gets the next
    + * available buffer, and returns a non-zero id which is handed back to
    + * vring_used_buffer() once you're finished with the buffer. A zero return
    + * means no available buffers, negative for error.
    + */
    +int vring_get_buffer(struct vring_info *vr,
    + struct iovec *in_iov,
    + unsigned int *num_in, unsigned long *in_len,
    + struct iovec *out_iov,
    + unsigned int *num_out, unsigned long *out_len)
    +{
    + unsigned int i, in = 0, out = 0;
    + unsigned long dummy;
    + u16 avail, last_avail, head;
    + struct vring_desc d;
    +
    + if (unlikely(get_user(avail, &vr->ring.avail->idx)))
    + return -EFAULT;
    + if (unlikely(get_user(last_avail, &vring_last_avail(&vr->ring))))
    + return -EFAULT;
    +
    + if (last_avail == avail)
    + return 0;
    +
    + if (!in_len)
    + in_len = &dummy;
    + if (!out_len)
    + out_len = &dummy;
    +
    + *in_len = *out_len = 0;
    +
    + if (unlikely(get_user(head, &vr->ring.avail->ring[last_avail
    + & vr->mask])))
    + return -EFAULT;
    +
    + i = head;
    + do {
    + if (unlikely(i >= vr->ring.num)) {
    + pr_debug("vring: bad index: %u\n", i);
    + return -EINVAL;
    + }
    +
    + if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
    + return -EFAULT;
    +
    + if (d.flags & VRING_DESC_F_WRITE) {
    + /* Check for length and iovec overflows */
    + if (!num_in) {
    + pr_debug("vring: writable desc %u in ring %p\n",
    + i, vr->ring.desc);
    + return -EINVAL;
    + }
    + if (in == *num_in || *in_len + d.len < *in_len)
    + return -E2BIG;
    + in_iov[in].iov_len = d.len;
    + *in_len += d.len;
    + in_iov[in].iov_base = (void __user *)(long)d.addr;
    + in++;
    + } else {
    + if (!num_out) {
    + pr_debug("vring: readable desc %u in ring %p\n",
    + i, vr->ring.desc);
    + return -EINVAL;
    + }
    + if (out == *num_out || *out_len + d.len < *out_len)
    + return -E2BIG;
    + out_iov[out].iov_len = d.len;
    + *out_len += d.len;
    + out_iov[out].iov_base = (void __user *)(long)d.addr;
    + out++;
    + }
    +
    + i = d.next;
    + } while (d.flags & VRING_DESC_F_NEXT);
    +
    + if (num_in)
    + *num_in = in;
    + if (num_out)
    + *num_out = out;
    +
    + last_avail++;
    + put_user(last_avail, &vring_last_avail(&vr->ring));
    +
    + /* 0 is a valid head, so add one. */
    + return head + 1;
    +}
    +EXPORT_SYMBOL_GPL(vring_get_buffer);
    +
    +/**
    + * vring_used_buffer - return a used buffer to the vring
    + * @vr: the vring
    + * @id: the id returned from vring_get_buffer
    + * @len: the total bytes *written* to the buffer
    + */
    +void vring_used_buffer(struct vring_info *vr, int id, u32 len)
    +{
    + struct vring_used_elem used;
    + u16 used_idx;
    +
    + BUG_ON(id <= 0 || id > vr->ring.num);
    +
    + used.id = id - 1;
    + used.len = len;
    + if (get_user(used_idx, &vr->ring.used->idx) != 0)
    + return;
    +
    + if (copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
    + sizeof(used)))
    + return;
    +
    + wmb();
    + used_idx++;
    + put_user(used_idx, &vr->ring.used->idx);
    +}
    +EXPORT_SYMBOL_GPL(vring_used_buffer);
    +
    +void vring_wake(struct vring_info *vr)
    +{
    + wake_up(&vr->poll_wait);
    +}
    +EXPORT_SYMBOL_GPL(vring_wake);
    +
    +/**
    + * vring_get - check out a vring file descriptor
    + * @filp: the file structure to attach to (eg. from fget()).
    + *
    + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
    + * kernel subsystem it wants to communicate with. That subsystem uses
    + * this routine and vring_set_ops() to attach to it.
    + *
    + * This simply checks that it really is a vring fd (otherwise it
    + * returns NULL), the other routine checks that it's not already
    + * attached.
    + */
    +struct vring_info *vring_get(struct file *filp)
    +{
    + /* Must be one of ours. */
    + if (filp->f_op != &vring_fops)
    + return NULL;
    +
    + return filp->private_data;
    +}
    +EXPORT_SYMBOL_GPL(vring_get);
    +
    +/**
    + * vring_set_ops - attach operations to a vring file descriptor.
    + * @vr: the vring_info returned from vring_get.
    + * @ops: the operations to attach.
    + * @ops_data: the argument to the ops callbacks.
    + *
    + * This is called after vring_get(): the reason for the two-part
    + * process is that the ops can be called before vring_set_ops returns
    + * (we don't do locking), so you really need to set things up before
    + * this call.
    + *
    + * This simply checks that the ring is not already attached to something,
    + * then sets the ops.
    + */
    +int vring_set_ops(struct vring_info *vr,
    + const struct vring_ops *ops, void *ops_data)
    +{
    + int err;
    +
    + mutex_lock(&vr->lock);
    + if (vr->ops) {
    + err = -EBUSY;
    + goto unlock;
    + }
    +
    + /* We don't lock, so make sure we get this in the right order. */
    + vr->ops_data = ops_data;
    + wmb();
    + vr->ops = ops;
    +
    + err = 0;
    +unlock:
    + mutex_unlock(&vr->lock);
    + local_irq_enable();
    + return err;
    +}
    +EXPORT_SYMBOL_GPL(vring_set_ops);
    +
    +/**
    + * vring_unset_ops - remove operations from a vring file descriptor.
    + * @vr: the vring_info previously successfully vring_set_ops'd
    + */
    +void vring_unset_ops(struct vring_info *vr)
    +{
    + BUG_ON(!vr->ops);
    + mutex_lock(&vr->lock);
    + vr->ops = NULL;
    + mutex_unlock(&vr->lock);
    +}
    +EXPORT_SYMBOL_GPL(vring_unset_ops);
    +
    +static struct miscdevice vring_dev = {
    + .minor = MISC_DYNAMIC_MINOR,
    + .name = KBUILD_MODNAME,
    + .fops = &vring_fops,
    +};
    +
    +static int __init init(void)
    +{
    + return misc_register(&vring_dev);
    +}
    +
    +static void __exit fini(void)
    +{
    + misc_deregister(&vring_dev);
    +}
    +
    +module_init(init);
    +module_exit(fini);
    diff -r b2d9869d338f include/linux/vring.h
    --- /dev/null Thu Jan 01 00:00:00 1970 +0000
    +++ b/include/linux/vring.h Fri Apr 18 13:35:16 2008 +1000
    @@ -0,0 +1,58 @@
    +/* Ring-buffer file descriptor implementation.
    + *
    + * Copyright 2008 Rusty Russell IBM Corporation
    + *
    + * This program is free software; you can redistribute it and/or modify
    + * it under the terms of the GNU General Public License as published by
    + * the Free Software Foundation; either version 2 of the License, or
    + * (at your option) any later version.
    + *
    + * This program is distributed in the hope that it will be useful,
    + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    + * GNU General Public License for more details.
    + *
    + * You should have received a copy of the GNU General Public License
    + * along with this program; if not, write to the Free Software
    + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
    + */
    +#ifndef _LINUX_VRING_H
    +#define _LINUX_VRING_H
    +
    +/**
    + * vring_ops - operations for a vring fd.
    + * @needs_pull: more data is pending, need to call pull.
    + * @pull: callback when read() is called to report used buffers.
    + * @push: callback when write() is called to notify of added buffers.
    + *
    + * Any of these callbacks can be NULL, if you don't need them.
    + */
    +struct vring_ops {
    + bool (*needs_pull)(void *ops_data);
    +
    + /* Returns 0 or negative errno. */
    + int (*pull)(void *ops_data);
    +
    + /* Returns 0 or negative errno. */
    + int (*push)(void *ops_data);
    +};
    +
    +struct file;
    +
    +struct vring_info *vring_get(struct file *filp);
    +int vring_set_ops(struct vring_info *,
    + const struct vring_ops *ops, void *ops_data);
    +void vring_unset_ops(struct vring_info *vr);
    +struct iovec;
    +
    +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
    +int vring_get_buffer(struct vring_info *vr,
    + struct iovec *in_iov,
    + unsigned int *num_in, unsigned long *in_len,
    + struct iovec *out_iov,
    + unsigned int *num_out, unsigned long *out_len);
    +
    +void vring_used_buffer(struct vring_info *vr, int id, u32 len);
    +
    +void vring_wake(struct vring_info *vr);
    +#endif /* _LINUX_VRING_H */
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  7. Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    On Fri, 18 Apr 2008 14:39:48 +1000 Rusty Russell wrote:

    > virtio introduced a ring structure ABI for guest-host communications
    > (currently used by lguest and kvm). Using this same ABI, we can
    > create a nice fd version.
    >
    > This is useful for efficiently passing packets to and from the tun,
    > for example.
    >
    > ...
    >
    > +static int vring_mmap(struct file *filp, struct vm_area_struct *vma)
    > +{
    > + unsigned long size, num_descs;
    > + struct vring_info *vr = filp->private_data;
    > + int err;
    > +
    > + /* We overload mmap's offset to hold the ring number. */
    > + num_descs = vma->vm_pgoff;
    > +
    > + /* Must be a power of two, and limit indices to a u16. */
    > + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)


    We have an is_power_of_2().

    > + return -EINVAL;
    > +
    > + /* mmap size must be what we expect for such a ring. */
    > + size = vma->vm_end - vma->vm_start;
    > + if (size != ALIGN(vring_size(num_descs, PAGE_SIZE), PAGE_SIZE))
    > + return -EINVAL;
    > +
    > + /* We only let them map this in one place. */
    > + mutex_lock(&vr->lock);
    > + if (vr->ring.num != 0) {
    > + err = -EBUSY;
    > + goto unlock;
    > + }
    > +
    > + vring_init(&vr->ring, num_descs, (void *)vma->vm_start, PAGE_SIZE);
    > +
    > + vr->mask = num_descs - 1;
    > + err = 0;
    > +
    > +unlock:
    > + mutex_unlock(&vr->lock);
    > + return err;
    > +}
    >
    > ...
    >
    > +/**
    > + * vring_get - check out a vring file descriptor
    > + * @filp: the file structure to attach to (eg. from fget()).
    > + *
    > + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
    > + * kernel subsystem it wants to communicate with. That subsystem uses
    > + * this routine and vring_set_ops() to attach to it.
    > + *
    > + * This simply checks that it really is a vring fd (otherwise it
    > + * returns NULL), the other routine checks that it's not already
    > + * attached.
    > + */


    hm, I don't understand the big picture here yet.

    Isn't this kinda-sorta like what a relayfs file does? The oprofile
    buffers? etc? Nothing in common at all, no hope?

    > +struct vring_info *vring_get(struct file *filp)
    > +{
    > + /* Must be one of ours. */
    > + if (filp->f_op != &vring_fops)
    > + return NULL;
    > +
    > + return filp->private_data;
    > +}
    > +EXPORT_SYMBOL_GPL(vring_get);
    > +
    > +/**
    > + * vring_set_ops - attach operations to a vring file descriptor.
    > + * @vr: the vring_info returned from vring_get.
    > + * @ops: the operations to attach.
    > + * @ops_data: the argument to the ops callbacks.
    > + *
    > + * This is called after vring_get(): the reason for the two-part
    > + * process is that the ops can be called before vring_set_ops returns
    > + * (we don't do locking), so you really need to set things up before
    > + * this call.
    > + *
    > + * This simply checks that the ring is not already attached to something,
    > + * then sets the ops.
    > + */
    > +int vring_set_ops(struct vring_info *vr,
    > + const struct vring_ops *ops, void *ops_data)
    > +{
    > + int err;
    > +
    > + mutex_lock(&vr->lock);
    > + if (vr->ops) {
    > + err = -EBUSY;
    > + goto unlock;
    > + }
    > +
    > + /* We don't lock, so make sure we get this in the right order. */
    > + vr->ops_data = ops_data;
    > + wmb();
    > + vr->ops = ops;
    > +
    > + err = 0;
    > +unlock:
    > + mutex_unlock(&vr->lock);
    > + local_irq_enable();


    what's this doing here?

    > + return err;
    > +}
    > +EXPORT_SYMBOL_GPL(vring_set_ops);
    > +
    > +/**
    > + * vring_unset_ops - remove operations to a vring file descriptor.
    > + * @vr: the vring_info previously successfully vring_set_ops'd
    > + */
    > +void vring_unset_ops(struct vring_info *vr)
    > +{
    > + BUG_ON(!vr->ops);
    > + mutex_lock(&vr->lock);
    > + vr->ops = NULL;
    > + mutex_unlock(&vr->lock);
    > +}
    > +EXPORT_SYMBOL_GPL(vring_unset_ops);


    Isn't this just vring_set_ops(vr, NULL, NULL)?

    > +static struct miscdevice vring_dev = {
    > + .minor = MISC_DYNAMIC_MINOR,
    > + .name = KBUILD_MODNAME,
    > + .fops = &vring_fops,
    > +};
    > +
    > +static int __init init(void)
    > +{
    > + return misc_register(&vring_dev);
    > +}
    > +
    > +static void __exit fini(void)
    > +{
    > + misc_deregister(&vring_dev);
    > +}
    > +
    > +module_init(init);
    > +module_exit(fini);
    > diff -r b2d9869d338f include/linux/vring.h
    > --- /dev/null Thu Jan 01 00:00:00 1970 +0000
    > +++ b/include/linux/vring.h Fri Apr 18 13:35:16 2008 +1000
    > @@ -0,0 +1,58 @@
    > +/* Ring-buffer file descriptor implementation.
    > + *
    > + * Copyright 2008 Rusty Russell IBM Corporation
    > + *
    > + * This program is free software; you can redistribute it and/or modify
    > + * it under the terms of the GNU General Public License as published by
    > + * the Free Software Foundation; either version 2 of the License, or
    > + * (at your option) any later version.
    > + *
    > + * This program is distributed in the hope that it will be useful,
    > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    > + * GNU General Public License for more details.
    > + *
    > + * You should have received a copy of the GNU General Public License
    > + * along with this program; if not, write to the Free Software
    > + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
    > + */


    ponders #include

    > +#ifndef _LINUX_VRING_H
    > +#define _LINUX_VRING_H
    > +
    > +/**
    > + * vring_ops - operations for a vring fd.
    > + * @needs_pull: more data is pending, need to call pull.
    > + * @pull: callback when read() is called to report used buffers.
    > + * @push: callback when write() is called to notify of added buffers.
    > + *
    > + * Any of these callbacks can be NULL, if you don't need them.
    > + */
    > +struct vring_ops {
    > + bool (*needs_pull)(void *ops_data);
    > +
    > + /* Returns 0 or negative errno. */
    > + int (*pull)(void *ops_data);
    > +
    > + /* Returns 0 or negative errno. */
    > + int (*push)(void *ops_data);
    > +};
    > +
    > +struct file;
    > +
    > +struct vring_info *vring_get(struct file *filp);
    > +int vring_set_ops(struct vring_info *,
    > + const struct vring_ops *ops, void *ops_data);


    the first arg to vring_set_ops() lost its name.

    > +void vring_unset_ops(struct vring_info *vr);
    > +struct iovec;
    > +
    > +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
    > +int vring_get_buffer(struct vring_info *vr,
    > + struct iovec *in_iov,
    > + unsigned int *num_in, unsigned long *in_len,
    > + struct iovec *out_iov,
    > + unsigned int *num_out, unsigned long *out_len);
    > +
    > +void vring_used_buffer(struct vring_info *vr, int id, u32 len);
    > +
    > +void vring_wake(struct vring_info *vr);
    > +#endif /* _LINUX_VRING_H */



    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  8. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell wrote:

    > This patch modifies tun to allow a vringfd to specify the send
    > buffer. The user does a write to push out packets from the buffer.
    >
    > Again we use the 'struct virtio_net_hdr' to allow userspace to send
    > GSO packets. In this case, it can hint how much to copy, and the
    > other pages will be made into skb fragments.
    >
    >
    > ...
    >
    > +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
    > + * Users will learn not to do that. */
    > +static int get_user_skb_frags(const struct iovec *iv, size_t len,
    > + struct skb_frag_struct *f)
    > +{
    > + unsigned int i, j, num_pg = 0;
    > + int err;
    > + struct page *pages[MAX_SKB_FRAGS];
    > +
    > + down_read(&current->mm->mmap_sem);
    > + while (len) {
    > + int n, npages;
    > + unsigned long base, len;
    > + base = (unsigned long)iv->iov_base;
    > + len = (unsigned long)iv->iov_len;
    > +
    > + if (len == 0) {
    > + iv++;
    > + continue;
    > + }
    > +
    > + /* How many pages will this take? */
    > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;


    Brain hurts. I hope you got that right.

    > + if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
    > + err = -ENOSPC;
    > + goto fail;
    > + }
    > + n = get_user_pages(current, current->mm, base, npages,
    > + 0, 0, pages, NULL);


    What is the maximum numbet of pages which an unpriviliged user can
    concurrently pin with this code?

    > + if (unlikely(n < 0)) {
    > + err = n;
    > + goto fail;
    > + }
    > +
    > + /* Transfer pages to the frag array */
    > + for (j = 0; j < n; j++) {
    > + f[num_pg].page = pages[j];
    > + if (j == 0) {
    > + f[num_pg].page_offset = offset_in_page(base);
    > + f[num_pg].size = min(len, PAGE_SIZE -
    > + f[num_pg].page_offset);
    > + } else {
    > + f[num_pg].page_offset = 0;
    > + f[num_pg].size = min(len, PAGE_SIZE);
    > + }
    > + len -= f[num_pg].size;
    > + base += f[num_pg].size;
    > + num_pg++;
    > + }


    This loop is a fancy way of doing

    num_pg = n;

    > + if (unlikely(n != npages)) {
    > + err = -EFAULT;
    > + goto fail;
    > + }


    why not do this immediately after running get_user_pages()?

    > + }
    > + up_read(&current->mm->mmap_sem);
    > + return num_pg;
    > +
    > +fail:
    > + for (i = 0; i < num_pg; i++)
    > + put_page(f[i].page);


    release_pages() could be a tad more efficient, but it's only error-path.

    > + up_read(&current->mm->mmap_sem);
    > + return err;
    > +}
    > +
    >


    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  9. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell wrote:
    > This patch modifies tun to allow a vringfd to specify the send
    > buffer. The user does a write to push out packets from the buffer.
    >
    > Again we use the 'struct virtio_net_hdr' to allow userspace to send
    > GSO packets. In this case, it can hint how much to copy, and the
    > other pages will be made into skb fragments.
    >
    > Signed-off-by: Rusty Russell
    > ---
    > drivers/net/tun.c | 410 +++++++++++++++++++++++++++++++++++++++++--------
    > include/linux/if_tun.h | 1
    > 2 files changed, 351 insertions(+), 60 deletions(-)
    >
    > diff -r f797ec115d1b drivers/net/tun.c
    > --- a/drivers/net/tun.c Fri Apr 18 05:58:40 2008 +1000
    > +++ b/drivers/net/tun.c Fri Apr 18 06:07:21 2008 +1000
    > @@ -65,6 +65,8 @@
    > #include
    > #include
    > #include
    > +#include
    > +#include
    > #include
    >
    > #include
    > @@ -102,8 +104,8 @@ struct tun_struct {
    > u32 chr_filter[2];
    > u32 net_filter[2];
    >
    > - struct vring_info *inring;
    > - struct file *infile;
    > + struct vring_info *inring, *outring;
    > + struct file *infile, *outfile;
    >
    > #ifdef TUN_DEBUG
    > int debug;
    > @@ -258,6 +261,169 @@ static void tun_net_init(struct net_devi
    > dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */
    > break;
    > }
    > +}
    > +
    > +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
    > + * Users will learn not to do that. */
    > +static int get_user_skb_frags(const struct iovec *iv, size_t len,
    > + struct skb_frag_struct *f)
    > +{
    > + unsigned int i, j, num_pg = 0;
    > + int err;
    > + struct page *pages[MAX_SKB_FRAGS];
    > +
    > + down_read(&current->mm->mmap_sem);
    > + while (len) {
    > + int n, npages;
    > + unsigned long base, len;
    > + base = (unsigned long)iv->iov_base;
    > + len = (unsigned long)iv->iov_len;
    > +
    > + if (len == 0) {
    > + iv++;
    > + continue;
    > + }
    > +
    > + /* How many pages will this take? */
    > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;


    Hi Rusty,
    A trivial suggestion, how about
    npages = 1+(len -1)/PAGE_SIZE ?

    Thanks,
    --Pradeep
    > + if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
    > + err = -ENOSPC;
    > + goto fail;
    > + }
    > + n = get_user_pages(current, current->mm, base, npages,
    > + 0, 0, pages, NULL);
    > + if (unlikely(n < 0)) {
    > + err = n;
    > + goto fail;
    > + }
    > +
    > + /* Transfer pages to the frag array */
    > + for (j = 0; j < n; j++) {
    > + f[num_pg].page = pages[j];
    > + if (j == 0) {
    > + f[num_pg].page_offset = offset_in_page(base);
    > + f[num_pg].size = min(len, PAGE_SIZE -
    > + f[num_pg].page_offset);
    > + } else {
    > + f[num_pg].page_offset = 0;
    > + f[num_pg].size = min(len, PAGE_SIZE);
    > + }
    > + len -= f[num_pg].size;
    > + base += f[num_pg].size;
    > + num_pg++;
    > + }
    > +
    > + if (unlikely(n != npages)) {
    > + err = -EFAULT;
    > + goto fail;
    > + }
    > + }
    > + up_read(&current->mm->mmap_sem);
    > + return num_pg;
    > +
    > +fail:
    > + for (i = 0; i < num_pg; i++)
    > + put_page(f[i].page);
    > + up_read(&current->mm->mmap_sem);
    > + return err;
    > +}
    > +
    > +/* We actually store this at the head of the skb. */
    > +struct skb_tun_hdr {
    > + struct list_head list;
    > + struct tun_struct *tun;
    > + unsigned int id;
    > + unsigned int len;
    > +};
    > +
    > +/* Get packet from user space buffer. copylen is a hint as to how
    > + * much to copy (rest is pinned). */
    > +static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
    > + size_t copylen, size_t len)
    > +{
    > + struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
    > + struct sk_buff *skb;
    > + size_t align = 0, extra = 0;
    > + int err;
    > +
    > + if (!(tun->flags & TUN_NO_PI)) {
    > + if (len < sizeof(pi)) {
    > + err = -EINVAL;
    > + goto fail;
    > + }
    > + len -= sizeof(pi);
    > +
    > + if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
    > + err = -EFAULT;
    > + goto fail;
    > + }
    > + if (copylen > len)
    > + copylen = len;
    > + }
    > +
    > + if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
    > + align = NET_IP_ALIGN;
    > + if (unlikely(copylen < ETH_HLEN)) {
    > + if (len < ETH_HLEN) {
    > + err = -EINVAL;
    > + goto fail;
    > + }
    > + copylen = ETH_HLEN;
    > + }
    > + }
    > +
    > + /* Allocate extra header if we need */
    > + if (copylen != len)
    > + extra = sizeof(struct skb_tun_hdr);
    > +
    > + skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
    > + if (!skb) {
    > + err = -ENOMEM;
    > + goto fail;
    > + }
    > +
    > + if (extra + align)
    > + skb_reserve(skb, extra + align);
    > +
    > + if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
    > + err = -EFAULT;
    > + goto free_skb;
    > + }
    > +
    > + switch (tun->flags & TUN_TYPE_MASK) {
    > + case TUN_TUN_DEV:
    > + skb_reset_mac_header(skb);
    > + skb->protocol = pi.proto;
    > + skb->dev = tun->dev;
    > + break;
    > + case TUN_TAP_DEV:
    > + skb->protocol = eth_type_trans(skb, tun->dev);
    > + break;
    > + };
    > +
    > + if (tun->flags & TUN_NOCHECKSUM)
    > + skb->ip_summed = CHECKSUM_UNNECESSARY;
    > +
    > + /* Anything left gets put into frags. */
    > + if (extra) {
    > + struct skb_shared_info *sinfo = skb_shinfo(skb);
    > + int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
    > + if (err < 0)
    > + goto free_skb;
    > + sinfo->nr_frags = err;
    > + }
    > + tun->dev->last_rx = jiffies;
    > +
    > + tun->dev->stats.rx_packets++;
    > + tun->dev->stats.rx_bytes += len;
    > +
    > + return skb;
    > +
    > +free_skb:
    > + kfree_skb(skb);
    > +fail:
    > + tun->dev->stats.rx_dropped++;
    > + return ERR_PTR(err);
    > }
    >
    > #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
    > @@ -355,6 +521,132 @@ static struct vring_ops recvops = {
    > .pull = pull_recv_skbs,
    > };
    >
    > +static DEFINE_SPINLOCK(finished_lock);
    > +static LIST_HEAD(shinfo_finished_list);
    > +static struct task_struct *shinfo_finisher;
    > +
    > +static void used_buffer(struct skb_tun_hdr *tunh)
    > +{
    > + /* Woot, something happened. */
    > + vring_wake(tunh->tun->outring);
    > +
    > + /* Release device. Keeping this reference blocks file close. */
    > + dev_put(tunh->tun->dev);
    > +
    > + /* tunh == skb->head. */
    > + kfree(tunh);
    > +}
    > +
    > +static int do_shinfo_finisher(void *unused)
    > +{
    > + LIST_HEAD(list);
    > + struct skb_tun_hdr *i;
    > +
    > + while (!kthread_should_stop()) {
    > + set_current_state(TASK_INTERRUPTIBLE);
    > +
    > + spin_lock_irq(&finished_lock);
    > + list_splice_init(&list, &shinfo_finished_list);
    > + spin_unlock_irq(&finished_lock);
    > +
    > + if (list_empty(&list)) {
    > + schedule();
    > + continue;
    > + }
    > +
    > + list_for_each_entry(i, &list, list) {
    > + vring_used_buffer(i->tun->outring, i->id, i->len);
    > + used_buffer(i);
    > + }
    > + }
    > + return 0;
    > +}
    > +
    > +/* We are done with this skb data: put it in the used pile. */
    > +static void shinfo_finished(struct skb_shared_info *sinfo)
    > +{
    > + struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
    > + unsigned long flags;
    > +
    > + spin_lock_irqsave(&finished_lock, flags);
    > + list_add(&tunh->list, &shinfo_finished_list);
    > + spin_unlock_irqrestore(&finished_lock, flags);
    > +
    > + wake_up_process(shinfo_finisher);
    > +}
    > +
    > +static int xmit_packets(void *_tun)
    > +{
    > + struct tun_struct *tun = _tun;
    > + struct iovec iov[1+MAX_SKB_FRAGS];
    > + unsigned int iovnum = ARRAY_SIZE(iov);
    > + int id, err, wake = 0;
    > + unsigned long len;
    > +
    > + while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
    > + iov, &iovnum, &len)) > 0) {
    > + struct virtio_net_hdr h;
    > + struct sk_buff *skb;
    > + struct skb_shared_info *shinfo;
    > +
    > + if (unlikely(len < sizeof(h)))
    > + return -EINVAL;
    > +
    > + err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
    > + if (unlikely(err))
    > + return -EFAULT;
    > +
    > + len -= sizeof(h);
    > + if (h.hdr_len > len)
    > + return -EINVAL;
    > +
    > + /* Without GSO, we copy entire packet. */
    > + if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
    > + h.hdr_len = len;
    > +
    > + skb = get_user_skb(tun, iov, h.hdr_len, len);
    > + if (IS_ERR(skb))
    > + return PTR_ERR(skb);
    > +
    > + if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
    > + !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
    > + kfree_skb(skb);
    > + return -EINVAL;
    > + }
    > +
    > + /* If it has fragments, set up destructor for later. */
    > + shinfo = skb_shinfo(skb);
    > + if (skb_shinfo(skb)->nr_frags) {
    > + struct skb_tun_hdr *tunh = (void *)skb->head;
    > + shinfo->destructor = shinfo_finished;
    > + tunh->id = id;
    > + tunh->len = sizeof(h) + skb->len;
    > + } else {
    > + vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
    > + wake = 1;
    > + }
    > + netif_rx_ni(skb);
    > + }
    > +
    > + if (wake)
    > + vring_wake(tun->outring);
    > +
    > + /* 0 or error. */
    > + return id;
    > +}
    > +
    > +static struct vring_ops xmitops = {
    > + .push = xmit_packets,
    > +};
    > +
    > +static int init_vring(void)
    > +{
    > + shinfo_finisher = kthread_run(do_shinfo_finisher, NULL, "tun");
    > + if (IS_ERR(shinfo_finisher))
    > + return PTR_ERR(shinfo_finisher);
    > + return 0;
    > +}
    > +
    > static int set_recv_vring(struct tun_struct *tun, int fd)
    > {
    > int err;
    > @@ -391,9 +685,47 @@ static void unset_vrings(struct tun_stru
    > vring_unset_ops(tun->inring);
    > fput(tun->infile);
    > }
    > + if (tun->outring) {
    > + vring_unset_ops(tun->outring);
    > + fput(tun->outfile);
    > + }
    > +}
    > +
    > +static int set_xmit_vring(struct tun_struct *tun, int fd)
    > +{
    > + int err;
    > +
    > + if (tun->outring)
    > + return -EBUSY;
    > +
    > + tun->outfile = fget(fd);
    > + if (!tun->outfile)
    > + return -EBADF;
    > +
    > + tun->outring = vring_get(tun->outfile);
    > + if (!tun->outring) {
    > + err = -EBADF;
    > + goto put;
    > + }
    > +
    > + err = vring_set_ops(tun->outring, &xmitops, tun);
    > + if (err) {
    > + tun->outring = NULL;
    > + goto put;
    > + }
    > + return 0;
    > +
    > +put:
    > + fput(tun->outfile);
    > + tun->outfile = NULL;
    > + return err;
    > }
    > #else /* ... !CONFIG_VRING */
    > static int set_recv_vring(struct tun_struct *tun, int fd)
    > +{
    > + return -ENOTTY;
    > +}
    > +static int set_xmit_vring(struct tun_struct *tun, int fd)
    > {
    > return -ENOTTY;
    > }
    > @@ -424,74 +756,26 @@ static unsigned int tun_chr_poll(struct
    > return mask;
    > }
    >
    > -/* Get packet from user space buffer */
    > -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
    > -{
    > - struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
    > - struct sk_buff *skb;
    > - size_t len = count, align = 0;
    > -
    > - if (!(tun->flags & TUN_NO_PI)) {
    > - if ((len -= sizeof(pi)) > count)
    > - return -EINVAL;
    > -
    > - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
    > - return -EFAULT;
    > - }
    > -
    > - if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
    > - align = NET_IP_ALIGN;
    > - if (unlikely(len < ETH_HLEN))
    > - return -EINVAL;
    > - }
    > -
    > - if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
    > - tun->dev->stats.rx_dropped++;
    > - return -ENOMEM;
    > - }
    > -
    > - if (align)
    > - skb_reserve(skb, align);
    > - if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
    > - tun->dev->stats.rx_dropped++;
    > - kfree_skb(skb);
    > - return -EFAULT;
    > - }
    > -
    > - switch (tun->flags & TUN_TYPE_MASK) {
    > - case TUN_TUN_DEV:
    > - skb_reset_mac_header(skb);
    > - skb->protocol = pi.proto;
    > - skb->dev = tun->dev;
    > - break;
    > - case TUN_TAP_DEV:
    > - skb->protocol = eth_type_trans(skb, tun->dev);
    > - break;
    > - };
    > -
    > - if (tun->flags & TUN_NOCHECKSUM)
    > - skb->ip_summed = CHECKSUM_UNNECESSARY;
    > -
    > - netif_rx_ni(skb);
    > - tun->dev->last_rx = jiffies;
    > -
    > - tun->dev->stats.rx_packets++;
    > - tun->dev->stats.rx_bytes += len;
    > -
    > - return count;
    > -}
    > -
    > static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
    > unsigned long count, loff_t pos)
    > {
    > struct tun_struct *tun = iocb->ki_filp->private_data;
    > + size_t len;
    > + struct sk_buff *skb;
    >
    > if (!tun)
    > return -EBADFD;
    >
    > DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
    >
    > - return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
    > + len = iov_length(iv, count);
    > +
    > + skb = get_user_skb(tun, (struct iovec *)iv, len, len);
    > + if (IS_ERR(skb))
    > + return PTR_ERR(skb);
    > +
    > + netif_rx_ni(skb);
    > + return len;
    > }
    >
    > /* Put packet to the user space buffer */
    > @@ -831,6 +1115,9 @@ static int tun_chr_ioctl(struct inode *i
    > case TUNSETRECVVRING:
    > return set_recv_vring(tun, arg);
    >
    > + case TUNSETXMITVRING:
    > + return set_xmit_vring(tun, arg);
    > +
    > case SIOCGIFFLAGS:
    > ifr.ifr_flags = tun->if_flags;
    > if (copy_to_user( argp, &ifr, sizeof ifr))
    > @@ -1078,6 +1365,12 @@ static int __init tun_init(void)
    > ret = misc_register(&tun_miscdev);
    > if (ret)
    > printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
    > + else {
    > + ret = init_vring();
    > + if (ret)
    > + misc_deregister(&tun_miscdev);
    > + }
    > +
    > return ret;
    > }
    >
    > diff -r f797ec115d1b include/linux/if_tun.h
    > --- a/include/linux/if_tun.h Fri Apr 18 05:58:40 2008 +1000
    > +++ b/include/linux/if_tun.h Fri Apr 18 06:07:21 2008 +1000
    > @@ -43,6 +43,7 @@
    > #define TUNSETLINK _IOW('T', 205, int)
    > #define TUNSETGROUP _IOW('T', 206, int)
    > #define TUNSETRECVVRING _IOW('T', 207, int)
    > +#define TUNSETXMITVRING _IOW('T', 208, int)
    >
    > /* TUNSETIFF ifr flags */
    > #define IFF_TUN 0x0001
    > _______________________________________________
    > Virtualization mailing list
    > Virtualization@lists.linux-foundation.org
    > https://lists.linux-foundation.org/m...virtualization
    >




    --
    Pradeep Singh Rautela
    http://eagain.wordpress.com
    http://emptydomain.googlepages.com
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  10. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Fri, Apr 18, 2008 at 4:46 AM, pradeep singh rautela
    wrote:
    >
    > On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell wrote:
    > > + /* How many pages will this take? */
    > > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

    >
    > Hi Rusty,
    > A trivial suggestion, how about
    > npages = 1+(len -1)/PAGE_SIZE ?


    That's not the same. In particular, his version accounts for the
    fractional page at the beginning, while yours doesn't.

    While it's tempting to use algebra to simplify things, it's not safe
    to do so when the expression involves division over the integers. The
    only care-free integer math in a computer is subtraction and ++.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  11. Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    On Friday 18 April 2008 21:18:46 Andrew Morton wrote:
    > > + /* Must be a power of two, and limit indices to a u16. */
    > > + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)

    >
    > We have an is_power_of_2().


    Thanks, fixed.

    > > + * vring_get - check out a vring file descriptor
    > > + * @filp: the file structure to attach to (eg. from fget()).
    > > + *
    > > + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
    > > + * kernel subsystem it wants to communicate with. That subsystem uses
    > > + * this routine and vring_set_ops() to attach to it.
    > > + *
    > > + * This simply checks that it really is a vring fd (otherwise it
    > > + * returns NULL), the other routine checks that it's not already
    > > + * attached.
    > > + */

    >
    > hm, I don't understand the big picture here yet.
    >
    > Isn't this kinda-sorta like what a relayfs file does? The oprofile
    > buffers? etc? Nothing in common at all, no hope?


    An excellent question, but I thought the modern kernel etiquette was to only
    comment on whitespace and formatting, and call it "review"?

    Yes, kinda-sorta in that it's a ring buffer. No, in that it's bidir and
    consumption can be out-of-order (kind of important for I/O buffers).

    But the reason I'm not proposing it as a syscall is that I'm not convinced
    it's the One True Solution which everyone should be using. Time will tell:
    it's clearly not tied to tun and it's been generically useful for virtual
    I/O, but history has not been kind to new userspace interfaces.

    > > + mutex_unlock(&vr->lock);
    > > + local_irq_enable();

    >
    > what's this doing here?


    Snot from previous version. Removed.

    > > +void vring_unset_ops(struct vring_info *vr)
    > > +{
    > > + BUG_ON(!vr->ops);
    > > + mutex_lock(&vr->lock);
    > > + vr->ops = NULL;
    > > + mutex_unlock(&vr->lock);
    > > +}
    > > +EXPORT_SYMBOL_GPL(vring_unset_ops);

    >
    > Isn't this just vring_set_ops(vr, NULL, NULL)?


    Yes, except I like the clarity and the BUG_ON.

    > ponders #include


    "#include " for me, just to add more inclement weather to
    that teacup...

    Thanks,
    Rusty.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  12. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Friday 18 April 2008 21:31:20 Andrew Morton wrote:
    > On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell wrote:
    > > + /* How many pages will this take? */
    > > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

    >
    > Brain hurts. I hope you got that right.


    I tested it when I wrote it, but just wrote a tester again:

    base len npages
    0 1 1
    0xfff 1 1
    0x1000 1 1
    0 4096 1
    0x1 4096 2
    0xfff 4096 2
    0x1000 4096 1
    0xfffff000 4096 1
    0xfffff000 4097 4293918722

    > > + if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
    > > + err = -ENOSPC;
    > > + goto fail;
    > > + }
    > > + n = get_user_pages(current, current->mm, base, npages,
    > > + 0, 0, pages, NULL);

    >
    > What is the maximum number of pages which an unprivileged user can
    > concurrently pin with this code?


    Since only root can open the tun device, it's currently OK. The old code
    kmalloced and copied: is there some mm-fu reason why pinning userspace memory
    is worse?

    But I actually think it's OK even for non-root, since these become skbs, which
    means they either go into an outgoing device queue or a socket queue which is
    accounted for exactly for this reason.

    > > + if (unlikely(n < 0)) {
    > > + err = n;
    > > + goto fail;
    > > + }
    > > +
    > > + /* Transfer pages to the frag array */
    > > + for (j = 0; j < n; j++) {
    > > + f[num_pg].page = pages[j];
    > > + if (j == 0) {
    > > + f[num_pg].page_offset = offset_in_page(base);
    > > + f[num_pg].size = min(len, PAGE_SIZE -
    > > + f[num_pg].page_offset);
    > > + } else {
    > > + f[num_pg].page_offset = 0;
    > > + f[num_pg].size = min(len, PAGE_SIZE);
    > > + }
    > > + len -= f[num_pg].size;
    > > + base += f[num_pg].size;
    > > + num_pg++;
    > > + }

    >
    > This loop is a fancy way of doing
    >
    > num_pg = n;


    Damn, you had me reworking this until I realized why. It's not: we're
    inside a loop, doing one iovec array element at a time.

    > > + if (unlikely(n != npages)) {
    > > + err = -EFAULT;
    > > + goto fail;
    > > + }

    >
    > why not do this immediately after running get_user_pages()?


    To simplify the failure path. Hmm, I would use release_pages here...

    > > +fail:
    > > + for (i = 0; i < num_pg; i++)
    > > + put_page(f[i].page);

    >
    > release_pages() could be a tad more efficient, but it's only error-path.


    .... but I didn't know that existed. Had to include pagemap.h, and it's not
    exported. It seems to be a useful interface; see patch.

    Cheers,
    Rusty.

    Subject: Export release_pages; nice undo for get_user_pages.

    Andrew Morton suggests tun/tap use release_pages, but it's not
    exported. It's not clear to me why this is in swap.c, but it exists
    even without CONFIG_SWAP, so that's OK.

    Signed-off-by: Rusty Russell

    diff -r abd2ad431e5c mm/swap.c
    --- a/mm/swap.c Sat Apr 19 00:34:54 2008 +1000
    +++ b/mm/swap.c Sat Apr 19 01:11:40 2008 +1000
    @@ -346,6 +346,7 @@ void release_pages(struct page **pages,

    pagevec_free(&pages_to_free);
    }
    +EXPORT_SYMBOL(release_pages);

    /*
    * The pages which we're about to release may be in the deferred lru-addition
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  13. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Fri, Apr 18, 2008 at 8:15 AM, Rusty Russell wrote:
    > On Friday 18 April 2008 21:31:20 Andrew Morton wrote:
    > > On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell wrote:

    >
    > > > + /* How many pages will this take? */
    > > > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

    > >
    > > Brain hurts. I hope you got that right.

    >
    > I tested it when I wrote it, but just wrote a tester again:
    >
    > base len npages
    > 0 1 1
    > 0xfff 1 1
    > 0x1000 1 1
    > 0 4096 1
    > 0x1 4096 2
    > 0xfff 4096 2
    > 0x1000 4096 1
    > 0xfffff000 4096 1
    > 0xfffff000 4097 4293918722


    Picky, picky :-).

    If base were always page aligned, we'd want

    npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;

    which is simply rounding len up to the next largest page size. So when
    base is not page aligned, increase len by the left-over space at the
    beginning, and then do the same calculation as above. (ie, pretend
    base is page aligned, and instead count the excess at the beginning as
    part of len.)

    npages = ( (base & PAGE_MASK) + len + PAGE_SIZE - 1) / PAGE_SIZE;

    As long as len < PAGE_MASK - PAGE_SIZE, we're safe from overflows.

    (The above gives a different answer when len=0, but you guard for that.)
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  14. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Fri, Apr 18, 2008 at 7:55 PM, Ray Lee wrote:
    > On Fri, Apr 18, 2008 at 4:46 AM, pradeep singh rautela
    > wrote:
    > >
    > > On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell wrote:

    >
    > > > + /* How many pages will this take? */
    > > > + npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

    > >
    > > Hi Rusty,
    > > A trivial suggestion, how about
    > > npages = 1+(len -1)/PAGE_SIZE ?

    >
    > That's not the same. In particular, his version accounts for the
    > fractional page at the beginning, while yours doesn't.


    Oh thanks for correcting me Ray.
    >
    > While it's tempting to use algebra to simplify things, it's not safe
    > to do so when the expression involves division over the integers. The
    > only care-free integer math in a computer is subtraction and ++.


    I stand corrected.

    Sorry for noise.

    Thanks,
    --
    Pradeep Singh Rautela
    http://eagain.wordpress.com
    http://emptydomain.googlepages.com
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  15. Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell wrote:

    > > Isn't this kinda-sorta like what a relayfs file does? The oprofile
    > > buffers? etc? Nothing in common at all, no hope?

    >
    > An excellent question, but I thought the modern kernel etiquette was to only
    > comment on whitespace and formatting, and call it "review"?
    >
    > Yes, kinda-sorta in that it's a ring buffer. No, in that it's bidir and
    > consumption can be out-of-order (kind of important for I/O buffers).
    >
    > But the reason I'm not proposing it as a syscall is that I'm not convinced
    > it's the One True Solution which everyone should be using. Time will tell:
    > it's clearly not tied to tun and it's been generically useful for virtual
    > I/O, but history has not been kind to new userspace interfaces.


    This may be our third high-bandwidth user/kernel interface to transport
    bulk data ("hbukittbd") which was implemented because its predecessors
    weren't quite right. In a year or two's time someone else will need a
    hbukittbd and will find that the existing three aren't quite right and will
    give us another one. One day we need to stop doing this.

    It could be that this person will look at Rusty's hbukittbd and find that
    it _could_ be tweaked to do what he wants, but it's already shipping and
    it's part of the kernel API and hence can't be made to do what he wants.

    So I think it would be good to plonk the proposed interface on the table
    and have a poke at it. Is it compat-safe? Is it extensible in a
    backward-compatible fashion? Are there future-safe changes we should make
    to it? Can Michael Kerrisk understand, review and document it? etc.

    You know what I'm saying. What is the proposed interface?
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  16. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell wrote:

    > >
    > > What is the maximum number of pages which an unprivileged user can
    > > concurrently pin with this code?

    >
    > Since only root can open the tun device, it's currently OK. The old code
    > kmalloced and copied: is there some mm-fu reason why pinning userspace memory
    > is worse?


    We generally try to avoid it - it allows users to dos the box. Although I
    suspect that direct-io presently permits users to transiently pin an amount
    of memory which is proportional to the number of disks upon which they can
    open files.

    > Subject: Export release_pages; nice undo for get_user_pages.
    >
    > Andrew Morton suggests tun/tap use release_pages, but it's not
    > exported. It's not clear to me why this is in swap.c, but it exists
    > even without CONFIG_SWAP, so that's OK.
    >
    > Signed-off-by: Rusty Russell
    >
    > diff -r abd2ad431e5c mm/swap.c
    > --- a/mm/swap.c Sat Apr 19 00:34:54 2008 +1000
    > +++ b/mm/swap.c Sat Apr 19 01:11:40 2008 +1000
    > @@ -346,6 +346,7 @@ void release_pages(struct page **pages,
    >
    > pagevec_free(&pages_to_free);
    > }
    > +EXPORT_SYMBOL(release_pages);
    >
    > /*
    > * The pages which we're about to release may be in the deferred lru-addition


    acked-by: me.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  17. Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    On 4/18/08, Andrew Morton wrote:
    > On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell wrote:
    >
    > > > Isn't this kinda-sorta like what a relayfs file does? The oprofile
    > > > buffers? etc? Nothing in common at all, no hope?

    > >
    > > An excellent question, but I thought the modern kernel etiquette was to only
    > > comment on whitespace and formatting, and call it "review"?
    > >
    > > Yes, kinda-sorta in that it's a ring buffer. No, in that it's bidir and
    > > consumption can be out-of-order (kind of important for I/O buffers).
    > >
    > > But the reason I'm not proposing it as a syscall is that I'm not convinced
    > > it's the One True Solution which everyone should be using. Time will tell:
    > > it's clearly not tied to tun and it's been generically useful for virtual
    > > I/O, but history has not been kind to new userspace interfaces.

    >
    >
    > This is may be our third high-bandwidth user/kernel interface to transport
    > bulk data ("hbukittbd") which was implemented because its predecessors
    > weren't quite right. In a year or two's time someone else will need a
    > hbukittbd and will find that the existing three aren't quite right and will
    > give us another one. One day we need to stop doing this
    >
    > It could be that this person will look at Rusty's hbukittbd and find that
    > it _could_ be tweaked to do what he wants, but it's already shipping and
    > it's part of the kernel API and hence can't be made to do what he wants.
    >
    > So I think it would be good to plonk the proposed interface on the table
    > and have a poke at it. Is it compat-safe? Is it extensible in a
    > backward-compatible fashion? Are there future-safe changes we should make
    > to it? Can Michael Kerrisk understand, review and document it? etc.


    Well, it helps if he's CCed....

    I'm happy to work *with someone* on the documentation (pointless to do
    it on my own -- how do I know what Rusty's *intended* behavior for the
    interface is), and review, and testing.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  18. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell wrote:

    > It's not clear to me why this is in swap.c, but it exists
    > even without CONFIG_SWAP, so that's OK.


    We should have done mv mm/swap.c mm/mmlib.c years ago.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  19. Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

    Hi.

    On Fri, Apr 18, 2008 at 02:39:48PM +1000, Rusty Russell (rusty@rustcorp.com.au) wrote:

    > +int vring_get_buffer(struct vring_info *vr,
    > + struct iovec *in_iov,
    > + unsigned int *num_in, unsigned long *in_len,
    > + struct iovec *out_iov,
    > + unsigned int *num_out, unsigned long *out_len)
    > +{
    > + unsigned int i, in = 0, out = 0;
    > + unsigned long dummy;
    > + u16 avail, last_avail, head;
    > + struct vring_desc d;


    Should this whole function and vring_used_buffer() be protected with
    vr->lock mutex?

    > + if (unlikely(get_user(avail, &vr->ring.avail->idx)))
    > + return -EFAULT;
    > + if (unlikely(get_user(last_avail, &vring_last_avail(&vr->ring))))
    > + return -EFAULT;
    > +
    > + if (last_avail == avail)
    > + return 0;
    > +
    > + if (!in_len)
    > + in_len = &dummy;
    > + if (!out_len)
    > + out_len = &dummy;
    > +
    > + *in_len = *out_len = 0;
    > +
    > + if (unlikely(get_user(head, &vr->ring.avail->ring[last_avail
    > + & vr->mask])))
    > + return -EFAULT;
    > +
    > + i = head;
    > + do {
    > + if (unlikely(i >= vr->ring.num)) {
    > + pr_debug("vring: bad index: %u\n", i);
    > + return -EINVAL;
    > + }
    > +
    > + if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
    > + return -EFAULT;
    > +
    > + if (d.flags & VRING_DESC_F_WRITE) {
    > + /* Check for length and iovec overflows */
    > + if (!num_in) {
    > + pr_debug("vring: writable desc %u in ring %p\n",
    > + i, vr->ring.desc);
    > + return -EINVAL;
    > + }
    > + if (in == *num_in || *in_len + d.len < *in_len)
    > + return -E2BIG;
    > + in_iov[in].iov_len = d.len;
    > + *in_len += d.len;
    > + in_iov[in].iov_base = (void __user *)(long)d.addr;
    > + in++;
    > + } else {
    > + if (!num_out) {
    > + pr_debug("vring: readable desc %u in ring %p\n",
    > + i, vr->ring.desc);
    > + return -EINVAL;
    > + }
    > + if (out == *num_out || *out_len + d.len < *out_len)
    > + return -E2BIG;
    > + out_iov[out].iov_len = d.len;
    > + *out_len += d.len;
    > + out_iov[out].iov_base = (void __user *)(long)d.addr;
    > + out++;
    > + }
    > +
    > + i = d.next;
    > + } while (d.flags & VRING_DESC_F_NEXT);
    > +
    > + if (num_in)
    > + *num_in = in;
    > + if (num_out)
    > + *num_out = out;
    > +
    > + last_avail++;
    > + put_user(last_avail, &vring_last_avail(&vr->ring));
    > +
    > + /* 0 is a valid head, so add one. */
    > + return head + 1;
    > +}
    > +EXPORT_SYMBOL_GPL(vring_get_buffer);
    > +
    > +/**
    > + * vring_used_buffer - return a used buffer to the vring
    > + * @vr: the vring
    > + * @id: the id returned from vring_get_buffer
    > + * @len: the total bytes *written* to the buffer
    > + */
    > +void vring_used_buffer(struct vring_info *vr, int id, u32 len)
    > +{
    > + struct vring_used_elem used;
    > + u16 used_idx;
    > +
    > + BUG_ON(id <= 0 || id > vr->ring.num);
    > +
    > + used.id = id - 1;
    > + used.len = len;
    > + if (get_user(used_idx, &vr->ring.used->idx) != 0)
    > + return;
    > +
    > + if (copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
    > + sizeof(used)))
    > + return;
    > +
    > + wmb();
    > + used_idx++;
    > + put_user(used_idx, &vr->ring.used->idx);
    > +}
    > +EXPORT_SYMBOL_GPL(vring_used_buffer);


    --
    Evgeniy Polyakov
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

  20. Re: [PATCH 5/5] tun: vringfd xmit support.

    On Saturday 19 April 2008 05:06:34 Andrew Morton wrote:
    > On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell

    wrote:
    > > > What is the maximum number of pages which an unprivileged user can
    > > > concurrently pin with this code?

    > >
    > > Since only root can open the tun device, it's currently OK. The old code
    > > kmalloced and copied: is there some mm-fu reason why pinning userspace
    > > memory is worse?

    >
    > We generally try to avoid it - it allows users to dos the box.


    My question is: is pinning a page worse than allocating a (kernel) page in
    some way?

    Cheers,
    Rusty.
    --
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/

+ Reply to Thread
Page 1 of 2 1 2 LastLast