[PATCH 00/32] Swap over NFS - v19 - Kernel

Thread: [PATCH 00/32] Swap over NFS - v19

  1. [PATCH 00/32] Swap over NFS - v19

    Patches are against: v2.6.27-rc5-mm1

    This release features more comments and (hopefully) better Changelogs.
    Also the netns stuff got sorted and ipv6 will now build and not oops
    on boot ;-)

    The first 4 patches are cleanups and can go in if the respective maintainers
    agree.

    The code is lightly tested but seems to work on my default config.

    Let's get this ball rolling...


  2. [PATCH 18/32] net: sk_allocation() - concentrate socket related allocations

    Introduce sk_allocation(); this function allows socket-specific flags to
    be injected into each socket-related allocation.
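
    The helper is introduced as a no-op on purpose, so the call sites can be
    converted first and a later patch only has to change the mask in one
    place. A hypothetical follow-up could look like the sketch below (the
    __GFP_MEMALLOC-style flag is an assumption here, not taken from this
    posting; sk_has_memalloc() is the helper referenced by patch 22 in this
    thread):

        static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
        {
                /* sketch: let SOCK_MEMALLOC sockets dip into the reserves */
                if (sk_has_memalloc(sk))
                        gfp_mask |= __GFP_MEMALLOC;
                return gfp_mask;
        }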

    Signed-off-by: Peter Zijlstra
    ---
    include/net/sock.h | 5 +++++
    net/ipv4/tcp.c | 3 ++-
    net/ipv4/tcp_output.c | 12 +++++++-----
    net/ipv6/tcp_ipv6.c | 17 ++++++++++++-----
    4 files changed, 26 insertions(+), 11 deletions(-)

    Index: linux-2.6/net/ipv4/tcp_output.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/tcp_output.c
    +++ linux-2.6/net/ipv4/tcp_output.c
    @@ -2148,7 +2148,8 @@ void tcp_send_fin(struct sock *sk)
    } else {
    /* Socket is locked, keep trying until memory is available. */
    for (;;) {
    - skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
    + skb = alloc_skb_fclone(MAX_TCP_HEADER,
    + sk_allocation(sk, GFP_KERNEL));
    if (skb)
    break;
    yield();
    @@ -2174,7 +2175,7 @@ void tcp_send_active_reset(struct sock *
    struct sk_buff *skb;

    /* NOTE: No TCP options attached and we never retransmit this. */
    - skb = alloc_skb(MAX_TCP_HEADER, priority);
    + skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, priority));
    if (!skb) {
    NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
    return;
    @@ -2242,7 +2243,8 @@ struct sk_buff *tcp_make_synack(struct s
    struct tcp_md5sig_key *md5;
    __u8 *md5_hash_location;

    - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
    + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1,
    + sk_allocation(sk, GFP_ATOMIC));
    if (skb == NULL)
    return NULL;

    @@ -2482,7 +2484,7 @@ void tcp_send_ack(struct sock *sk)
    * tcp_transmit_skb() will set the ownership to this
    * sock.
    */
    - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
    + buff = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
    if (buff == NULL) {
    inet_csk_schedule_ack(sk);
    inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
    @@ -2517,7 +2519,7 @@ static int tcp_xmit_probe_skb(struct soc
    struct sk_buff *skb;

    /* We don't queue it, tcp_transmit_skb() sets ownership. */
    - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
    + skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
    if (skb == NULL)
    return -1;

    Index: linux-2.6/include/net/sock.h
    ===================================================================
    --- linux-2.6.orig/include/net/sock.h
    +++ linux-2.6/include/net/sock.h
    @@ -435,6 +435,11 @@ static inline int sock_flag(struct sock
    return test_bit(flag, &sk->sk_flags);
    }

    +static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
    +{
    + return gfp_mask;
    +}
    +
    static inline void sk_acceptq_removed(struct sock *sk)
    {
    sk->sk_ack_backlog--;
    Index: linux-2.6/net/ipv6/tcp_ipv6.c
    ===================================================================
    --- linux-2.6.orig/net/ipv6/tcp_ipv6.c
    +++ linux-2.6/net/ipv6/tcp_ipv6.c
    @@ -582,7 +582,8 @@ static int tcp_v6_md5_do_add(struct sock
    } else {
    /* reallocate new list if current one is full. */
    if (!tp->md5sig_info) {
    - tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
    + tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
    + sk_allocation(sk, GFP_ATOMIC));
    if (!tp->md5sig_info) {
    kfree(newkey);
    return -ENOMEM;
    @@ -595,7 +596,8 @@ static int tcp_v6_md5_do_add(struct sock
    }
    if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) {
    keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) *
    - (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC);
    + (tp->md5sig_info->entries6 + 1)),
    + sk_allocation(sk, GFP_ATOMIC));

    if (!keys) {
    tcp_free_md5sig_pool();
    @@ -719,7 +721,8 @@ static int tcp_v6_parse_md5_keys (struct
    struct tcp_sock *tp = tcp_sk(sk);
    struct tcp_md5sig_info *p;

    - p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
    + p = kzalloc(sizeof(struct tcp_md5sig_info),
    + sk_allocation(sk, GFP_KERNEL));
    if (!p)
    return -ENOMEM;

    @@ -952,6 +955,7 @@ static void tcp_v6_send_reset(struct soc
    #ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
    #endif
    + gfp_t gfp_mask = GFP_ATOMIC;

    if (th->rst)
    return;
    @@ -969,13 +973,16 @@ static void tcp_v6_send_reset(struct soc
    tot_len += TCPOLEN_MD5SIG_ALIGNED;
    #endif

    + if (sk)
    + gfp_mask = sk_allocation(skb->sk, gfp_mask);
    +
    /*
    * We need to grab some memory, and put together an RST,
    * and then put it into the queue to be sent.
    */

    buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
    - GFP_ATOMIC);
    + sk_allocation(sk, GFP_ATOMIC));
    if (buff == NULL)
    return;

    @@ -1063,7 +1070,7 @@ static void tcp_v6_send_ack(struct sk_bu
    #endif

    buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
    - GFP_ATOMIC);
    + sk_allocation(ctl_sk, GFP_ATOMIC));
    if (buff == NULL)
    return;

    Index: linux-2.6/net/ipv4/tcp.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/tcp.c
    +++ linux-2.6/net/ipv4/tcp.c
    @@ -635,7 +635,8 @@ struct sk_buff *sk_stream_alloc_skb(stru
    /* The TCP header must be at least 32-bit aligned. */
    size = ALIGN(size, 4);

    - skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
    + skb = alloc_skb_fclone(size + sk->sk_prot->max_header,
    + sk_allocation(sk, gfp));
    if (skb) {
    if (sk_wmem_schedule(sk, skb->truesize)) {
    /*


  3. [PATCH 02/32] mm: serialize access to min_free_kbytes

    There is a small race between the procfs caller and the memory hotplug caller
    of setup_per_zone_pages_min(). Not a big deal, but the next patch will add yet
    another caller. Time to close the gap.
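
    For orientation, these are the two call paths that race (2.6.27-era
    callers, simplified and reproduced from memory, so treat the exact
    signatures as an assumption):

        /* 1) procfs: a write to /proc/sys/vm/min_free_kbytes */
        int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
                loff_t *ppos)
        {
                proc_dointvec(table, write, file, buffer, length, ppos);
                if (write)
                        setup_per_zone_pages_min();     /* racer #1 */
                return 0;
        }

        /*
         * 2) memory hotplug: online_pages() likewise ends up calling
         *    setup_per_zone_pages_min() after resizing a zone (racer #2).
         *    The patch below serializes both behind min_free_lock.
         */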

    Signed-off-by: Peter Zijlstra
    Reviewed-by: Pekka Enberg
    ---
    mm/page_alloc.c | 16 +++++++++++++---
    1 file changed, 13 insertions(+), 3 deletions(-)

    Index: linux-2.6/mm/page_alloc.c
    ===================================================================
    --- linux-2.6.orig/mm/page_alloc.c
    +++ linux-2.6/mm/page_alloc.c
    @@ -118,6 +118,7 @@ static char * const zone_names[MAX_NR_ZO
    "Movable",
    };

    +static DEFINE_SPINLOCK(min_free_lock);
    int min_free_kbytes = 1024;

    unsigned long __meminitdata nr_kernel_pages;
    @@ -4333,12 +4334,12 @@ static void setup_per_zone_lowmem_reserv
    }

    /**
    - * setup_per_zone_pages_min - called when min_free_kbytes changes.
    + * __setup_per_zone_pages_min - called when min_free_kbytes changes.
    *
    * Ensures that the pages_{min,low,high} values for each zone are set correctly
    * with respect to min_free_kbytes.
    */
    -void setup_per_zone_pages_min(void)
    +static void __setup_per_zone_pages_min(void)
    {
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 0;
    @@ -4433,6 +4434,15 @@ void setup_per_zone_inactive_ratio(void)
    }
    }

    +void setup_per_zone_pages_min(void)
    +{
    + unsigned long flags;
    +
    + spin_lock_irqsave(&min_free_lock, flags);
    + __setup_per_zone_pages_min();
    + spin_unlock_irqrestore(&min_free_lock, flags);
    +}
    +
    /*
    * Initialise min_free_kbytes.
    *
    @@ -4468,7 +4478,7 @@ static int __init init_per_zone_pages_mi
    min_free_kbytes = 128;
    if (min_free_kbytes > 65536)
    min_free_kbytes = 65536;
    - setup_per_zone_pages_min();
    + __setup_per_zone_pages_min();
    setup_per_zone_lowmem_reserve();
    setup_per_zone_inactive_ratio();
    return 0;


  4. [PATCH 27/32] mm: methods for teaching filesystems about PG_swapcache pages

    In order to teach filesystems to handle swap cache pages, three new page
    functions are introduced:

    pgoff_t page_file_index(struct page *);
    loff_t page_file_offset(struct page *);
    struct address_space *page_file_mapping(struct page *);

    page_file_index() - gives the offset of this page in the file in
    PAGE_CACHE_SIZE blocks. Like page->index for regular mapped pages, it
    also gives the correct index for PG_swapcache pages.

    page_file_offset() - uses page_file_index(), so that it gives the
    expected result even for PG_swapcache pages.

    page_file_mapping() - gives the mapping backing the actual page; that
    is, for swap cache pages it gives swap_file->f_mapping.
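
    A minimal sketch of how a filesystem write path would consume these
    helpers (all example_* names are made up for illustration):

        static int example_writepage(struct page *page,
                                     struct writeback_control *wbc)
        {
                struct inode *inode = page_file_mapping(page)->host;
                loff_t offset = page_file_offset(page);
                pgoff_t index = page_file_index(page);

                /*
                 * For a regular pagecache page these equal page->mapping,
                 * page->index << PAGE_CACHE_SHIFT and page->index; for a
                 * PG_swapcache page they resolve against the swap file.
                 */
                return example_write_range(inode, page, index, offset, wbc);
        }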

    Signed-off-by: Peter Zijlstra
    ---
    include/linux/mm.h | 25 +++++++++++++++++++++++++
    include/linux/pagemap.h | 5 +++++
    mm/swapfile.c | 19 +++++++++++++++++++
    3 files changed, 49 insertions(+)

    Index: linux-2.6/include/linux/mm.h
    ===================================================================
    --- linux-2.6.orig/include/linux/mm.h
    +++ linux-2.6/include/linux/mm.h
    @@ -598,6 +598,17 @@ static inline struct address_space *page
    return mapping;
    }

    +extern struct address_space *__page_file_mapping(struct page *);
    +
    +static inline
    +struct address_space *page_file_mapping(struct page *page)
    +{
    + if (unlikely(PageSwapCache(page)))
    + return __page_file_mapping(page);
    +
    + return page->mapping;
    +}
    +
    static inline int PageAnon(struct page *page)
    {
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
    @@ -614,6 +625,20 @@ static inline pgoff_t page_index(struct
    return page->index;
    }

    +extern pgoff_t __page_file_index(struct page *page);
    +
    +/*
    + * Return the file index of the page. Regular pagecache pages use ->index
    + * whereas swapcache pages use swp_offset(->private)
    + */
    +static inline pgoff_t page_file_index(struct page *page)
    +{
    + if (unlikely(PageSwapCache(page)))
    + return __page_file_index(page);
    +
    + return page->index;
    +}
    +
    /*
    * The atomic page->_mapcount, like _count, starts from -1:
    * so that transitions both from it and to it can be tracked,
    Index: linux-2.6/include/linux/pagemap.h
    ===================================================================
    --- linux-2.6.orig/include/linux/pagemap.h
    +++ linux-2.6/include/linux/pagemap.h
    @@ -148,6 +148,11 @@ static inline loff_t page_offset(struct
    return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
    }

    +static inline loff_t page_file_offset(struct page *page)
    +{
    + return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
    +}
    +
    static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
    unsigned long address)
    {
    Index: linux-2.6/mm/swapfile.c
    ===================================================================
    --- linux-2.6.orig/mm/swapfile.c
    +++ linux-2.6/mm/swapfile.c
    @@ -1828,6 +1828,25 @@ struct swap_info_struct *page_swap_info(
    }

    /*
    + * out-of-line __page_file_ methods to avoid include hell.
    + */
    +
    +struct address_space *__page_file_mapping(struct page *page)
    +{
    + VM_BUG_ON(!PageSwapCache(page));
    + return page_swap_info(page)->swap_file->f_mapping;
    +}
    +EXPORT_SYMBOL_GPL(__page_file_mapping);
    +
    +pgoff_t __page_file_index(struct page *page)
    +{
    + swp_entry_t swap = { .val = page_private(page) };
    + VM_BUG_ON(!PageSwapCache(page));
    + return swp_offset(swap);
    +}
    +EXPORT_SYMBOL_GPL(__page_file_index);
    +
    +/*
    * swap_lock prevents swap_map being freed. Don't grab an extra
    * reference on the swaphandle, it doesn't matter if it becomes unused.
    */


  5. [PATCH 28/32] nfs: remove mempools

    With the introduction of shared dirty page accounting in 2.6.19, NFS
    should not be able to surprise the VM with all dirty pages. Thus it
    should always be able to free some memory; hence there is no more need
    for mempools.

    Signed-off-by: Peter Zijlstra
    ---
    fs/nfs/read.c | 15 +++------------
    fs/nfs/write.c | 27 +++++----------------------
    2 files changed, 8 insertions(+), 34 deletions(-)

    Index: linux-2.6/fs/nfs/read.c
    ===================================================================
    --- linux-2.6.orig/fs/nfs/read.c
    +++ linux-2.6/fs/nfs/read.c
    @@ -33,13 +33,10 @@ static const struct rpc_call_ops nfs_rea
    static const struct rpc_call_ops nfs_read_full_ops;

    static struct kmem_cache *nfs_rdata_cachep;
    -static mempool_t *nfs_rdata_mempool;
    -
    -#define MIN_POOL_READ (32)

    struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
    {
    - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
    + struct nfs_read_data *p = kmem_cache_alloc(nfs_rdata_cachep, GFP_NOFS);

    if (p) {
    memset(p, 0, sizeof(*p));
    @@ -50,7 +47,7 @@ struct nfs_read_data *nfs_readdata_alloc
    else {
    p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
    if (!p->pagevec) {
    - mempool_free(p, nfs_rdata_mempool);
    + kmem_cache_free(nfs_rdata_cachep, p);
    p = NULL;
    }
    }
    @@ -62,7 +59,7 @@ static void nfs_readdata_free(struct nfs
    {
    if (p && (p->pagevec != &p->page_array[0]))
    kfree(p->pagevec);
    - mempool_free(p, nfs_rdata_mempool);
    + kmem_cache_free(nfs_rdata_cachep, p);
    }

    void nfs_readdata_release(void *data)
    @@ -614,16 +611,10 @@ int __init nfs_init_readpagecache(void)
    if (nfs_rdata_cachep == NULL)
    return -ENOMEM;

    - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
    - nfs_rdata_cachep);
    - if (nfs_rdata_mempool == NULL)
    - return -ENOMEM;
    -
    return 0;
    }

    void nfs_destroy_readpagecache(void)
    {
    - mempool_destroy(nfs_rdata_mempool);
    kmem_cache_destroy(nfs_rdata_cachep);
    }
    Index: linux-2.6/fs/nfs/write.c
    ===================================================================
    --- linux-2.6.orig/fs/nfs/write.c
    +++ linux-2.6/fs/nfs/write.c
    @@ -28,9 +28,6 @@

    #define NFSDBG_FACILITY NFSDBG_PAGECACHE

    -#define MIN_POOL_WRITE (32)
    -#define MIN_POOL_COMMIT (4)
    -
    /*
    * Local function declarations
    */
    @@ -45,12 +42,10 @@ static const struct rpc_call_ops nfs_wri
    static const struct rpc_call_ops nfs_commit_ops;

    static struct kmem_cache *nfs_wdata_cachep;
    -static mempool_t *nfs_wdata_mempool;
    -static mempool_t *nfs_commit_mempool;

    struct nfs_write_data *nfs_commitdata_alloc(void)
    {
    - struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
    + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);

    if (p) {
    memset(p, 0, sizeof(*p));
    @@ -63,12 +58,12 @@ void nfs_commit_free(struct nfs_write_da
    {
    if (p && (p->pagevec != &p->page_array[0]))
    kfree(p->pagevec);
    - mempool_free(p, nfs_commit_mempool);
    + kmem_cache_free(nfs_wdata_cachep, p);
    }

    struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
    {
    - struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
    + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);

    if (p) {
    memset(p, 0, sizeof(*p));
    @@ -79,7 +74,7 @@ struct nfs_write_data *nfs_writedata_all
    else {
    p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
    if (!p->pagevec) {
    - mempool_free(p, nfs_wdata_mempool);
    + kmem_cache_free(nfs_wdata_cachep, p);
    p = NULL;
    }
    }
    @@ -91,7 +86,7 @@ static void nfs_writedata_free(struct nf
    {
    if (p && (p->pagevec != &p->page_array[0]))
    kfree(p->pagevec);
    - mempool_free(p, nfs_wdata_mempool);
    + kmem_cache_free(nfs_wdata_cachep, p);
    }

    void nfs_writedata_release(void *data)
    @@ -1552,16 +1547,6 @@ int __init nfs_init_writepagecache(void)
    if (nfs_wdata_cachep == NULL)
    return -ENOMEM;

    - nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
    - nfs_wdata_cachep);
    - if (nfs_wdata_mempool == NULL)
    - return -ENOMEM;
    -
    - nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
    - nfs_wdata_cachep);
    - if (nfs_commit_mempool == NULL)
    - return -ENOMEM;
    -
    /*
    * NFS congestion size, scale with available memory.
    *
    @@ -1587,8 +1572,6 @@ int __init nfs_init_writepagecache(void)

    void nfs_destroy_writepagecache(void)
    {
    - mempool_destroy(nfs_commit_mempool);
    - mempool_destroy(nfs_wdata_mempool);
    kmem_cache_destroy(nfs_wdata_cachep);
    }



  6. [PATCH 03/32] net: ipv6: clean up ip6_route_net_init() error handling

    ip6_route_net_init() error handling looked less than solid, fix 'er up.
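
    The fix converts to the usual unwind-label idiom, in miniature
    (illustrative only; the structs and names below are made up):

        struct foo { int a; };
        struct bar { int b; };

        static int example_init(struct foo **fp, struct bar **bp)
        {
                *fp = kzalloc(sizeof(**fp), GFP_KERNEL);
                if (!*fp)
                        goto out;

                *bp = kzalloc(sizeof(**bp), GFP_KERNEL);
                if (!*bp)
                        goto out_free_foo;

                return 0;

        out_free_foo:
                kfree(*fp);
        out:
                return -ENOMEM;
        }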

    Signed-off-by: Peter Zijlstra
    ---
    net/ipv6/route.c | 19 ++++++++++---------
    1 file changed, 10 insertions(+), 9 deletions(-)

    Index: linux-2.6/net/ipv6/route.c
    ===================================================================
    --- linux-2.6.orig/net/ipv6/route.c
    +++ linux-2.6/net/ipv6/route.c
    @@ -2611,10 +2611,8 @@ static int ip6_route_net_init(struct net
    net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
    sizeof(*net->ipv6.ip6_prohibit_entry),
    GFP_KERNEL);
    - if (!net->ipv6.ip6_prohibit_entry) {
    - kfree(net->ipv6.ip6_null_entry);
    - goto out;
    - }
    + if (!net->ipv6.ip6_prohibit_entry)
    + goto out_ip6_null_entry;
    net->ipv6.ip6_prohibit_entry->u.dst.path =
    (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
    net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
    @@ -2622,11 +2620,8 @@ static int ip6_route_net_init(struct net
    net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
    sizeof(*net->ipv6.ip6_blk_hole_entry),
    GFP_KERNEL);
    - if (!net->ipv6.ip6_blk_hole_entry) {
    - kfree(net->ipv6.ip6_null_entry);
    - kfree(net->ipv6.ip6_prohibit_entry);
    - goto out;
    - }
    + if (!net->ipv6.ip6_blk_hole_entry)
    + goto out_ip6_prohibit_entry;
    net->ipv6.ip6_blk_hole_entry->u.dst.path =
    (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
    net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
    @@ -2642,6 +2637,12 @@ static int ip6_route_net_init(struct net
    out:
    return ret;

    +#ifdef CONFIG_IPV6_MULTIPLE_TABLES
    +out_ip6_prohibit_entry:
    + kfree(net->ipv6.ip6_prohibit_entry);
    +out_ip6_null_entry:
    + kfree(net->ipv6.ip6_null_entry);
    +#endif
    out_ip6_dst_ops:
    release_net(net->ipv6.ip6_dst_ops->dst_net);
    kfree(net->ipv6.ip6_dst_ops);


  7. [PATCH 30/32] nfs: disable data cache revalidation for swapfiles

    Do as Trond suggested:
    http://lkml.org/lkml/2006/8/25/348

    Disable NFS data cache revalidation on swap files since it doesn't really
    make sense to have other clients change the file while you are using it.

    Thereby we can stop setting PG_private on swap pages, since there ought to
    be no further races with invalidate_inode_pages2() to deal with.

    And since we cannot set PG_private we cannot use page->private (which is
    already used by PG_swapcache pages anyway) to store the nfs_page. Thus
    augment the new nfs_page_find_request logic.

    Signed-off-by: Peter Zijlstra
    ---
    fs/nfs/inode.c | 6 ++++
    fs/nfs/write.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-----------
    2 files changed, 64 insertions(+), 13 deletions(-)

    Index: linux-2.6/fs/nfs/inode.c
    ===================================================================
    --- linux-2.6.orig/fs/nfs/inode.c
    +++ linux-2.6/fs/nfs/inode.c
    @@ -824,6 +824,12 @@ int nfs_revalidate_mapping_nolock(struct
    struct nfs_inode *nfsi = NFS_I(inode);
    int ret = 0;

    + /*
    + * swapfiles are not supposed to be shared.
    + */
    + if (IS_SWAPFILE(inode))
    + goto out;
    +
    if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
    || nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
    ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
    Index: linux-2.6/fs/nfs/write.c
    ===================================================================
    --- linux-2.6.orig/fs/nfs/write.c
    +++ linux-2.6/fs/nfs/write.c
    @@ -101,25 +101,62 @@ static void nfs_context_set_write_error(
    set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
    }

    -static struct nfs_page *nfs_page_find_request_locked(struct page *page)
    +static struct nfs_page *
    +__nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page, int get)
    {
    struct nfs_page *req = NULL;

    - if (PagePrivate(page)) {
    + if (PagePrivate(page))
    req = (struct nfs_page *)page_private(page);
    - if (req != NULL)
    - kref_get(&req->wb_kref);
    - }
    + else if (unlikely(PageSwapCache(page)))
    + req = radix_tree_lookup(&nfsi->nfs_page_tree, page_file_index(page));
    +
    + if (get && req)
    + kref_get(&req->wb_kref);
    +
    return req;
    }

    +static inline struct nfs_page *
    +nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
    +{
    + return __nfs_page_find_request_locked(nfsi, page, 1);
    +}
    +
    +static int __nfs_page_has_request(struct page *page)
    +{
    + struct inode *inode = page_file_mapping(page)->host;
    + struct nfs_page *req = NULL;
    +
    + spin_lock(&inode->i_lock);
    + req = __nfs_page_find_request_locked(NFS_I(inode), page, 0);
    + spin_unlock(&inode->i_lock);
    +
    + /*
    + * hole here plugged by the caller holding onto PG_locked
    + */
    +
    + return req != NULL;
    +}
    +
    +static inline int nfs_page_has_request(struct page *page)
    +{
    + if (PagePrivate(page))
    + return 1;
    +
    + if (unlikely(PageSwapCache(page)))
    + return __nfs_page_has_request(page);
    +
    + return 0;
    +}
    +
    static struct nfs_page *nfs_page_find_request(struct page *page)
    {
    struct inode *inode = page_file_mapping(page)->host;
    struct nfs_page *req = NULL;

    spin_lock(&inode->i_lock);
    - req = nfs_page_find_request_locked(page);
    + req = nfs_page_find_request_locked(NFS_I(inode), page);
    spin_unlock(&inode->i_lock);
    return req;
    }
    @@ -220,7 +257,7 @@ static int nfs_page_async_flush(struct n

    spin_lock(&inode->i_lock);
    for(;;) {
    - req = nfs_page_find_request_locked(page);
    + req = nfs_page_find_request_locked(NFS_I(inode), page);
    if (req == NULL) {
    spin_unlock(&inode->i_lock);
    return 0;
    @@ -343,8 +380,14 @@ static int nfs_inode_add_request(struct
    if (nfs_have_delegation(inode, FMODE_WRITE))
    nfsi->change_attr++;
    }
    - SetPagePrivate(req->wb_page);
    - set_page_private(req->wb_page, (unsigned long)req);
    + /*
    + * Swap-space should not get truncated. Hence no need to plug the race
    + * with invalidate/truncate.
    + */
    + if (likely(!PageSwapCache(req->wb_page))) {
    + SetPagePrivate(req->wb_page);
    + set_page_private(req->wb_page, (unsigned long)req);
    + }
    nfsi->npages++;
    kref_get(&req->wb_kref);
    radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
    @@ -366,8 +409,10 @@ static void nfs_inode_remove_request(str
    BUG_ON (!NFS_WBACK_BUSY(req));

    spin_lock(&inode->i_lock);
    - set_page_private(req->wb_page, 0);
    - ClearPagePrivate(req->wb_page);
    + if (likely(!PageSwapCache(req->wb_page))) {
    + set_page_private(req->wb_page, 0);
    + ClearPagePrivate(req->wb_page);
    + }
    radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
    nfsi->npages--;
    if (!nfsi->npages) {
    @@ -571,7 +616,7 @@ static struct nfs_page *nfs_try_to_updat
    spin_lock(&inode->i_lock);

    for (;;) {
    - req = nfs_page_find_request_locked(page);
    + req = nfs_page_find_request_locked(NFS_I(inode), page);
    if (req == NULL)
    goto out_unlock;

    @@ -1482,7 +1527,7 @@ int nfs_wb_page_cancel(struct inode *ino
    if (ret < 0)
    goto out;
    }
    - if (!PagePrivate(page))
    + if (!nfs_page_has_request(page))
    return 0;
    ret = nfs_sync_mapping_wait(page_file_mapping(page), &wbc, FLUSH_INVALIDATE);
    out:


  8. [PATCH 24/32] netfilter: NF_QUEUE vs emergency skbs

    To avoid memory getting stuck waiting for userspace, drop all emergency
    packets. This of course requires the regular storage route to not
    include an NF_QUEUE target ;-)

    Signed-off-by: Peter Zijlstra
    ---
    net/netfilter/core.c | 3 +++
    1 file changed, 3 insertions(+)

    Index: linux-2.6/net/netfilter/core.c
    ================================================== =================
    --- linux-2.6.orig/net/netfilter/core.c
    +++ linux-2.6/net/netfilter/core.c
    @@ -176,9 +176,12 @@ next_hook:
    ret = 1;
    goto unlock;
    } else if (verdict == NF_DROP) {
    +drop:
    kfree_skb(skb);
    ret = -EPERM;
    } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
    + if (skb_emergency(*pskb))
    + goto drop;
    if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
    verdict >> NF_VERDICT_BITS))
    goto next_hook;


  9. [PATCH 17/32] net: packet split receive api

    Add some packet-split receive hooks.

    For one, this allows NUMA-node-affine page allocations. Later on these
    hooks will be extended to do emergency reserve allocations for
    fragments.
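
    A minimal sketch of the driver-side pattern these hooks enable (the
    example_* names are made up; netdev_alloc_page() and skb_add_rx_frag()
    are the helpers added below):

        struct example_rx_buffer {
                struct page *page;
        };

        /* refill: allocate the fragment page node-local to the device */
        static int example_refill(struct net_device *netdev,
                                  struct example_rx_buffer *buf)
        {
                buf->page = netdev_alloc_page(netdev);
                if (!buf->page)
                        return -ENOMEM;
                return 0;
        }

        /* completion: attach the page and fix up len/data_len/truesize */
        static void example_receive_frag(struct sk_buff *skb, int i,
                                         struct example_rx_buffer *buf,
                                         unsigned int len)
        {
                skb_add_rx_frag(skb, i, buf->page, 0, len);
                buf->page = NULL;
        }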

    Signed-off-by: Peter Zijlstra
    ---
    drivers/net/bnx2.c | 8 +++-----
    drivers/net/e1000/e1000_main.c | 8 ++------
    drivers/net/e1000e/netdev.c | 7 ++-----
    drivers/net/igb/igb_main.c | 9 ++-------
    drivers/net/ixgbe/ixgbe_main.c | 10 +++-------
    drivers/net/sky2.c | 16 ++++++----------
    include/linux/skbuff.h | 23 +++++++++++++++++++++++
    net/core/skbuff.c | 20 ++++++++++++++++++++
    8 files changed, 61 insertions(+), 40 deletions(-)

    Index: linux-2.6/drivers/net/e1000/e1000_main.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/e1000/e1000_main.c
    +++ linux-2.6/drivers/net/e1000/e1000_main.c
    @@ -4347,12 +4347,8 @@ static bool e1000_clean_rx_irq_ps(struct
    pci_unmap_page(pdev, ps_page_dma->ps_page_dma[j],
    PAGE_SIZE, PCI_DMA_FROMDEVICE);
    ps_page_dma->ps_page_dma[j] = 0;
    - skb_fill_page_desc(skb, j, ps_page->ps_page[j], 0,
    - length);
    + skb_add_rx_frag(skb, j, ps_page->ps_page[j], 0, length);
    ps_page->ps_page[j] = NULL;
    - skb->len += length;
    - skb->data_len += length;
    - skb->truesize += length;
    }

    /* strip the ethernet crc, problem is we're using pages now so
    @@ -4551,7 +4547,7 @@ static void e1000_alloc_rx_buffers_ps(st
    if (j < adapter->rx_ps_pages) {
    if (likely(!ps_page->ps_page[j])) {
    ps_page->ps_page[j] =
    - alloc_page(GFP_ATOMIC);
    + netdev_alloc_page(netdev);
    if (unlikely(!ps_page->ps_page[j])) {
    adapter->alloc_rx_buff_failed++;
    goto no_buffers;
    Index: linux-2.6/include/linux/skbuff.h
    ===================================================================
    --- linux-2.6.orig/include/linux/skbuff.h
    +++ linux-2.6/include/linux/skbuff.h
    @@ -829,6 +829,9 @@ static inline void skb_fill_page_desc(st
    skb_shinfo(skb)->nr_frags = i + 1;
    }

    +extern void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page,
    + int off, int size);
    +
    #define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags)
    #define SKB_FRAG_ASSERT(skb) BUG_ON(skb_shinfo(skb)->frag_list)
    #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb))
    @@ -1243,6 +1246,26 @@ static inline struct sk_buff *netdev_all
    return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
    }

    +extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
    +
    +/**
    + * netdev_alloc_page - allocate a page for ps-rx on a specific device
    + * @dev: network device to receive on
    + *
    + * Allocate a new page node local to the specified device.
    + *
    + * %NULL is returned if there is no free memory.
    + */
    +static inline struct page *netdev_alloc_page(struct net_device *dev)
    +{
    + return __netdev_alloc_page(dev, GFP_ATOMIC);
    +}
    +
    +static inline void netdev_free_page(struct net_device *dev, struct page *page)
    +{
    + __free_page(page);
    +}
    +
    /**
    * skb_clone_writable - is the header of a clone writable
    * @skb: buffer to check
    Index: linux-2.6/net/core/skbuff.c
    ===================================================================
    --- linux-2.6.orig/net/core/skbuff.c
    +++ linux-2.6/net/core/skbuff.c
    @@ -263,6 +263,26 @@ struct sk_buff *__netdev_alloc_skb(struc
    return skb;
    }

    +struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
    +{
    + int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
    + struct page *page;
    +
    + page = alloc_pages_node(node, gfp_mask, 0);
    + return page;
    +}
    +EXPORT_SYMBOL(__netdev_alloc_page);
    +
    +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
    + int size)
    +{
    + skb_fill_page_desc(skb, i, page, off, size);
    + skb->len += size;
    + skb->data_len += size;
    + skb->truesize += size;
    +}
    +EXPORT_SYMBOL(skb_add_rx_frag);
    +
    /**
    * dev_alloc_skb - allocate an skbuff for receiving
    * @length: length to allocate
    Index: linux-2.6/drivers/net/sky2.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/sky2.c
    +++ linux-2.6/drivers/net/sky2.c
    @@ -1272,7 +1272,7 @@ static struct sk_buff *sky2_rx_alloc(str
    }

    for (i = 0; i < sky2->rx_nfrags; i++) {
    - struct page *page = alloc_page(GFP_ATOMIC);
    + struct page *page = netdev_alloc_page(sky2->netdev);

    if (!page)
    goto free_partial;
    @@ -2141,8 +2141,8 @@ static struct sk_buff *receive_copy(stru
    }

    /* Adjust length of skb with fragments to match received data */
    -static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
    - unsigned int length)
    +static void skb_put_frags(struct sky2_port *sky2, struct sk_buff *skb,
    + unsigned int hdr_space, unsigned int length)
    {
    int i, num_frags;
    unsigned int size;
    @@ -2159,15 +2159,11 @@ static void skb_put_frags(struct sk_buff

    if (length == 0) {
    /* don't need this page */
    - __free_page(frag->page);
    + netdev_free_page(sky2->netdev, frag->page);
    --skb_shinfo(skb)->nr_frags;
    } else {
    size = min(length, (unsigned) PAGE_SIZE);
    -
    - frag->size = size;
    - skb->data_len += size;
    - skb->truesize += size;
    - skb->len += size;
    + skb_add_rx_frag(skb, i, frag->page, 0, size);
    length -= size;
    }
    }
    @@ -2194,7 +2190,7 @@ static struct sk_buff *receive_new(struc
    sky2_rx_map_skb(sky2->hw->pdev, re, hdr_space);

    if (skb_shinfo(skb)->nr_frags)
    - skb_put_frags(skb, hdr_space, length);
    + skb_put_frags(sky2, skb, hdr_space, length);
    else
    skb_put(skb, length);
    return skb;
    Index: linux-2.6/drivers/net/bnx2.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/bnx2.c
    +++ linux-2.6/drivers/net/bnx2.c
    @@ -2472,7 +2472,7 @@ bnx2_alloc_rx_page(struct bnx2 *bp, stru
    struct sw_pg *rx_pg = &rxr->rx_pg_ring[index];
    struct rx_bd *rxbd =
    &rxr->rx_pg_desc_ring[RX_RING(index)][RX_IDX(index)];
    - struct page *page = alloc_page(GFP_ATOMIC);
    + struct page *page = netdev_alloc_page(bp->dev);

    if (!page)
    return -ENOMEM;
    @@ -2497,7 +2497,7 @@ bnx2_free_rx_page(struct bnx2 *bp, struc
    pci_unmap_page(bp->pdev, pci_unmap_addr(rx_pg, mapping), PAGE_SIZE,
    PCI_DMA_FROMDEVICE);

    - __free_page(page);
    + netdev_free_page(bp->dev, page);
    rx_pg->page = NULL;
    }

    @@ -2828,9 +2828,7 @@ bnx2_rx_skb(struct bnx2 *bp, struct bnx2
    }

    frag_size -= frag_len;
    - skb->data_len += frag_len;
    - skb->truesize += frag_len;
    - skb->len += frag_len;
    + skb_add_rx_frag(skb, i, rx_pg->page, 0, frag_len);

    pg_prod = NEXT_RX_BD(pg_prod);
    pg_cons = RX_PG_RING_IDX(NEXT_RX_BD(pg_cons));
    Index: linux-2.6/drivers/net/e1000e/netdev.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/e1000e/netdev.c
    +++ linux-2.6/drivers/net/e1000e/netdev.c
    @@ -258,7 +258,7 @@ static void e1000_alloc_rx_buffers_ps(st
    continue;
    }
    if (!ps_page->page) {
    - ps_page->page = alloc_page(GFP_ATOMIC);
    + ps_page->page = netdev_alloc_page(netdev);
    if (!ps_page->page) {
    adapter->alloc_rx_buff_failed++;
    goto no_buffers;
    @@ -818,11 +818,8 @@ static bool e1000_clean_rx_irq_ps(struct
    pci_unmap_page(pdev, ps_page->dma, PAGE_SIZE,
    PCI_DMA_FROMDEVICE);
    ps_page->dma = 0;
    - skb_fill_page_desc(skb, j, ps_page->page, 0, length);
    + skb_add_rx_frag(skb, j, ps_page->page, 0, length);
    ps_page->page = NULL;
    - skb->len += length;
    - skb->data_len += length;
    - skb->truesize += length;
    }

    copydone:
    Index: linux-2.6/drivers/net/igb/igb_main.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/igb/igb_main.c
    +++ linux-2.6/drivers/net/igb/igb_main.c
    @@ -3874,7 +3874,7 @@ static bool igb_clean_rx_irq_adv(struct
    PAGE_SIZE / 2, PCI_DMA_FROMDEVICE);
    buffer_info->page_dma = 0;

    - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++,
    + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags++,
    buffer_info->page,
    buffer_info->page_offset,
    length);
    @@ -3884,11 +3884,6 @@ static bool igb_clean_rx_irq_adv(struct
    buffer_info->page = NULL;
    else
    get_page(buffer_info->page);
    -
    - skb->len += length;
    - skb->data_len += length;
    -
    - skb->truesize += length;
    }
    send_up:
    i++;
    @@ -3982,7 +3977,7 @@ static void igb_alloc_rx_buffers_adv(str

    if (adapter->rx_ps_hdr_size && !buffer_info->page_dma) {
    if (!buffer_info->page) {
    - buffer_info->page = alloc_page(GFP_ATOMIC);
    + buffer_info->page = netdev_alloc_page(netdev);
    if (!buffer_info->page) {
    adapter->alloc_rx_buff_failed++;
    goto no_buffers;
    Index: linux-2.6/drivers/net/ixgbe/ixgbe_main.c
    ===================================================================
    --- linux-2.6.orig/drivers/net/ixgbe/ixgbe_main.c
    +++ linux-2.6/drivers/net/ixgbe/ixgbe_main.c
    @@ -495,7 +495,7 @@ static void ixgbe_alloc_rx_buffers(struc

    if (!bi->page &&
    (adapter->flags & IXGBE_FLAG_RX_PS_ENABLED)) {
    - bi->page = alloc_page(GFP_ATOMIC);
    + bi->page = netdev_alloc_page(netdev);
    if (!bi->page) {
    adapter->alloc_rx_page_failed++;
    goto no_buffers;
    @@ -622,13 +622,9 @@ static bool ixgbe_clean_rx_irq(struct ix
    pci_unmap_page(pdev, rx_buffer_info->page_dma,
    PAGE_SIZE, PCI_DMA_FROMDEVICE);
    rx_buffer_info->page_dma = 0;
    - skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
    - rx_buffer_info->page, 0, upper_len);
    + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
    + rx_buffer_info->page, 0, upper_len);
    rx_buffer_info->page = NULL;
    -
    - skb->len += upper_len;
    - skb->data_len += upper_len;
    - skb->truesize += upper_len;
    }

    i++;


  10. [PATCH 25/32] netvm: skb processing

    In order to make sure emergency packets receive all the memory needed
    to proceed, ensure processing of emergency SKBs happens under
    PF_MEMALLOC.

    Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing.

    Skip taps, since those are user-space again.
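
    For context, the backlog processing this targets is the per-skb loop in
    __release_sock(); roughly (2.6.27-era shape, reproduced from memory, so
    treat the details as an assumption):

        /* net/core/sock.c: __release_sock(), per-skb loop */
        do {
                struct sk_buff *next = skb->next;

                skb->next = NULL;
                sk_backlog_rcv(sk, skb); /* emergency skbs go via __sk_backlog_rcv() */
                skb = next;
        } while (skb != NULL);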

    Signed-off-by: Peter Zijlstra
    ---
    include/net/sock.h | 5 ++++
    net/core/dev.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++-----
    net/core/sock.c | 16 ++++++++++++++
    3 files changed, 73 insertions(+), 5 deletions(-)

    Index: linux-2.6/net/core/dev.c
    ===================================================================
    --- linux-2.6.orig/net/core/dev.c
    +++ linux-2.6/net/core/dev.c
    @@ -2162,6 +2162,30 @@ void netif_nit_deliver(struct sk_buff *s
    rcu_read_unlock();
    }

    +/*
    + * Filter the protocols for which the reserves are adequate.
    + *
    + * Before adding a protocol make sure that it is either covered by the existing
    + * reserves, or add reserves covering the memory need of the new protocol's
    + * packet processing.
    + */
    +static int skb_emergency_protocol(struct sk_buff *skb)
    +{
    + if (skb_emergency(skb))
    + switch (skb->protocol) {
    + case __constant_htons(ETH_P_ARP):
    + case __constant_htons(ETH_P_IP):
    + case __constant_htons(ETH_P_IPV6):
    + case __constant_htons(ETH_P_8021Q):
    + break;
    +
    + default:
    + return 0;
    + }
    +
    + return 1;
    +}
    +
    /**
    * netif_receive_skb - process receive buffer from network
    * @skb: buffer to process
    @@ -2184,10 +2208,23 @@ int netif_receive_skb(struct sk_buff *sk
    struct net_device *null_or_orig;
    int ret = NET_RX_DROP;
    __be16 type;
    + unsigned long pflags = current->flags;
    +
    + /* Emergency skb are special, they should
    + * - be delivered to SOCK_MEMALLOC sockets only
    + * - stay away from userspace
    + * - have bounded memory usage
    + *
    + * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
    + * This saves us from propagating the allocation context down to all
    + * allocation sites.
    + */
    + if (skb_emergency(skb))
    + current->flags |= PF_MEMALLOC;

    /* if we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
    - return NET_RX_DROP;
    + goto out;

    if (!skb->tstamp.tv64)
    net_timestamp(skb);
    @@ -2225,6 +2262,9 @@ int netif_receive_skb(struct sk_buff *sk
    }
    #endif

    + if (skb_emergency(skb))
    + goto skip_taps;
    +
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
    if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
    ptype->dev == orig_dev) {
    @@ -2234,19 +2274,23 @@ int netif_receive_skb(struct sk_buff *sk
    }
    }

    +skip_taps:
    #ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
    - goto out;
    + goto unlock;
    ncls:
    #endif

    + if (!skb_emergency_protocol(skb))
    + goto drop;
    +
    skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
    - goto out;
    + goto unlock;
    skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
    - goto out;
    + goto unlock;

    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
    @@ -2263,6 +2307,7 @@ ncls:
    if (pt_prev) {
    ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
    +drop:
    kfree_skb(skb);
    /* Jamal, now you will not able to escape explaining
    * me how you were going to use this. :-)
    @@ -2270,8 +2315,10 @@ ncls:
    ret = NET_RX_DROP;
    }

    -out:
    +unlock:
    rcu_read_unlock();
    +out:
    + tsk_restore_flags(current, pflags, PF_MEMALLOC);
    return ret;
    }

    Index: linux-2.6/include/net/sock.h
    ===================================================================
    --- linux-2.6.orig/include/net/sock.h
    +++ linux-2.6/include/net/sock.h
    @@ -528,8 +528,13 @@ static inline void sk_add_backlog(struct
    skb->next = NULL;
    }

    +extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
    +
    static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
    {
    + if (skb_emergency(skb))
    + return __sk_backlog_rcv(sk, skb);
    +
    return sk->sk_backlog_rcv(sk, skb);
    }

    Index: linux-2.6/net/core/sock.c
    ===================================================================
    --- linux-2.6.orig/net/core/sock.c
    +++ linux-2.6/net/core/sock.c
    @@ -309,6 +309,22 @@ int sk_clear_memalloc(struct sock *sk)
    return set;
    }
    EXPORT_SYMBOL_GPL(sk_clear_memalloc);
    +
    +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
    +{
    + int ret;
    + unsigned long pflags = current->flags;
    +
    + /* these should have been dropped before queueing */
    + BUG_ON(!sk_has_memalloc(sk));
    +
    + current->flags |= PF_MEMALLOC;
    + ret = sk->sk_backlog_rcv(sk, skb);
    + tsk_restore_flags(current, pflags, PF_MEMALLOC);
    +
    + return ret;
    +}
    +EXPORT_SYMBOL(__sk_backlog_rcv);
    #endif

    static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)


  11. [PATCH 06/32] mm: expose gfp_to_alloc_flags()

    Expose the gfp-to-alloc_flags mapping so we can use it in other parts
    of the VM.
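
    The consumer pattern this enables is a reserve-entitlement check, as
    used verbatim by patch 08 below; schematically (the helper name is
    illustrative):

        /* may the current context dip into the emergency reserves? */
        static inline int gfp_has_reserve_access(gfp_t gfp_mask)
        {
                return gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS;
        }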

    Signed-off-by: Peter Zijlstra
    ---
    mm/internal.h | 10 ++++++++++
    mm/page_alloc.c | 10 +---------
    2 files changed, 11 insertions(+), 9 deletions(-)

    Index: linux-2.6/mm/internal.h
    ===================================================================
    --- linux-2.6.orig/mm/internal.h
    +++ linux-2.6/mm/internal.h
    @@ -187,6 +187,16 @@ static inline void free_page_mlock(struc
    #define __paginginit __init
    #endif

    +#define ALLOC_HARDER 0x01 /* try to alloc harder */
    +#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */
    +#define ALLOC_WMARK_MIN 0x04 /* use pages_min watermark */
    +#define ALLOC_WMARK_LOW 0x08 /* use pages_low watermark */
    +#define ALLOC_WMARK_HIGH 0x10 /* use pages_high watermark */
    +#define ALLOC_NO_WATERMARKS 0x20 /* don't check watermarks at all */
    +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
    +
    +int gfp_to_alloc_flags(gfp_t gfp_mask);
    +
    /* Memory initialisation debug and verification */
    enum mminit_level {
    MMINIT_WARNING,
    Index: linux-2.6/mm/page_alloc.c
    ===================================================================
    --- linux-2.6.orig/mm/page_alloc.c
    +++ linux-2.6/mm/page_alloc.c
    @@ -1122,14 +1122,6 @@ failed:
    return NULL;
    }

    -#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
    -#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
    -#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
    -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
    -#define ALLOC_HARDER 0x10 /* try to alloc harder */
    -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
    -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
    -
    #ifdef CONFIG_FAIL_PAGE_ALLOC

    static struct fail_page_alloc_attr {
    @@ -1512,7 +1504,7 @@ static void set_page_owner(struct page *
    /*
    * get the deepest reaching allocation flags for the given gfp_mask
    */
    -static int gfp_to_alloc_flags(gfp_t gfp_mask)
    +int gfp_to_alloc_flags(gfp_t gfp_mask)
    {
    struct task_struct *p = current;
    int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;


  12. [PATCH 07/32] mm: tag reserve pages

    Tag pages allocated from the reserves with a non-zero page->reserve.
    This allows us to distinguish and account reserve pages.

    Since low-memory situations are transient, and unrelated to the actual
    page (any page can be on the freelist when we run low), don't mark the
    page in any permanent way - just pass along the information to the
    allocatee.
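
    Because page->reserve shares a union with ->index and ->freelist, an
    allocatee has to latch the tag right after the allocation, before the
    field gets reused; for example (illustrative wrapper, mirroring what
    patch 08 does for slab pages):

        static struct page *example_alloc_page_tagged(gfp_t gfp_mask,
                                                      unsigned int order,
                                                      int *reserve)
        {
                struct page *page = alloc_pages(gfp_mask, order);

                if (page)
                        *reserve = page->reserve; /* read before ->index is reused */
                return page;
        }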

    Signed-off-by: Peter Zijlstra
    ---
    include/linux/mm_types.h | 1 +
    mm/page_alloc.c | 4 +++-
    2 files changed, 4 insertions(+), 1 deletion(-)

    Index: linux-2.6/include/linux/mm_types.h
    ===================================================================
    --- linux-2.6.orig/include/linux/mm_types.h
    +++ linux-2.6/include/linux/mm_types.h
    @@ -70,6 +70,7 @@ struct page {
    union {
    pgoff_t index; /* Our offset within mapping. */
    void *freelist; /* SLUB: freelist req. slab lock */
    + int reserve; /* page_alloc: page is a reserve page */
    };
    struct list_head lru; /* Pageout list, eg. active_list
    * protected by zone->lru_lock !
    Index: linux-2.6/mm/page_alloc.c
    ===================================================================
    --- linux-2.6.orig/mm/page_alloc.c
    +++ linux-2.6/mm/page_alloc.c
    @@ -1433,8 +1433,10 @@ zonelist_scan:
    }

    page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
    - if (page)
    + if (page) {
    + page->reserve = !!(alloc_flags & ALLOC_NO_WATERMARKS);
    break;
    + }
    this_zone_full:
    if (NUMA_BUILD)
    zlc_mark_zone_full(zonelist, z);


  13. [PATCH 22/32] netvm: filter emergency skbs.

    Toss all emergency packets not for a SOCK_MEMALLOC socket. This ensures our
    precious memory reserve doesn't get stuck waiting for user-space.

    The correctness of this approach relies on the fact that networks must be
    assumed lossy.

    Signed-off-by: Peter Zijlstra
    ---
    net/core/filter.c | 3 +++
    1 file changed, 3 insertions(+)

    Index: linux-2.6/net/core/filter.c
    ===================================================================
    --- linux-2.6.orig/net/core/filter.c
    +++ linux-2.6/net/core/filter.c
    @@ -81,6 +81,9 @@ int sk_filter(struct sock *sk, struct sk
    int err;
    struct sk_filter *filter;

    + if (skb_emergency(skb) && !sk_has_memalloc(sk))
    + return -ENOMEM;
    +
    err = security_sock_rcv_skb(sk, skb);
    if (err)
    return err;


  14. [PATCH 08/32] mm: slb: add knowledge of reserve pages

    Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
    contexts that are entitled to them. This is done to ensure reserve
    pages don't leak out and get consumed.

    The basic pattern used for all three allocators is the following: for
    each active slab page we store whether it came from an emergency
    allocation. When we find that it did, make sure the current allocation
    context would have been able to allocate a page from the emergency
    reserves as well. In that case, allow the allocation. If not, force a
    new slab allocation. When that works, the memory pressure has lifted
    enough to allow this context to get an object; otherwise fail the
    allocation.
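
    Distilled into one place, the per-allocator pattern is roughly the
    following (illustrative pseudo-helpers, not the real SLUB/SLAB hooks;
    see the diffs below for the actual plumbing):

        struct example_cache {
                int cpu_slab_reserve;   /* last slab page was a reserve page */
        };

        static void *example_cache_alloc(struct example_cache *cache,
                                         gfp_t gfp_mask)
        {
                /*
                 * The current cpu slab came from the emergency reserves but
                 * this context is not entitled to them: force a fresh slab
                 * allocation so the watermarks get re-tested.
                 */
                if (cache->cpu_slab_reserve &&
                    !(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS))
                        return example_grow_and_alloc(cache, gfp_mask);

                /* fast path: hand out an object from the current cpu slab */
                return example_alloc_from_cpu_slab(cache);
        }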

    Signed-off-by: Peter Zijlstra
    ---
    include/linux/slub_def.h | 1
    mm/slab.c | 60 +++++++++++++++++++++++++++++++++++++++--------
    mm/slob.c | 14 ++++++++++
    mm/slub.c | 42 +++++++++++++++++++++++++++-----
    4 files changed, 101 insertions(+), 16 deletions(-)

    Index: linux-2.6/mm/slub.c
    ===================================================================
    --- linux-2.6.orig/mm/slub.c
    +++ linux-2.6/mm/slub.c
    @@ -25,6 +25,7 @@
    #include
    #include
    #include
    +#include "internal.h"

    /*
    * Lock order:
    @@ -1118,7 +1119,8 @@ static void setup_object(struct kmem_cac
    s->ctor(object);
    }

    -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
    +static
    +struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
    {
    struct page *page;
    void *start;
    @@ -1132,6 +1134,8 @@ static struct page *new_slab(struct kmem
    if (!page)
    goto out;

    + *reserve = page->reserve;
    +
    inc_slabs_node(s, page_to_nid(page), page->objects);
    page->slab = s;
    page->flags |= 1 << PG_slab;
    @@ -1524,10 +1528,20 @@ static void *__slab_alloc(struct kmem_ca
    {
    void **object;
    struct page *new;
    + int reserve;

    /* We handle __GFP_ZERO in the caller */
    gfpflags &= ~__GFP_ZERO;

    + if (unlikely(c->reserve)) {
    + /*
    + * If the current slab is a reserve slab and the current
    + * allocation context does not allow access to the reserves we
    + * must force an allocation to test the current levels.
    + */
    + if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
    + goto grow_slab;
    + }
    if (!c->page)
    goto new_slab;

    @@ -1541,8 +1555,8 @@ load_freelist:
    object = c->page->freelist;
    if (unlikely(!object))
    goto another_slab;
    - if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
    - goto debug;
    + if (unlikely(PageSlubDebug(c->page) || c->reserve))
    + goto slow_path;

    c->freelist = object[c->offset];
    c->page->inuse = c->page->objects;
    @@ -1564,16 +1578,18 @@ new_slab:
    goto load_freelist;
    }

    +grow_slab:
    if (gfpflags & __GFP_WAIT)
    local_irq_enable();

    - new = new_slab(s, gfpflags, node);
    + new = new_slab(s, gfpflags, node, &reserve);

    if (gfpflags & __GFP_WAIT)
    local_irq_disable();

    if (new) {
    c = get_cpu_slab(s, smp_processor_id());
    + c->reserve = reserve;
    stat(c, ALLOC_SLAB);
    if (c->page)
    flush_slab(s, c);
    @@ -1583,10 +1599,21 @@ new_slab:
    goto load_freelist;
    }
    return NULL;
    -debug:
    - if (!alloc_debug_processing(s, c->page, object, addr))
    +
    +slow_path:
    + if (PageSlubDebug(c->page) &&
    + !alloc_debug_processing(s, c->page, object, addr))
    goto another_slab;

    + /*
    + * Avoid the slub fast path in slab_alloc() by not setting
    + * c->freelist and the fast path in slab_free() by making
    + * node_match() fail by setting c->node to -1.
    + *
    + * We use this for debug and reserve checks which need
    + * to be done for each allocation.
    + */
    +
    c->page->inuse++;
    c->page->freelist = object[c->offset];
    c->node = -1;
    @@ -2130,10 +2157,11 @@ static struct kmem_cache_node *early_kme
    struct page *page;
    struct kmem_cache_node *n;
    unsigned long flags;
    + int reserve;

    BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

    - page = new_slab(kmalloc_caches, gfpflags, node);
    + page = new_slab(kmalloc_caches, gfpflags, node, &reserve);

    BUG_ON(!page);
    if (page_to_nid(page) != node) {
    Index: linux-2.6/include/linux/slub_def.h
    ===================================================================
    --- linux-2.6.orig/include/linux/slub_def.h
    +++ linux-2.6/include/linux/slub_def.h
    @@ -45,6 +45,7 @@ struct kmem_cache_cpu {
    int node; /* The node of the page (or -1 for debug) */
    unsigned int offset; /* Freepointer offset (in word units) */
    unsigned int objsize; /* Size of an object (from kmem_cache) */
    + int reserve; /* Did the current page come from the reserve */
    #ifdef CONFIG_SLUB_STATS
    unsigned stat[NR_SLUB_STAT_ITEMS];
    #endif
    Index: linux-2.6/mm/slab.c
    ===================================================================
    --- linux-2.6.orig/mm/slab.c
    +++ linux-2.6/mm/slab.c
    @@ -118,6 +118,8 @@
    #include
    #include

    +#include "internal.h"
    +
    /*
    * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
    * 0 for faster, smaller code (especially in the critical paths).
    @@ -267,7 +269,8 @@ struct array_cache {
    unsigned int avail;
    unsigned int limit;
    unsigned int batchcount;
    - unsigned int touched;
    + unsigned int touched:1,
    + reserve:1;
    spinlock_t lock;
    void *entry[]; /*
    * Must have this definition in here for the proper
    @@ -690,6 +693,27 @@ static inline struct array_cache *cpu_ca
    return cachep->array[smp_processor_id()];
    }

    +/*
    + * If the last page came from the reserves, and the current allocation context
    + * does not have access to them, force an allocation to test the watermarks.
    + */
    +static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
    +{
    + if (unlikely(cpu_cache_get(cachep)->reserve) &&
    + !(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
    + return 1;
    +
    + return 0;
    +}
    +
    +static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
    +{
    + struct array_cache *ac = cpu_cache_get(cachep);
    +
    + if (unlikely(ac->reserve != reserve))
    + ac->reserve = reserve;
    +}
    +
    static inline struct kmem_cache *__find_general_cachep(size_t size,
    gfp_t gfpflags)
    {
    @@ -889,6 +913,7 @@ static struct array_cache *alloc_arrayca
    nc->limit = entries;
    nc->batchcount = batchcount;
    nc->touched = 0;
    + nc->reserve = 0;
    spin_lock_init(&nc->lock);
    }
    return nc;
    @@ -1591,7 +1616,8 @@ __initcall(cpucache_init);
    * did not request dmaable memory, we might get it, but that
    * would be relatively rare and ignorable.
    */
    -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
    +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
    + int *reserve)
    {
    struct page *page;
    int nr_pages;
    @@ -1613,6 +1639,7 @@ static void *kmem_getpages(struct kmem_c
    if (!page)
    return NULL;

    + *reserve = page->reserve;
    nr_pages = (1 << cachep->gfporder);
    if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
    add_zone_page_state(page_zone(page),
    @@ -2040,6 +2067,7 @@ static int __init_refok setup_cpu_cache(
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    + cpu_cache_get(cachep)->reserve = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
    @@ -2694,6 +2722,7 @@ static int cache_grow(struct kmem_cache
    size_t offset;
    gfp_t local_flags;
    struct kmem_list3 *l3;
    + int reserve;

    /*
    * Be lazy and only check for valid flags here, keeping it out of the
    @@ -2732,7 +2761,7 @@ static int cache_grow(struct kmem_cache
    * 'nodeid'.
    */
    if (!objp)
    - objp = kmem_getpages(cachep, local_flags, nodeid);
    + objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
    if (!objp)
    goto failed;

    @@ -2749,6 +2778,7 @@ static int cache_grow(struct kmem_cache
    if (local_flags & __GFP_WAIT)
    local_irq_disable();
    check_irq_off();
    + slab_set_reserve(cachep, reserve);
    spin_lock(&l3->list_lock);

    /* Make slab active. */
    @@ -2894,7 +2924,8 @@ bad:
    #define check_slabp(x,y) do { } while(0)
    #endif

    -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
    +static void *cache_alloc_refill(struct kmem_cache *cachep,
    + gfp_t flags, int must_refill)
    {
    int batchcount;
    struct kmem_list3 *l3;
    @@ -2904,6 +2935,8 @@ static void *cache_alloc_refill(struct k
    retry:
    check_irq_off();
    node = numa_node_id();
    + if (unlikely(must_refill))
    + goto force_grow;
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
    @@ -2971,11 +3004,14 @@ alloc_done:

    if (unlikely(!ac->avail)) {
    int x;
    +force_grow:
    x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

    /* cache_grow can reenable interrupts, then ac could change. */
    ac = cpu_cache_get(cachep);
    - if (!x && ac->avail == 0) /* no objects in sight? abort */
    +
    + /* no objects in sight? abort */
    + if (!x && (ac->avail == 0 || must_refill))
    return NULL;

    if (!ac->avail) /* objects refilled by interrupt? */
    @@ -3130,17 +3166,18 @@ static inline void *____cache_alloc(stru
    {
    void *objp;
    struct array_cache *ac;
    + int must_refill = slab_force_alloc(cachep, flags);

    check_irq_off();

    ac = cpu_cache_get(cachep);
    - if (likely(ac->avail)) {
    + if (likely(ac->avail && !must_refill)) {
    STATS_INC_ALLOCHIT(cachep);
    ac->touched = 1;
    objp = ac->entry[--ac->avail];
    } else {
    STATS_INC_ALLOCMISS(cachep);
    - objp = cache_alloc_refill(cachep, flags);
    + objp = cache_alloc_refill(cachep, flags, must_refill);
    }
    return objp;
    }
    @@ -3184,7 +3221,7 @@ static void *fallback_alloc(struct kmem_
    struct zone *zone;
    enum zone_type high_zoneidx = gfp_zone(flags);
    void *obj = NULL;
    - int nid;
    + int nid, reserve;

    if (flags & __GFP_THISNODE)
    return NULL;
    @@ -3220,10 +3257,11 @@ retry:
    if (local_flags & __GFP_WAIT)
    local_irq_enable();
    kmem_flagcheck(cache, flags);
    - obj = kmem_getpages(cache, local_flags, -1);
    + obj = kmem_getpages(cache, local_flags, -1, &reserve);
    if (local_flags & __GFP_WAIT)
    local_irq_disable();
    if (obj) {
    + slab_set_reserve(cache, reserve);
    /*
    * Insert into the appropriate per node queues
    */
    @@ -3262,6 +3300,9 @@ static void *____cache_alloc_node(struct
    l3 = cachep->nodelists[nodeid];
    BUG_ON(!l3);

    + if (unlikely(slab_force_alloc(cachep, flags)))
    + goto force_grow;
    +
    retry:
    check_irq_off();
    spin_lock(&l3->list_lock);
    @@ -3299,6 +3340,7 @@ retry:

    must_grow:
    spin_unlock(&l3->list_lock);
    +force_grow:
    x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
    if (x)
    goto retry;
    Index: linux-2.6/mm/slob.c
    ===================================================================
    --- linux-2.6.orig/mm/slob.c
    +++ linux-2.6/mm/slob.c
    @@ -67,6 +67,7 @@
    #include
    #include
    #include
    +#include "internal.h"

    /*
    * slob_block has a field 'units', which indicates size of block if +ve,
    @@ -184,6 +185,11 @@ struct slob_rcu {
    static DEFINE_SPINLOCK(slob_lock);

    /*
    + * tracks the reserve state for the allocator.
    + */
    +static int slob_reserve;
    +
    +/*
    * Encode the given size and next info into a free slob block s.
    */
    static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
    @@ -245,6 +251,8 @@ static void *slob_new_page(gfp_t gfp, in
    if (!page)
    return NULL;

    + slob_reserve = page->reserve;
    +
    return page_address(page);
    }

    @@ -310,6 +318,11 @@ static void *slob_alloc(size_t size, gfp
    slob_t *b = NULL;
    unsigned long flags;

    + if (unlikely(slob_reserve)) {
    + if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
    + goto grow;
    + }
    +
    if (size < SLOB_BREAK1)
    slob_list = &free_slob_small;
    else if (size < SLOB_BREAK2)
    @@ -348,6 +361,7 @@ static void *slob_alloc(size_t size, gfp
    }
    spin_unlock_irqrestore(&slob_lock, flags);

    +grow:
    /* Not enough space: must allocate a new page */
    if (!b) {
    b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);

    --


  15. [PATCH 15/32] selinux: tag avc cache alloc as non-critical

    Failing to allocate a cache entry will only harm performance, not correctness.
    Do not consume valuable reserve pages for something like that.
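
    The same pattern applies anywhere an allocation only buys performance: tag
    it __GFP_NOMEMALLOC so the page allocator fails it rather than dipping into
    the emergency reserves. A minimal sketch (illustrative only; the cache and
    helper names are made up, the actual change is the one-liner in the diff
    below):

    #include <linux/slab.h>

    struct my_cache_entry {
            int key;
    };

    /* Illustrative sketch, not part of the patch. */
    static struct my_cache_entry *my_alloc_entry(struct kmem_cache *my_cachep)
    {
            /* failure here only costs a cache miss, so never touch the reserves */
            return kmem_cache_zalloc(my_cachep, GFP_ATOMIC | __GFP_NOMEMALLOC);
    }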

    Signed-off-by: Peter Zijlstra
    Acked-by: James Morris
    ---
    security/selinux/avc.c | 2 +-
    1 file changed, 1 insertion(+), 1 deletion(-)

    Index: linux-2.6/security/selinux/avc.c
    ===================================================================
    --- linux-2.6.orig/security/selinux/avc.c
    +++ linux-2.6/security/selinux/avc.c
    @@ -334,7 +334,7 @@ static struct avc_node *avc_alloc_node(v
    {
    struct avc_node *node;

    - node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC);
    + node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC);
    if (!node)
    goto out;


    --


  16. [PATCH 01/32] mm: gfp_to_alloc_flags()

    Clean up the code by factoring out the gfp to alloc_flags mapping.

    [neilb@suse.de says]
    As the test:

    - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
    - && !in_interrupt()) {
    - if (!(gfp_mask & __GFP_NOMEMALLOC)) {

    has been replaced with a slightly weaker one:

    + if (alloc_flags & ALLOC_NO_WATERMARKS) {

    we need to ensure we don't recurse when PF_MEMALLOC is set

    Signed-off-by: Peter Zijlstra
    ---
    mm/internal.h | 10 +++++
    mm/page_alloc.c | 95 +++++++++++++++++++++++++++++++-------------------------
    2 files changed, 64 insertions(+), 41 deletions(-)

    Index: linux-2.6/mm/page_alloc.c
    ===================================================================
    --- linux-2.6.orig/mm/page_alloc.c
    +++ linux-2.6/mm/page_alloc.c
    @@ -1510,6 +1502,44 @@ static void set_page_owner(struct page *
    #endif /* CONFIG_PAGE_OWNER */

    /*
    + * get the deepest reaching allocation flags for the given gfp_mask
    + */
    +static int gfp_to_alloc_flags(gfp_t gfp_mask)
    +{
    + struct task_struct *p = current;
    + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
    + const gfp_t wait = gfp_mask & __GFP_WAIT;
    +
    + /*
    + * The caller may dip into page reserves a bit more if the caller
    + * cannot run direct reclaim, or if the caller has realtime scheduling
    + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
    + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
    + */
    + if (gfp_mask & __GFP_HIGH)
    + alloc_flags |= ALLOC_HIGH;
    +
    + if (!wait) {
    + alloc_flags |= ALLOC_HARDER;
    + /*
    + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
    + * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
    + */
    + alloc_flags &= ~ALLOC_CPUSET;
    + } else if (unlikely(rt_task(p)) && !in_interrupt())
    + alloc_flags |= ALLOC_HARDER;
    +
    + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
    + if (!in_interrupt() &&
    + ((p->flags & PF_MEMALLOC) ||
    + unlikely(test_thread_flag(TIF_MEMDIE))))
    + alloc_flags |= ALLOC_NO_WATERMARKS;
    + }
    +
    + return alloc_flags;
    +}
    +
    +/*
    * This is the 'heart' of the zoned buddy allocator.
    */
    struct page *
    @@ -1567,49 +1597,28 @@ restart:
    * OK, we're below the kswapd watermark and have kicked background
    * reclaim. Now things get more complex, so set up alloc_flags according
    * to how we want to proceed.
    - *
    - * The caller may dip into page reserves a bit more if the caller
    - * cannot run direct reclaim, or if the caller has realtime scheduling
    - * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
    - * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
    */
    - alloc_flags = ALLOC_WMARK_MIN;
    - if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
    - alloc_flags |= ALLOC_HARDER;
    - if (gfp_mask & __GFP_HIGH)
    - alloc_flags |= ALLOC_HIGH;
    - if (wait)
    - alloc_flags |= ALLOC_CPUSET;
    + alloc_flags = gfp_to_alloc_flags(gfp_mask);

    - /*
    - * Go through the zonelist again. Let __GFP_HIGH and allocations
    - * coming from realtime tasks go deeper into reserves.
    - *
    - * This is the last chance, in general, before the goto nopage.
    - * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
    - * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
    - */
    + /* This is the last chance, in general, before the goto nopage. */
    page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
    - high_zoneidx, alloc_flags);
    + high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS);
    if (page)
    goto got_pg;

    /* This allocation should allow future memory freeing. */
    -
    rebalance:
    - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
    - && !in_interrupt()) {
    - if (!(gfp_mask & __GFP_NOMEMALLOC)) {
    + if (alloc_flags & ALLOC_NO_WATERMARKS) {
    nofail_alloc:
    - /* go through the zonelist yet again, ignoring mins */
    - page = get_page_from_freelist(gfp_mask, nodemask, order,
    + /* go through the zonelist yet again, ignoring mins */
    + page = get_page_from_freelist(gfp_mask, nodemask, order,
    zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
    - if (page)
    - goto got_pg;
    - if (gfp_mask & __GFP_NOFAIL) {
    - congestion_wait(WRITE, HZ/50);
    - goto nofail_alloc;
    - }
    + if (page)
    + goto got_pg;
    +
    + if (wait && (gfp_mask & __GFP_NOFAIL)) {
    + congestion_wait(WRITE, HZ/50);
    + goto nofail_alloc;
    }
    goto nopage;
    }
    @@ -1618,6 +1627,10 @@ nofail_alloc:
    if (!wait)
    goto nopage;

    + /* Avoid recursion of direct reclaim */
    + if (p->flags & PF_MEMALLOC)
    + goto nopage;
    +
    cond_resched();

    /* We now go into synchronous reclaim */

    --


  17. [PATCH 11/32] mm: emergency pool

    Provide a means to reserve a specific number of pages.

    The emergency pool is separated from the min watermark because ALLOC_HARDER
    and ALLOC_HIGH modify the watermark in a relative way and thus do not ensure
    a strict minimum.
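
    A minimal usage sketch (illustrative only; the my_* helpers are made up,
    adjust_memalloc_reserve() is the interface added below): a subsystem that
    needs a guaranteed number of pages grows the pool at setup time and shrinks
    it again on teardown.

    #include <linux/mmzone.h>       /* adjust_memalloc_reserve() */

    /* Illustrative sketch, not part of the patch. */
    static int my_setup_emergency_pages(int nr_pages)
    {
            /* grow the pool; kicks reclaim to satisfy the higher watermarks */
            int err = adjust_memalloc_reserve(nr_pages);
            if (err)
                    return err;     /* -ENOMEM: watermarks could not be met */
            return 0;
    }

    static void my_teardown_emergency_pages(int nr_pages)
    {
            /* hand the pages back to the regular pool */
            adjust_memalloc_reserve(-nr_pages);
    }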

    Signed-off-by: Peter Zijlstra
    ---
    include/linux/mmzone.h | 6 ++-
    mm/page_alloc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    mm/vmstat.c | 6 +--
    3 files changed, 82 insertions(+), 14 deletions(-)

    Index: linux-2.6/include/linux/mmzone.h
    ===================================================================
    --- linux-2.6.orig/include/linux/mmzone.h
    +++ linux-2.6/include/linux/mmzone.h
    @@ -265,7 +265,10 @@ enum zone_type {

    struct zone {
    /* Fields commonly accessed by the page allocator */
    - unsigned long pages_min, pages_low, pages_high;
    + unsigned long pages_high; /* we stop kswapd */
    + unsigned long pages_low; /* we wake up kswapd */
    + unsigned long pages_min; /* we enter direct reclaim */
    + unsigned long pages_emerg; /* emergency pool */
    /*
    * We don't know if the memory that we're going to allocate will be freeable
    * or/and it will be released eventually, so to avoid totally wasting several
    @@ -751,6 +754,7 @@ int sysctl_min_unmapped_ratio_sysctl_han
    struct file *, void __user *, size_t *, loff_t *);
    int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
    struct file *, void __user *, size_t *, loff_t *);
    +int adjust_memalloc_reserve(int pages);

    extern int numa_zonelist_order_handler(struct ctl_table *, int,
    struct file *, void __user *, size_t *, loff_t *);
    Index: linux-2.6/mm/page_alloc.c
    ===================================================================
    --- linux-2.6.orig/mm/page_alloc.c
    +++ linux-2.6/mm/page_alloc.c
    @@ -120,6 +120,8 @@ static char * const zone_names[MAX_NR_ZO

    static DEFINE_SPINLOCK(min_free_lock);
    int min_free_kbytes = 1024;
    +static DEFINE_MUTEX(var_free_mutex);
    +int var_free_kbytes;

    unsigned long __meminitdata nr_kernel_pages;
    unsigned long __meminitdata nr_all_pages;
    @@ -1235,7 +1237,7 @@ int zone_watermark_ok(struct zone *z, in
    if (alloc_flags & ALLOC_HARDER)
    min -= min / 4;

    - if (free_pages <= min + z->lowmem_reserve[classzone_idx])
    + if (free_pages <= min+z->lowmem_reserve[classzone_idx]+z->pages_emerg)
    return 0;
    for (o = 0; o < order; o++) {
    /* At the next order, this order's pages become unavailable */
    @@ -1558,7 +1560,7 @@ __alloc_pages_internal(gfp_t gfp_mask, u
    struct reclaim_state reclaim_state;
    struct task_struct *p = current;
    int do_retry;
    - int alloc_flags;
    + int alloc_flags = 0;
    unsigned long did_some_progress;
    unsigned long pages_reclaimed = 0;

    @@ -1724,8 +1726,8 @@ nofail_alloc:
    nopage:
    if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
    printk(KERN_WARNING "%s: page allocation failure."
    - " order:%d, mode:0x%x\n",
    - p->comm, order, gfp_mask);
    + " order:%d, mode:0x%x, alloc_flags:0x%x, pflags:0x%x\n",
    + p->comm, order, gfp_mask, alloc_flags, p->flags);
    dump_stack();
    show_mem();
    }
    @@ -2008,9 +2010,9 @@ void show_free_areas(void)
    "\n",
    zone->name,
    K(zone_page_state(zone, NR_FREE_PAGES)),
    - K(zone->pages_min),
    - K(zone->pages_low),
    - K(zone->pages_high),
    + K(zone->pages_emerg + zone->pages_min),
    + K(zone->pages_emerg + zone->pages_low),
    + K(zone->pages_emerg + zone->pages_high),
    K(zone_page_state(zone, NR_ACTIVE_ANON)),
    K(zone_page_state(zone, NR_INACTIVE_ANON)),
    K(zone_page_state(zone, NR_ACTIVE_FILE)),
    @@ -4284,7 +4286,7 @@ static void calculate_totalreserve_pages
    }

    /* we treat pages_high as reserved pages. */
    - max += zone->pages_high;
    + max += zone->pages_high + zone->pages_emerg;

    if (max > zone->present_pages)
    max = zone->present_pages;
    @@ -4341,7 +4343,8 @@ static void setup_per_zone_lowmem_reserv
    */
    static void __setup_per_zone_pages_min(void)
    {
    - unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    + unsigned pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    + unsigned pages_emerg = var_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 0;
    struct zone *zone;
    unsigned long flags;
    @@ -4353,11 +4356,13 @@ static void __setup_per_zone_pages_min(v
    }

    for_each_zone(zone) {
    - u64 tmp;
    + u64 tmp, tmp_emerg;

    spin_lock_irqsave(&zone->lru_lock, flags);
    tmp = (u64)pages_min * zone->present_pages;
    do_div(tmp, lowmem_pages);
    + tmp_emerg = (u64)pages_emerg * zone->present_pages;
    + do_div(tmp_emerg, lowmem_pages);
    if (is_highmem(zone)) {
    /*
    * __GFP_HIGH and PF_MEMALLOC allocations usually don't
    @@ -4376,12 +4381,14 @@ static void __setup_per_zone_pages_min(v
    if (min_pages > 128)
    min_pages = 128;
    zone->pages_min = min_pages;
    + zone->pages_emerg = 0;
    } else {
    /*
    * If it's a lowmem zone, reserve a number of pages
    * proportionate to the zone's size.
    */
    zone->pages_min = tmp;
    + zone->pages_emerg = tmp_emerg;
    }

    zone->pages_low = zone->pages_min + (tmp >> 2);
    @@ -4443,6 +4450,63 @@ void setup_per_zone_pages_min(void)
    spin_unlock_irqrestore(&min_free_lock, flags);
    }

    +static void __adjust_memalloc_reserve(int pages)
    +{
    + var_free_kbytes += pages << (PAGE_SHIFT - 10);
    + BUG_ON(var_free_kbytes < 0);
    + setup_per_zone_pages_min();
    +}
    +
    +static int test_reserve_limits(void)
    +{
    + struct zone *zone;
    + int node;
    +
    + for_each_zone(zone)
    + wakeup_kswapd(zone, 0);
    +
    + for_each_online_node(node) {
    + struct page *page = alloc_pages_node(node, GFP_KERNEL, 0);
    + if (!page)
    + return -ENOMEM;
    +
    + __free_page(page);
    + }
    +
    + return 0;
    +}
    +
    +/**
    + * adjust_memalloc_reserve - adjust the memalloc reserve
    + * @pages: number of pages to add
    + *
    + * It adds a number of pages to the memalloc reserve; if
    + * the number is positive it kicks reclaim into action to
    + * satisfy the higher watermarks.
    + *
    + * Returns -ENOMEM when it fails to satisfy the watermarks.
    + */
    +int adjust_memalloc_reserve(int pages)
    +{
    + int err = 0;
    +
    + mutex_lock(&var_free_mutex);
    + __adjust_memalloc_reserve(pages);
    + if (pages > 0) {
    + err = test_reserve_limits();
    + if (err) {
    + __adjust_memalloc_reserve(-pages);
    + goto unlock;
    + }
    + }
    + printk(KERN_DEBUG "Emergency reserve: %d\n", var_free_kbytes);
    +
    +unlock:
    + mutex_unlock(&var_free_mutex);
    + return err;
    +}
    +EXPORT_SYMBOL_GPL(adjust_memalloc_reserve);
    +
    /*
    * Initialise min_free_kbytes.
    *
    Index: linux-2.6/mm/vmstat.c
    ===================================================================
    --- linux-2.6.orig/mm/vmstat.c
    +++ linux-2.6/mm/vmstat.c
    @@ -785,9 +785,9 @@ static void zoneinfo_show_print(struct s
    "\n spanned %lu"
    "\n present %lu",
    zone_page_state(zone, NR_FREE_PAGES),
    - zone->pages_min,
    - zone->pages_low,
    - zone->pages_high,
    + zone->pages_emerg + zone->pages_min,
    + zone->pages_emerg + zone->pages_low,
    + zone->pages_emerg + zone->pages_high,
    zone->pages_scanned,
    zone->lru[LRU_ACTIVE_ANON].nr_scan,
    zone->lru[LRU_INACTIVE_ANON].nr_scan,

    --


  18. [PATCH 19/32] netvm: network reserve infrastructure

    Provide the basic infrastructure to reserve and charge/account network memory.

    We provide the following reserve tree:

    1) total network reserve
    2) network TX reserve
    3) protocol TX pages
    4) network RX reserve
    5) SKB data reserve

    [1] is used to make all the network reserves a single subtree, for easy
    manipulation.

    [2] and [4] are merely for aesthetic reasons.

    The TX pages reserve [3] is assumed to be bounded, as it is the upper bound
    on the memory that can be used for sending pages (not quite true, but good
    enough).

    The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data
    against in the fallback path.

    The consumers for these reserves are sockets marked with:
    SOCK_MEMALLOC

    Such sockets are to be used to service the VM (i.e. to swap over). They
    must be handled kernel side; exposing such a socket to user-space is a BUG.
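
    A minimal kernel-side usage sketch (illustrative only; the my_* helpers are
    made up, while sk_set_memalloc(), sk_clear_memalloc() and sk_allocation()
    are the interfaces added below):

    #include <net/sock.h>

    /* Illustrative sketch, not part of the patch. */
    static int my_swap_sock_init(struct sock *sk)
    {
            int err = sk_set_memalloc(sk);  /* < 0 on error, 0/1 otherwise */
            if (err < 0)
                    return err;

            /*
             * From here on sk_allocation(sk, ...) ORs in __GFP_MEMALLOC,
             * e.g.: alloc_skb(size, sk_allocation(sk, GFP_KERNEL));
             */
            return 0;
    }

    static void my_swap_sock_exit(struct sock *sk)
    {
            sk_clear_memalloc(sk);          /* drop the flag and the accounting */
    }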

    Signed-off-by: Peter Zijlstra
    ---
    include/net/sock.h | 43 ++++++++++++++++++++-
    net/Kconfig | 3 +
    net/core/sock.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++
    3 files changed, 152 insertions(+), 1 deletion(-)

    Index: linux-2.6/include/net/sock.h
    ===================================================================
    --- linux-2.6.orig/include/net/sock.h
    +++ linux-2.6/include/net/sock.h
    @@ -50,6 +50,7 @@
    #include /* struct sk_buff */
    #include
    #include
    +#include

    #include

    @@ -413,6 +414,7 @@ enum sock_flags {
    SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
    SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
    SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
    + SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */
    };

    static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
    @@ -435,9 +437,48 @@ static inline int sock_flag(struct sock
    return test_bit(flag, &sk->sk_flags);
    }

    +static inline int sk_has_memalloc(struct sock *sk)
    +{
    + return sock_flag(sk, SOCK_MEMALLOC);
    +}
    +
    +extern struct mem_reserve net_rx_reserve;
    +extern struct mem_reserve net_skb_reserve;
    +
    +#ifdef CONFIG_NETVM
    +/*
    + * Guesstimate the per request queue TX upper bound.
    + *
    + * Max packet size is 64k, and we need to reserve that much since the data
    + * might need to be bounced. Double it to be on the safe side.
    + */
    +#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE)
    +
    +extern int memalloc_socks;
    +
    +static inline int sk_memalloc_socks(void)
    +{
    + return memalloc_socks;
    +}
    +
    +extern int sk_adjust_memalloc(int socks, long tx_reserve_pages);
    +extern int sk_set_memalloc(struct sock *sk);
    +extern int sk_clear_memalloc(struct sock *sk);
    +#else
    +static inline int sk_memalloc_socks(void)
    +{
    + return 0;
    +}
    +
    +static inline int sk_clear_memalloc(struct sock *sk)
    +{
    + return 0;
    +}
    +#endif
    +
    static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
    {
    - return gfp_mask;
    + return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
    }

    static inline void sk_acceptq_removed(struct sock *sk)
    Index: linux-2.6/net/core/sock.c
    ===================================================================
    --- linux-2.6.orig/net/core/sock.c
    +++ linux-2.6/net/core/sock.c
    @@ -110,6 +110,7 @@
    #include
    #include
    #include
    +#include

    #include
    #include
    @@ -211,6 +212,105 @@ __u32 sysctl_rmem_default __read_mostly
    /* Maximal space eaten by iovec or ancilliary data plus some space */
    int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);

    +static struct mem_reserve net_reserve;
    +struct mem_reserve net_rx_reserve;
    +EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */
    +struct mem_reserve net_skb_reserve;
    +EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */
    +static struct mem_reserve net_tx_reserve;
    +static struct mem_reserve net_tx_pages;
    +
    +#ifdef CONFIG_NETVM
    +static DEFINE_MUTEX(memalloc_socks_lock);
    +int memalloc_socks;
    +
    +/**
    + * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
    + * @socks: number of new %SOCK_MEMALLOC sockets
    + * @tx_reserve_pages: number of pages to (un)reserve for TX
    + *
    + * This function adjusts the memalloc reserve based on system demand.
    + * The RX reserve is a limit, and only added once, not for each socket.
    + *
    + * NOTE:
    + * @tx_reserve_pages is an upper-bound of memory used for TX hence
    + * we need not account the pages like we do for RX pages.
    + */
    +int sk_adjust_memalloc(int socks, long tx_reserve_pages)
    +{
    + int err;
    +
    + mutex_lock(&memalloc_socks_lock);
    + err = mem_reserve_pages_add(&net_tx_pages, tx_reserve_pages);
    + if (err)
    + goto unlock;
    +
    + /*
    + * either socks is positive and we need to check for 0 -> !0
    + * transition and connect the reserve tree when we observe it.
    + */
    + if (!memalloc_socks && socks > 0) {
    + err = mem_reserve_connect(&net_reserve, &mem_reserve_root);
    + if (err) {
    + /*
    + * if we failed to connect the tree, undo the tx
    + * reserve so that failure has no side effects.
    + */
    + mem_reserve_pages_add(&net_tx_pages, -tx_reserve_pages);
    + goto unlock;
    + }
    + }
    + memalloc_socks += socks;
    + /*
    + * or socks is negative and we must observe the !0 -> 0 transition
    + * and disconnect the reserve tree.
    + */
    + if (!memalloc_socks && socks)
    + mem_reserve_disconnect(&net_reserve);
    +
    +unlock:
    + mutex_unlock(&memalloc_socks_lock);
    +
    + return err;
    +}
    +EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
    +
    +/**
    + * sk_set_memalloc - sets %SOCK_MEMALLOC
    + * @sk: socket to set it on
    + *
    + * Set %SOCK_MEMALLOC on a socket and increase the memalloc reserve
    + * accordingly.
    + */
    +int sk_set_memalloc(struct sock *sk)
    +{
    + int set = sock_flag(sk, SOCK_MEMALLOC);
    +
    + if (!set) {
    + int err = sk_adjust_memalloc(1, 0);
    + if (err)
    + return err;
    +
    + sock_set_flag(sk, SOCK_MEMALLOC);
    + sk->sk_allocation |= __GFP_MEMALLOC;
    + }
    + return !set;
    +}
    +EXPORT_SYMBOL_GPL(sk_set_memalloc);
    +
    +int sk_clear_memalloc(struct sock *sk)
    +{
    + int set = sock_flag(sk, SOCK_MEMALLOC);
    + if (set) {
    + sk_adjust_memalloc(-1, 0);
    + sock_reset_flag(sk, SOCK_MEMALLOC);
    + sk->sk_allocation &= ~__GFP_MEMALLOC;
    + }
    + return set;
    +}
    +EXPORT_SYMBOL_GPL(sk_clear_memalloc);
    +#endif
    +
    static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
    {
    struct timeval tv;
    @@ -957,6 +1057,7 @@ void sk_free(struct sock *sk)
    {
    struct sk_filter *filter;

    + sk_clear_memalloc(sk);
    if (sk->sk_destruct)
    sk->sk_destruct(sk);

    @@ -1106,6 +1207,12 @@ void __init sk_init(void)
    sysctl_wmem_max = 131071;
    sysctl_rmem_max = 131071;
    }
    +
    + mem_reserve_init(&net_reserve, "total network reserve", NULL);
    + mem_reserve_init(&net_rx_reserve, "network RX reserve", &net_reserve);
    + mem_reserve_init(&net_skb_reserve, "SKB data reserve", &net_rx_reserve);
    + mem_reserve_init(&net_tx_reserve, "network TX reserve", &net_reserve);
    + mem_reserve_init(&net_tx_pages, "protocol TX pages", &net_tx_reserve);
    }

    /*
    Index: linux-2.6/net/Kconfig
    ===================================================================
    --- linux-2.6.orig/net/Kconfig
    +++ linux-2.6/net/Kconfig
    @@ -248,4 +248,7 @@ endmenu
    source "net/rfkill/Kconfig"
    source "net/9p/Kconfig"

    +config NETVM
    + def_bool n
    +
    endif # if NET

    --


  19. [PATCH 20/32] netvm: INET reserves.

    Add reserves for INET.

    The two big users seem to be the route cache and the ip-fragment cache.

    Reserve the route cache under the generic RX reserve; its usage is bounded
    by the high reclaim watermark and thus needs no further accounting.

    Reserve the ip-fragment caches under the SKB data reserve; these add to the
    SKB RX limit. By ensuring we can receive at least as much data as fits in
    the reassembly line, we avoid fragment-attack deadlocks.

    Adds to the reserve tree:

    total network reserve
    network TX reserve
    protocol TX pages
    network RX reserve
    + IPv6 route cache
    + IPv4 route cache
    SKB data reserve
    + IPv6 fragment cache
    + IPv4 fragment cache
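
    Each of the new entries follows the same pattern; a minimal sketch
    (illustrative only; my_proto_reserve and my_cachep are made-up names,
    mem_reserve_init() and mem_reserve_kmem_cache_set() come from the reserve
    infrastructure earlier in this series):

    /* Illustrative sketch, not part of the patch. */
    static struct mem_reserve my_proto_reserve;

    static int my_proto_reserve_init(struct kmem_cache *my_cachep, int max_objects)
    {
            /* hang the reserve off the RX branch of the tree ... */
            mem_reserve_init(&my_proto_reserve, "my protocol cache",
                             &net_rx_reserve);

            /* ... and size it after the cache's object limit */
            return mem_reserve_kmem_cache_set(&my_proto_reserve, my_cachep,
                                              max_objects);
    }

    When the corresponding sysctl limit changes, the reserve is resized under a
    mutex with the matching setter (mem_reserve_kmem_cache_set() or
    mem_reserve_kmalloc_set()), which is what the proc and sysctl handlers in
    the diff below do.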

    Signed-off-by: Peter Zijlstra
    ---
    include/net/inet_frag.h | 7 +++
    include/net/netns/ipv6.h | 4 ++
    net/ipv4/inet_fragment.c | 3 +
    net/ipv4/ip_fragment.c | 86 +++++++++++++++++++++++++++++++++++++++++++++--
    net/ipv4/route.c | 70 +++++++++++++++++++++++++++++++++++++-
    net/ipv6/reassembly.c | 85 +++++++++++++++++++++++++++++++++++++++++++++-
    net/ipv6/route.c | 77 ++++++++++++++++++++++++++++++++++++++++--
    7 files changed, 325 insertions(+), 7 deletions(-)

    Index: linux-2.6/net/ipv4/ip_fragment.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/ip_fragment.c
    +++ linux-2.6/net/ipv4/ip_fragment.c
    @@ -42,6 +42,8 @@
    #include
    #include
    #include
    +#include
    +#include

    /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
    * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
    @@ -599,6 +601,63 @@ int ip_defrag(struct sk_buff *skb, u32 u
    }

    #ifdef CONFIG_SYSCTL
    +static int proc_dointvec_fragment(struct ctl_table *table, int write,
    + struct file *filp, void __user *buffer, size_t *lenp,
    + loff_t *ppos)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv4.frags.high_thresh);
    + ctl_table tmp = *table;
    + int new_bytes, ret;
    +
    + mutex_lock(&net->ipv4.frags.lock);
    + if (write) {
    + tmp.data = &new_bytes;
    + table = &tmp;
    + }
    +
    + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
    + new_bytes);
    + if (!ret)
    + net->ipv4.frags.high_thresh = new_bytes;
    + }
    + mutex_unlock(&net->ipv4.frags.lock);
    +
    + return ret;
    +}
    +
    +static int sysctl_intvec_fragment(struct ctl_table *table,
    + void __user *oldval, size_t __user *oldlenp,
    + void __user *newval, size_t newlen)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv4.frags.high_thresh);
    + int write = (newval && newlen);
    + ctl_table tmp = *table;
    + int new_bytes, ret;
    +
    + mutex_lock(&net->ipv4.frags.lock);
    + if (write) {
    + tmp.data = &new_bytes;
    + table = &tmp;
    + }
    +
    + ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
    + new_bytes);
    + if (!ret)
    + net->ipv4.frags.high_thresh = new_bytes;
    + }
    + mutex_unlock(&net->ipv4.frags.lock);
    +
    + return ret;
    +}
    +
    static int zero;

    static struct ctl_table ip4_frags_ns_ctl_table[] = {
    @@ -608,7 +667,8 @@ static struct ctl_table ip4_frags_ns_ctl
    .data = &init_net.ipv4.frags.high_thresh,
    .maxlen = sizeof(int),
    .mode = 0644,
    - .proc_handler = &proc_dointvec
    + .proc_handler = &proc_dointvec_fragment,
    + .strategy = &sysctl_intvec_fragment,
    },
    {
    .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
    @@ -711,6 +771,8 @@ static inline void ip4_frags_ctl_registe

    static int ipv4_frags_init_net(struct net *net)
    {
    + int ret;
    +
    /*
    * Fragment cache limits. We will commit 256K at one time. Should we
    * cross that limit we will prune down to 192K. This should cope with
    @@ -728,11 +790,31 @@ static int ipv4_frags_init_net(struct ne

    inet_frags_init_net(&net->ipv4.frags);

    - return ip4_frags_ns_ctl_register(net);
    + ret = ip4_frags_ns_ctl_register(net);
    + if (ret)
    + goto out_reg;
    +
    + mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
    + &net_skb_reserve);
    + ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
    + net->ipv4.frags.high_thresh);
    + if (ret)
    + goto out_reserve;
    +
    + return 0;
    +
    +out_reserve:
    + mem_reserve_disconnect(&net->ipv4.frags.reserve);
    + ip4_frags_ns_ctl_unregister(net);
    +out_reg:
    + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
    +
    + return ret;
    }

    static void ipv4_frags_exit_net(struct net *net)
    {
    + mem_reserve_disconnect(&net->ipv4.frags.reserve);
    ip4_frags_ns_ctl_unregister(net);
    inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
    }
    Index: linux-2.6/net/ipv6/reassembly.c
    ===================================================================
    --- linux-2.6.orig/net/ipv6/reassembly.c
    +++ linux-2.6/net/ipv6/reassembly.c
    @@ -41,6 +41,7 @@
    #include
    #include
    #include
    +#include

    #include
    #include
    @@ -632,6 +633,63 @@ static struct inet6_protocol frag_protoc
    };

    #ifdef CONFIG_SYSCTL
    +static int proc_dointvec_fragment(struct ctl_table *table, int write,
    + struct file *filp, void __user *buffer, size_t *lenp,
    + loff_t *ppos)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv6.frags.high_thresh);
    + ctl_table tmp = *table;
    + int new_bytes, ret;
    +
    + mutex_lock(&net->ipv6.frags.lock);
    + if (write) {
    + tmp.data = &new_bytes;
    + table = &tmp;
    + }
    +
    + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
    + new_bytes);
    + if (!ret)
    + net->ipv6.frags.high_thresh = new_bytes;
    + }
    + mutex_unlock(&net->ipv6.frags.lock);
    +
    + return ret;
    +}
    +
    +static int sysctl_intvec_fragment(struct ctl_table *table,
    + void __user *oldval, size_t __user *oldlenp,
    + void __user *newval, size_t newlen)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv6.frags.high_thresh);
    + int write = (newval && newlen);
    + ctl_table tmp = *table;
    + int new_bytes, ret;
    +
    + mutex_lock(&net->ipv6.frags.lock);
    + if (write) {
    + tmp.data = &new_bytes;
    + table = &tmp;
    + }
    +
    + ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
    + new_bytes);
    + if (!ret)
    + net->ipv6.frags.high_thresh = new_bytes;
    + }
    + mutex_unlock(&net->ipv6.frags.lock);
    +
    + return ret;
    +}
    +
    static struct ctl_table ip6_frags_ns_ctl_table[] = {
    {
    .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH,
    @@ -639,7 +697,8 @@ static struct ctl_table ip6_frags_ns_ctl
    .data = &init_net.ipv6.frags.high_thresh,
    .maxlen = sizeof(int),
    .mode = 0644,
    - .proc_handler = &proc_dointvec
    + .proc_handler = &proc_dointvec_fragment,
    + .strategy = &sysctl_intvec_fragment,
    },
    {
    .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH,
    @@ -748,17 +807,39 @@ static inline void ip6_frags_sysctl_unre

    static int ipv6_frags_init_net(struct net *net)
    {
    + int ret;
    +
    net->ipv6.frags.high_thresh = 256 * 1024;
    net->ipv6.frags.low_thresh = 192 * 1024;
    net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;

    inet_frags_init_net(&net->ipv6.frags);

    - return ip6_frags_ns_sysctl_register(net);
    + ret = ip6_frags_ns_sysctl_register(net);
    + if (ret)
    + goto out_reg;
    +
    + mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
    + &net_skb_reserve);
    + ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
    + net->ipv6.frags.high_thresh);
    + if (ret)
    + goto out_reserve;
    +
    + return 0;
    +
    +out_reserve:
    + mem_reserve_disconnect(&net->ipv6.frags.reserve);
    + ip6_frags_ns_sysctl_unregister(net);
    +out_reg:
    + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
    +
    + return ret;
    }

    static void ipv6_frags_exit_net(struct net *net)
    {
    + mem_reserve_disconnect(&net->ipv6.frags.reserve);
    ip6_frags_ns_sysctl_unregister(net);
    inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
    }
    Index: linux-2.6/net/ipv4/route.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/route.c
    +++ linux-2.6/net/ipv4/route.c
    @@ -107,6 +107,7 @@
    #ifdef CONFIG_SYSCTL
    #include
    #endif
    +#include

    #define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
    @@ -269,6 +270,8 @@ static inline int rt_genid(struct net *n
    return atomic_read(&net->ipv4.rt_genid);
    }

    +static struct mem_reserve ipv4_route_reserve;
    +
    #ifdef CONFIG_PROC_FS
    struct rt_cache_iter_state {
    struct seq_net_private p;
    @@ -398,6 +401,61 @@ static int rt_cache_seq_show(struct seq_
    return 0;
    }

    +static struct mutex ipv4_route_lock;
    +
    +static int proc_dointvec_route(struct ctl_table *table, int write,
    + struct file *filp, void __user *buffer, size_t *lenp,
    + loff_t *ppos)
    +{
    + ctl_table tmp = *table;
    + int new_size, ret;
    +
    + mutex_lock(&ipv4_route_lock);
    + if (write) {
    + tmp.data = &new_size;
    + table = &tmp;
    + }
    +
    + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
    + ipv4_dst_ops.kmem_cachep, new_size);
    + if (!ret)
    + ip_rt_max_size = new_size;
    + }
    + mutex_unlock(&ipv4_route_lock);
    +
    + return ret;
    +}
    +
    +static int sysctl_intvec_route(struct ctl_table *table,
    + void __user *oldval, size_t __user *oldlenp,
    + void __user *newval, size_t newlen)
    +{
    + int write = (newval && newlen);
    + ctl_table tmp = *table;
    + int new_size, ret;
    +
    + mutex_lock(&ipv4_route_lock);
    + if (write) {
    + tmp.data = &new_size;
    + table = &tmp;
    + }
    +
    + ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
    + ipv4_dst_ops.kmem_cachep, new_size);
    + if (!ret)
    + ip_rt_max_size = new_size;
    + }
    + mutex_unlock(&ipv4_route_lock);
    +
    + return ret;
    +}
    +
    static const struct seq_operations rt_cache_seq_ops = {
    .start = rt_cache_seq_start,
    .next = rt_cache_seq_next,
    @@ -2992,7 +3050,8 @@ static ctl_table ipv4_route_table[] = {
    .data = &ip_rt_max_size,
    .maxlen = sizeof(int),
    .mode = 0644,
    - .proc_handler = &proc_dointvec,
    + .proc_handler = &proc_dointvec_route,
    + .strategy = &sysctl_intvec_route,
    },
    {
    /* Deprecated. Use gc_min_interval_ms */
    @@ -3271,6 +3330,15 @@ int __init ip_rt_init(void)
    ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
    ip_rt_max_size = (rt_hash_mask + 1) * 16;

    +#ifdef CONFIG_PROC_FS
    + mutex_init(&ipv4_route_lock);
    +#endif
    +
    + mem_reserve_init(&ipv4_route_reserve, "IPv4 route cache",
    + &net_rx_reserve);
    + mem_reserve_kmem_cache_set(&ipv4_route_reserve,
    + ipv4_dst_ops.kmem_cachep, ip_rt_max_size);
    +
    devinet_init();
    ip_fib_init();

    Index: linux-2.6/net/ipv6/route.c
    ===================================================================
    --- linux-2.6.orig/net/ipv6/route.c
    +++ linux-2.6/net/ipv6/route.c
    @@ -37,6 +37,7 @@
    #include
    #include
    #include
    +#include
    #include
    #include
    #include
    @@ -2473,6 +2474,63 @@ int ipv6_sysctl_rtcache_flush(ctl_table
    return -EINVAL;
    }

    +static int proc_dointvec_route(struct ctl_table *table, int write,
    + struct file *filp, void __user *buffer, size_t *lenp,
    + loff_t *ppos)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv6.sysctl.ip6_rt_max_size);
    + ctl_table tmp = *table;
    + int new_size, ret;
    +
    + mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
    + if (write) {
    + tmp.data = &new_size;
    + table = &tmp;
    + }
    +
    + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
    + net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
    + if (!ret)
    + net->ipv6.sysctl.ip6_rt_max_size = new_size;
    + }
    + mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
    +
    + return ret;
    +}
    +
    +static int sysctl_intvec_route(struct ctl_table *table,
    + void __user *oldval, size_t __user *oldlenp,
    + void __user *newval, size_t newlen)
    +{
    + struct net *net = container_of(table->data, struct net,
    + ipv6.sysctl.ip6_rt_max_size);
    + int write = (newval && newlen);
    + ctl_table tmp = *table;
    + int new_size, ret;
    +
    + mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
    + if (write) {
    + tmp.data = &new_size;
    + table = &tmp;
    + }
    +
    + ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
    +
    + if (!ret && write) {
    + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
    + net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
    + if (!ret)
    + net->ipv6.sysctl.ip6_rt_max_size = new_size;
    + }
    + mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
    +
    + return ret;
    +}
    +
    ctl_table ipv6_route_table_template[] = {
    {
    .procname = "flush",
    @@ -2495,7 +2553,8 @@ ctl_table ipv6_route_table_template[] =
    .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
    .maxlen = sizeof(int),
    .mode = 0644,
    - .proc_handler = &proc_dointvec,
    + .proc_handler = &proc_dointvec_route,
    + .strategy = &sysctl_intvec_route,
    },
    {
    .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
    @@ -2583,6 +2642,8 @@ struct ctl_table *ipv6_route_sysctl_init
    table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
    }

    + mutex_init(&net->ipv6.sysctl.ip6_rt_lock);
    +
    return table;
    }
    #endif
    @@ -2636,6 +2697,14 @@ static int ip6_route_net_init(struct net
    net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
    net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

    + mem_reserve_init(&net->ipv6.ip6_rt_reserve, "IPv6 route cache",
    + &net_rx_reserve);
    + ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
    + net->ipv6.ip6_dst_ops->kmem_cachep,
    + net->ipv6.sysctl.ip6_rt_max_size);
    + if (ret)
    + goto out_reserve_fail;
    +
    #ifdef CONFIG_PROC_FS
    proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
    proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
    @@ -2646,12 +2715,15 @@ static int ip6_route_net_init(struct net
    out:
    return ret;

    +out_reserve_fail:
    + mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
    #ifdef CONFIG_IPV6_MULTIPLE_TABLES
    + kfree(net->ipv6.ip6_blk_hole_entry);
    out_ip6_prohibit_entry:
    kfree(net->ipv6.ip6_prohibit_entry);
    out_ip6_null_entry:
    - kfree(net->ipv6.ip6_null_entry);
    #endif
    + kfree(net->ipv6.ip6_null_entry);
    out_ip6_dst_ops:
    release_net(net->ipv6.ip6_dst_ops->dst_net);
    kfree(net->ipv6.ip6_dst_ops);
    @@ -2664,6 +2736,7 @@ static void ip6_route_net_exit(struct ne
    proc_net_remove(net, "ipv6_route");
    proc_net_remove(net, "rt6_stats");
    #endif
    + mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
    kfree(net->ipv6.ip6_null_entry);
    #ifdef CONFIG_IPV6_MULTIPLE_TABLES
    kfree(net->ipv6.ip6_prohibit_entry);
    Index: linux-2.6/include/net/inet_frag.h
    ===================================================================
    --- linux-2.6.orig/include/net/inet_frag.h
    +++ linux-2.6/include/net/inet_frag.h
    @@ -1,6 +1,9 @@
    #ifndef __NET_FRAG_H__
    #define __NET_FRAG_H__

    +#include
    +#include
    +
    struct netns_frags {
    int nqueues;
    atomic_t mem;
    @@ -10,6 +13,10 @@ struct netns_frags {
    int timeout;
    int high_thresh;
    int low_thresh;
    +
    + /* reserves */
    + struct mutex lock;
    + struct mem_reserve reserve;
    };

    struct inet_frag_queue {
    Index: linux-2.6/net/ipv4/inet_fragment.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/inet_fragment.c
    +++ linux-2.6/net/ipv4/inet_fragment.c
    @@ -19,6 +19,7 @@
    #include
    #include
    #include
    +#include

    #include

    @@ -74,6 +75,8 @@ void inet_frags_init_net(struct netns_fr
    nf->nqueues = 0;
    atomic_set(&nf->mem, 0);
    INIT_LIST_HEAD(&nf->lru_list);
    + mutex_init(&nf->lock);
    + mem_reserve_init(&nf->reserve, "IP fragment cache", NULL);
    }
    EXPORT_SYMBOL(inet_frags_init_net);

    Index: linux-2.6/include/net/netns/ipv6.h
    ===================================================================
    --- linux-2.6.orig/include/net/netns/ipv6.h
    +++ linux-2.6/include/net/netns/ipv6.h
    @@ -24,6 +24,8 @@ struct netns_sysctl_ipv6 {
    int ip6_rt_mtu_expires;
    int ip6_rt_min_advmss;
    int icmpv6_time;
    +
    + struct mutex ip6_rt_lock;
    };

    struct netns_ipv6 {
    @@ -55,5 +57,7 @@ struct netns_ipv6 {
    struct sock *ndisc_sk;
    struct sock *tcp_sk;
    struct sock *igmp_sk;
    +
    + struct mem_reserve ip6_rt_reserve;
    };
    #endif

    --


  20. [PATCH 16/32] net: wrap sk->sk_backlog_rcv()

    Wrap calling sk->sk_backlog_rcv() in a function. This will allow extending the
    generic sk_backlog_rcv behaviour.
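
    For illustration only (this is not the follow-up change itself): with every
    call site funnelled through the wrapper, later patches can add policy in a
    single place, for instance processing reserve-backed packets under
    PF_MEMALLOC. skb_uses_reserves() below is a hypothetical predicate, not an
    existing helper.

    /* Hypothetical extension sketch, not part of this patch. */
    static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
    {
            unsigned long pflags = current->flags;
            int ret;

            if (skb_uses_reserves(skb))             /* hypothetical predicate */
                    current->flags |= PF_MEMALLOC;  /* allow reserve access */

            ret = sk->sk_backlog_rcv(sk, skb);

            if (!(pflags & PF_MEMALLOC))            /* restore the original state */
                    current->flags &= ~PF_MEMALLOC;

            return ret;
    }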

    Signed-off-by: Peter Zijlstra
    ---
    include/net/sock.h | 5 +++++
    include/net/tcp.h | 2 +-
    net/core/sock.c | 4 ++--
    net/ipv4/tcp.c | 2 +-
    net/ipv4/tcp_timer.c | 2 +-
    5 files changed, 10 insertions(+), 5 deletions(-)

    Index: linux-2.6/include/net/sock.h
    ===================================================================
    --- linux-2.6.orig/include/net/sock.h
    +++ linux-2.6/include/net/sock.h
    @@ -482,6 +482,11 @@ static inline void sk_add_backlog(struct
    skb->next = NULL;
    }

    +static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
    +{
    + return sk->sk_backlog_rcv(sk, skb);
    +}
    +
    #define sk_wait_event(__sk, __timeo, __condition) \
    ({ int __rc; \
    release_sock(__sk); \
    Index: linux-2.6/net/core/sock.c
    ===================================================================
    --- linux-2.6.orig/net/core/sock.c
    +++ linux-2.6/net/core/sock.c
    @@ -324,7 +324,7 @@ int sk_receive_skb(struct sock *sk, stru
    */
    mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

    - rc = sk->sk_backlog_rcv(sk, skb);
    + rc = sk_backlog_rcv(sk, skb);

    mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
    } else
    @@ -1371,7 +1371,7 @@ static void __release_sock(struct sock *
    struct sk_buff *next = skb->next;

    skb->next = NULL;
    - sk->sk_backlog_rcv(sk, skb);
    + sk_backlog_rcv(sk, skb);

    /*
    * We are in process context here with softirqs
    Index: linux-2.6/net/ipv4/tcp.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/tcp.c
    +++ linux-2.6/net/ipv4/tcp.c
    @@ -1157,7 +1157,7 @@ static void tcp_prequeue_process(struct
    * necessary */
    local_bh_disable();
    while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
    - sk->sk_backlog_rcv(sk, skb);
    + sk_backlog_rcv(sk, skb);
    local_bh_enable();

    /* Clear memory counter. */
    Index: linux-2.6/net/ipv4/tcp_timer.c
    ===================================================================
    --- linux-2.6.orig/net/ipv4/tcp_timer.c
    +++ linux-2.6/net/ipv4/tcp_timer.c
    @@ -201,7 +201,7 @@ static void tcp_delack_timer(unsigned lo
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);

    while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
    - sk->sk_backlog_rcv(sk, skb);
    + sk_backlog_rcv(sk, skb);

    tp->ucopy.memory = 0;
    }
    Index: linux-2.6/include/net/tcp.h
    ===================================================================
    --- linux-2.6.orig/include/net/tcp.h
    +++ linux-2.6/include/net/tcp.h
    @@ -894,7 +894,7 @@ static inline int tcp_prequeue(struct so
    BUG_ON(sock_owned_by_user(sk));

    while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
    - sk->sk_backlog_rcv(sk, skb1);
    + sk_backlog_rcv(sk, skb1);
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED);
    }


    --

