As per recommendations made at LSF'08, a stackable file system which does
not change the data across layers, should try to use vm_operations instead
of address_space_operations. This means we now have to implement out own
->read and ->write methods because we cannot rely on VFS helpers which
require us to have a ->readpage method. Either way there are two caveats:

(1) It's not possible currently to set inode->i_mapping->a_ops to NULL,
because too many code paths expect i_mapping to be non-NULL.

(2) a small/dummy ->readpage is still needed because generic_file_mmap,
which we used in unionfs_mmap, still check for the existence of the
->readpage method. These code paths may have to be changed to remove the
need for readpage().

Signed-off-by: Erez Zadok
---
fs/unionfs/file.c | 109 +++++++++++++++--
fs/unionfs/mmap.c | 338 +++++++---------------------------------------------
fs/unionfs/union.h | 4 +-
3 files changed, 147 insertions(+), 304 deletions(-)

diff --git a/fs/unionfs/file.c b/fs/unionfs/file.c
index 1fcaf8e..9397ff3 100644
--- a/fs/unionfs/file.c
+++ b/fs/unionfs/file.c
@@ -18,17 +18,83 @@

#include "union.h"

+static ssize_t unionfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+
+ unionfs_read_lock(dentry->d_sb, UNIONFS_SMUTEX_PARENT);
+ unionfs_lock_dentry(dentry, UNIONFS_DMUTEX_CHILD);
+ err = unionfs_file_revalidate_locked(file, false);
+ if (unlikely(err))
+ goto out;
+
+ lower_file = unionfs_lower_file(file);
+ err = vfs_read(lower_file, buf, count, ppos);
+ /* update our inode atime upon a successful lower read */
+ if (err >= 0) {
+ fsstack_copy_attr_atime(file->f_path.dentry->d_inode,
+ lower_file->f_path.dentry->d_inode);
+ unionfs_check_file(file);
+ }
+
+out:
+ unionfs_unlock_dentry(dentry);
+ unionfs_read_unlock(file->f_path.dentry->d_sb);
+ return err;
+}
+
+static ssize_t unionfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ int err = 0;
+ struct file *lower_file;
+ struct dentry *dentry = file->f_path.dentry;
+
+ unionfs_read_lock(dentry->d_sb, UNIONFS_SMUTEX_PARENT);
+ unionfs_lock_dentry(dentry, UNIONFS_DMUTEX_CHILD);
+ err = unionfs_file_revalidate_locked(file, true);
+ if (unlikely(err))
+ goto out;
+
+ lower_file = unionfs_lower_file(file);
+ err = vfs_write(lower_file, buf, count, ppos);
+ /* update our inode times+sizes upon a successful lower write */
+ if (err >= 0) {
+ fsstack_copy_inode_size(file->f_path.dentry->d_inode,
+ lower_file->f_path.dentry->d_inode);
+ fsstack_copy_attr_times(file->f_path.dentry->d_inode,
+ lower_file->f_path.dentry->d_inode);
+ unionfs_check_file(file);
+ }
+
+out:
+ unionfs_unlock_dentry(dentry);
+ unionfs_read_unlock(file->f_path.dentry->d_sb);
+ return err;
+}
+
+
static int unionfs_file_readdir(struct file *file, void *dirent,
filldir_t filldir)
{
return -ENOTDIR;
}

+int unionfs_readpage_dummy(struct file *file, struct page *page)
+{
+ BUG();
+ return -EINVAL;
+}
+
static int unionfs_mmap(struct file *file, struct vm_area_struct *vma)
{
int err = 0;
bool willwrite;
struct file *lower_file;
+ struct vm_operations_struct *saved_vm_ops = NULL;

unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);

@@ -54,12 +120,39 @@ static int unionfs_mmap(struct file *file, struct vm_area_struct *vma)
err = -EINVAL;
printk(KERN_ERR "unionfs: branch %d file system does not "
"support writeable mmap\n", fbstart(file));
- } else {
- err = generic_file_mmap(file, vma);
- if (err)
- printk(KERN_ERR
- "unionfs: generic_file_mmap failed %d\n", err);
+ goto out;
+ }
+
+ /*
+ * find and save lower vm_ops.
+ *
+ * XXX: the VFS should have a cleaner way of finding the lower vm_ops
+ */
+ if (!UNIONFS_F(file)->lower_vm_ops) {
+ err = lower_file->f_op->mmap(lower_file, vma);
+ if (err) {
+ printk(KERN_ERR "unionfs: lower mmap failed %d\n", err);
+ goto out;
+ }
+ saved_vm_ops = vma->vm_ops;
+ err = do_munmap(current->mm, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ if (err) {
+ printk(KERN_ERR "unionfs: do_munmap failed %d\n", err);
+ goto out;
+ }
+ }
+
+ file->f_mapping->a_ops = &unionfs_dummy_aops;
+ err = generic_file_mmap(file, vma);
+ file->f_mapping->a_ops = &unionfs_aops;
+ if (err) {
+ printk(KERN_ERR "unionfs: generic_file_mmap failed %d\n", err);
+ goto out;
}
+ vma->vm_ops = &unionfs_vm_ops;
+ if (!UNIONFS_F(file)->lower_vm_ops)
+ UNIONFS_F(file)->lower_vm_ops = saved_vm_ops;

out:
if (!err) {
@@ -222,10 +315,8 @@ out:

struct file_operations unionfs_main_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = unionfs_read,
+ .write = unionfs_write,
.readdir = unionfs_file_readdir,
.unlocked_ioctl = unionfs_ioctl,
.mmap = unionfs_mmap,
diff --git a/fs/unionfs/mmap.c b/fs/unionfs/mmap.c
index d6ac61e..07db5b0 100644
--- a/fs/unionfs/mmap.c
+++ b/fs/unionfs/mmap.c
@@ -19,316 +19,66 @@

#include "union.h"

-static int unionfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err = -EIO;
- struct inode *inode;
- struct inode *lower_inode;
- struct page *lower_page;
- struct address_space *lower_mapping; /* lower inode mapping */
- gfp_t mask;
-
- BUG_ON(!PageUptodate(page));
- inode = page->mapping->host;
- /* if no lower inode, nothing to do */
- if (!inode || !UNIONFS_I(inode) || UNIONFS_I(inode)->lower_inodes) {
- err = 0;
- goto out;
- }
- lower_inode = unionfs_lower_inode(inode);
- lower_mapping = lower_inode->i_mapping;
-
- /*
- * find lower page (returns a locked page)
- *
- * We turn off __GFP_FS while we look for or create a new lower
- * page. This prevents a recursion into the file system code, which
- * under memory pressure conditions could lead to a deadlock. This
- * is similar to how the loop driver behaves (see loop_set_fd in
- * drivers/block/loop.c). If we can't find the lower page, we
- * redirty our page and return "success" so that the VM will call us
- * again in the (hopefully near) future.
- */
- mask = mapping_gfp_mask(lower_mapping) & ~(__GFP_FS);
- lower_page = find_or_create_page(lower_mapping, page->index, mask);
- if (!lower_page) {
- err = 0;
- set_page_dirty(page);
- goto out;
- }
-
- /* copy page data from our upper page to the lower page */
- copy_highpage(lower_page, page);
- flush_dcache_page(lower_page);
- SetPageUptodate(lower_page);
- set_page_dirty(lower_page);
-
- /*
- * Call lower writepage (expects locked page). However, if we are
- * called with wbc->for_reclaim, then the VFS/VM just wants to
- * reclaim our page. Therefore, we don't need to call the lower
- * ->writepage: just copy our data to the lower page (already done
- * above), then mark the lower page dirty and unlock it, and return
- * success.
- */
- if (wbc->for_reclaim) {
- unlock_page(lower_page);
- goto out_release;
- }
-
- BUG_ON(!lower_mapping->a_ops->writepage);
- wait_on_page_writeback(lower_page); /* prevent multiple writers */
- clear_page_dirty_for_io(lower_page); /* emulate VFS behavior */
- err = lower_mapping->a_ops->writepage(lower_page, wbc);
- if (err < 0)
- goto out_release;
-
- /*
- * Lower file systems such as ramfs and tmpfs, may return
- * AOP_WRITEPAGE_ACTIVATE so that the VM won't try to (pointlessly)
- * write the page again for a while. But those lower file systems
- * also set the page dirty bit back again. Since we successfully
- * copied our page data to the lower page, then the VM will come
- * back to the lower page (directly) and try to flush it. So we can
- * save the VM the hassle of coming back to our page and trying to
- * flush too. Therefore, we don't re-dirty our own page, and we
- * never return AOP_WRITEPAGE_ACTIVATE back to the VM (we consider
- * this a success).
- *
- * We also unlock the lower page if the lower ->writepage returned
- * AOP_WRITEPAGE_ACTIVATE. (This "anomalous" behaviour may be
- * addressed in future shmem/VM code.)
- */
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- err = 0;
- unlock_page(lower_page);
- }
-
- /* all is well */
-
- /* lower mtimes have changed: update ours */
- unionfs_copy_attr_times(inode);
-
-out_release:
- /* b/c find_or_create_page increased refcnt */
- page_cache_release(lower_page);
-out:
- /*
- * We unlock our page unconditionally, because we never return
- * AOP_WRITEPAGE_ACTIVATE.
- */
- unlock_page(page);
- return err;
-}

-static int unionfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+/*
+ * XXX: we need a dummy readpage handler because generic_file_mmap (which we
+ * use in unionfs_mmap) checks for the existence of
+ * mapping->a_ops->readpage, else it returns -ENOEXEC. The VFS will need to
+ * be fixed to allow a file system to define vm_ops->fault without any
+ * address_space_ops whatsoever.
+ *
+ * Otherwise, we don't want to use our readpage method at all.
+ */
+static int unionfs_readpage(struct file *file, struct page *page)
{
- int err = 0;
- struct inode *lower_inode;
- struct inode *inode;
-
- inode = mapping->host;
- if (ibstart(inode) < 0 && ibend(inode) < 0)
- goto out;
- lower_inode = unionfs_lower_inode(inode);
- if (!lower_inode)
- goto out;
-
- err = generic_writepages(mapping, wbc);
- if (!err)
- unionfs_copy_attr_times(inode);
-out:
- return err;
+ BUG();
+ return -EINVAL;
}

-/* Readpage expects a locked page, and must unlock it */
-static int unionfs_readpage(struct file *file, struct page *page)
+static int unionfs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int err;
- struct file *lower_file;
- struct inode *inode;
- mm_segment_t old_fs;
- char *page_data = NULL;
- mode_t orig_mode;
+ struct file *file, *lower_file;
+ struct vm_operations_struct *lower_vm_ops;

- unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
- err = unionfs_file_revalidate(file, false);
- if (unlikely(err))
- goto out;
- unionfs_check_file(file);
-
- if (!UNIONFS_F(file)) {
- err = -ENOENT;
- goto out;
- }
+ BUG_ON(!vma);
+ file = vma->vm_file;
+ lower_vm_ops = UNIONFS_F(file)->lower_vm_ops;
+ BUG_ON(!lower_vm_ops);

lower_file = unionfs_lower_file(file);
- /* FIXME: is this assertion right here? */
- BUG_ON(lower_file == NULL);
-
- inode = file->f_path.dentry->d_inode;
-
- page_data = (char *) kmap(page);
- /*
- * Use vfs_read because some lower file systems don't have a
- * readpage method, and some file systems (esp. distributed ones)
- * don't like their pages to be accessed directly. Using vfs_read
- * may be a little slower, but a lot safer, as the VFS does a lot of
- * the necessary magic for us.
- */
- lower_file->f_pos = page_offset(page);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /*
- * generic_file_splice_write may call us on a file not opened for
- * reading, so temporarily allow reading.
- */
- orig_mode = lower_file->f_mode;
- lower_file->f_mode |= FMODE_READ;
- err = vfs_read(lower_file, page_data, PAGE_CACHE_SIZE,
- &lower_file->f_pos);
- lower_file->f_mode = orig_mode;
- set_fs(old_fs);
- if (err >= 0 && err < PAGE_CACHE_SIZE)
- memset(page_data + err, 0, PAGE_CACHE_SIZE - err);
- kunmap(page);
-
- if (err < 0)
- goto out;
- err = 0;
-
- /* if vfs_read succeeded above, sync up our times */
- unionfs_copy_attr_times(inode);
-
- flush_dcache_page(page);
-
+ BUG_ON(!lower_file);
/*
- * we have to unlock our page, b/c we _might_ have gotten a locked
- * page. but we no longer have to wakeup on our page here, b/c
- * UnlockPage does it
+ * XXX: we set the vm_file to the lower_file, before calling the
+ * lower ->fault op, then we restore the vm_file back to the upper
+ * file. Need to change the ->fault prototype to take an explicit
+ * struct file, and fix all users accordingly.
*/
-out:
- if (err == 0)
- SetPageUptodate(page);
- else
- ClearPageUptodate(page);
-
- unlock_page(page);
- unionfs_check_file(file);
- unionfs_read_unlock(file->f_path.dentry->d_sb);
-
- return err;
-}
-
-static int unionfs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- int err;
-
- unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
- err = unionfs_file_revalidate(file, true);
- if (!err) {
- unionfs_copy_attr_times(file->f_path.dentry->d_inode);
- unionfs_check_file(file);
- }
- unionfs_read_unlock(file->f_path.dentry->d_sb);
-
+ vma->vm_file = lower_file;
+ err = lower_vm_ops->fault(vma, vmf);
+ vma->vm_file = file;
return err;
}

-static int unionfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
-{
- int err = -ENOMEM;
- struct inode *inode, *lower_inode;
- struct file *lower_file = NULL;
- unsigned bytes = to - from;
- char *page_data = NULL;
- mm_segment_t old_fs;
-
- BUG_ON(file == NULL);
-
- unionfs_read_lock(file->f_path.dentry->d_sb, UNIONFS_SMUTEX_PARENT);
- err = unionfs_file_revalidate(file, true);
- if (unlikely(err))
- goto out;
- unionfs_check_file(file);
-
- inode = page->mapping->host;
-
- if (UNIONFS_F(file) != NULL)
- lower_file = unionfs_lower_file(file);
-
- /* FIXME: is this assertion right here? */
- BUG_ON(lower_file == NULL);
-
- page_data = (char *)kmap(page);
- lower_file->f_pos = page_offset(page) + from;
-
- /*
- * We use vfs_write instead of copying page data and the
- * prepare_write/commit_write combo because file system's like
- * GFS/OCFS2 don't like things touching those directly,
- * calling the underlying write op, while a little bit slower, will
- * call all the FS specific code as well
- */
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = vfs_write(lower_file, page_data + from, bytes,
- &lower_file->f_pos);
- set_fs(old_fs);
-
- kunmap(page);
-
- if (err < 0)
- goto out;
-
- /* if vfs_write succeeded above, sync up our times/sizes */
- lower_inode = lower_file->f_path.dentry->d_inode;
- if (!lower_inode)
- lower_inode = unionfs_lower_inode(inode);
- BUG_ON(!lower_inode);
- fsstack_copy_inode_size(inode, lower_inode);
- unionfs_copy_attr_times(inode);
- mark_inode_dirty_sync(inode);
-
-out:
- if (err < 0)
- ClearPageUptodate(page);
-
- unionfs_check_file(file);
- unionfs_read_unlock(file->f_path.dentry->d_sb);
- return err; /* assume all is ok */
-}
-
/*
- * Although unionfs isn't a block-based file system, it may stack on one.
- * ->bmap is needed, for example, to swapon(2) files.
+ * XXX: the default address_space_ops for unionfs is empty. We cannot set
+ * our inode->i_mapping->a_ops to NULL because too many code paths expect
+ * the a_ops vector to be non-NULL.
*/
-sector_t unionfs_bmap(struct address_space *mapping, sector_t block)
-{
- int err = -EINVAL;
- struct inode *inode, *lower_inode;
- sector_t (*bmap)(struct address_space *, sector_t);
-
- inode = (struct inode *)mapping->host;
- lower_inode = unionfs_lower_inode(inode);
- if (!lower_inode)
- goto out;
- bmap = lower_inode->i_mapping->a_ops->bmap;
- if (bmap)
- err = bmap(lower_inode->i_mapping, block);
-out:
- return err;
-}
-
-
struct address_space_operations unionfs_aops = {
- .writepage = unionfs_writepage,
- .writepages = unionfs_writepages,
+ /* empty on purpose */
+};
+
+/*
+ * XXX: we need a second, dummy address_space_ops vector, to be used
+ * temporarily during unionfs_mmap, because the latter calls
+ * generic_file_mmap, which checks if ->readpage exists, else returns
+ * -ENOEXEC.
+ */
+struct address_space_operations unionfs_dummy_aops = {
.readpage = unionfs_readpage,
- .prepare_write = unionfs_prepare_write,
- .commit_write = unionfs_commit_write,
- .bmap = unionfs_bmap,
+};
+
+struct vm_operations_struct unionfs_vm_ops = {
+ .fault = unionfs_fault,
};
diff --git a/fs/unionfs/union.h b/fs/unionfs/union.h
index 7b55e33..f6fabf8 100644
--- a/fs/unionfs/union.h
+++ b/fs/unionfs/union.h
@@ -75,7 +75,8 @@ extern struct inode_operations unionfs_dir_iops;
extern struct inode_operations unionfs_symlink_iops;
extern struct super_operations unionfs_sops;
extern struct dentry_operations unionfs_dops;
-extern struct address_space_operations unionfs_aops;
+extern struct address_space_operations unionfs_aops, unionfs_dummy_aops;
+extern struct vm_operations_struct unionfs_vm_ops;

/* How long should an entry be allowed to persist */
#define RDCACHE_JIFFIES (5*HZ)
@@ -89,6 +90,7 @@ struct unionfs_file_info {
struct unionfs_dir_state *rdstate;
struct file **lower_files;
int *saved_branch_ids; /* IDs of branches when file was opened */
+ struct vm_operations_struct *lower_vm_ops;
};

/* unionfs inode data in memory */
--
1.5.2.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/