Web lists-archives.com

[RFC PATCH 2/2] mm, fs: daxfile, an interface for byte-addressable updates to pmem




To date, the full promise of byte-addressable access to persistent
memory has only been half realized via the filesystem-dax interface. The
current filesystem-dax mechanism allows an application to consume (read)
data from persistent storage at byte-size granularity, bypassing the
full page reads required by traditional storage devices.

Now, for writes, applications still need to contend with
page-granularity dirtying and flushing semantics as well as filesystem
coordination for metadata updates after any mmap write. The current
situation precludes use cases that leverage byte-granularity / in-place
updates to persistent media.

To get around this limitation there are some specialized applications
that are using the device-dax interface to bypass the overhead and
data-safety problems of the current filesystem-dax mmap-write path.
QEMU-KVM is forced to use device-dax to safely pass through persistent
memory to a guest [1]. Some specialized databases are using device-dax
for byte-granularity writes. Outside of those cases, device-dax is
difficult for general purpose persistent memory applications to consume.
There is demand for access to pmem without needing to contend with
special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave
Chinner's ideas for allowing pmem applications to safely bypass
fsync/msync requirements. The idea is to make the file immutable with
respect to the offset-to-block mappings for every extent in the file
[2]. It turns out that filesystems already need to make this guarantee
today. This property is needed for files marked as swap files.

The new daxctl() syscall manages setting a file into 'static-dax' mode
whereby it arranges for the file to be treated as a swapfile as far as
the filesystem is concerned, but not registered with the core-mm as
swapfile space. A file in this mode is then safe to be mapped and
written without the requirement to fsync/msync the writes.  The cpu
cache management for flushing data to persistence can be handled
completely in userspace.

[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

Cc: Jan Kara <jack@xxxxxxx>
Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dave Chinner <david@xxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 include/linux/dax.h                    |    9 ++
 include/linux/fs.h                     |    3 +
 include/linux/syscalls.h               |    1 
 include/uapi/linux/dax.h               |    8 +
 mm/Kconfig                             |    5 +
 mm/Makefile                            |    1 
 mm/daxfile.c                           |  186 ++++++++++++++++++++++++++++++++
 mm/page_io.c                           |   31 +++++
 9 files changed, 245 insertions(+)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..795eb93d6beb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330	common	pkey_alloc		sys_pkey_alloc
 331	common	pkey_free		sys_pkey_free
 332	common	statx			sys_statx
+333	64	daxctl			sys_daxctl
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 5ec1f6c47716..5f1d0e0ed30f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -4,8 +4,17 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/radix-tree.h>
+#include <uapi/linux/dax.h>
 #include <asm/pgtable.h>
 
+/*
+ * TODO: make sys_daxctl() be the generic interface for toggling S_DAX
+ * across filesystems. For now, mark DAXCTL_F_DAX as an invalid flag
+ */
+#define DAXCTL_VALID_FLAGS (DAXCTL_F_GET | DAXCTL_F_STATIC)
+
+int daxfile_activate(struct file *daxfile, unsigned align);
+
 struct iomap_ops;
 struct dax_device;
 struct dax_operations {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e68cabb8457..3af649fb669f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1824,8 +1824,10 @@ struct super_operations {
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
 #ifdef CONFIG_FS_DAX
 #define S_DAX		8192	/* Direct Access, avoiding the page cache */
+#define S_DAXFILE	16384	/* no truncate (swapfile) semantics + dax */
 #else
 #define S_DAX		0	/* Make all the DAX code disappear */
+#define S_DAXFILE	0
 #endif
 
 /*
@@ -1865,6 +1867,7 @@ struct super_operations {
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)
 #define IS_DAX(inode)		((inode)->i_flags & S_DAX)
+#define IS_DAXFILE(inode)	((inode)->i_flags & S_DAXFILE)
 
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..49e5cc4c192e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -701,6 +701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
 asmlinkage long sys_swapoff(const char __user *specialfile);
+asmlinkage long sys_daxctl(const char __user *path, int flags, int align);
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
 asmlinkage long sys_sysinfo(struct sysinfo __user *info);
 asmlinkage long sys_sysfs(int option,
diff --git a/include/uapi/linux/dax.h b/include/uapi/linux/dax.h
new file mode 100644
index 000000000000..78a41bb392c0
--- /dev/null
+++ b/include/uapi/linux/dax.h
@@ -0,0 +1,8 @@
+#ifndef _UAPI_LINUX_DAX_H
+#define _UAPI_LINUX_DAX_H
+
+#define DAXCTL_F_GET    (1 << 0)
+#define DAXCTL_F_DAX    (1 << 1)
+#define DAXCTL_F_STATIC (1 << 2)
+
+#endif /* _UAPI_LINUX_DAX_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index beb7a455915d..b874565c34eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -450,6 +450,11 @@ config	TRANSPARENT_HUGE_PAGECACHE
 	def_bool y
 	depends on TRANSPARENT_HUGEPAGE
 
+config DAXFILE
+	def_bool y
+	depends on FS_DAX
+	depends on SWAP
+
 #
 # UP and nommu archs use km based percpu allocator
 #
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..38d9025a3e37 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -56,6 +56,7 @@ endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_DAXFILE)	+= daxfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/mm/daxfile.c b/mm/daxfile.c
new file mode 100644
index 000000000000..fe230199c855
--- /dev/null
+++ b/mm/daxfile.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/dax.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+
+/*
+ * TODO: a list to look up daxfiles assumes a low number of instances,
+ * revisit.
+ */
+static LIST_HEAD(daxfiles);
+static DEFINE_SPINLOCK(dax_lock);
+
+struct dax_info {
+	struct list_head list;		/* link on the global daxfiles list */
+	struct file *daxfile;		/* file held open while registered */
+};
+
+/*
+ * Take @victim's inode out of static-dax mode. Called from sys_daxctl()
+ * with the inode lock held. Returns -EINVAL if no daxfile is registered
+ * for the mapping, 0 otherwise.
+ */
+static int daxfile_disable(struct file *victim)
+{
+	struct address_space *mapping = victim->f_mapping;
+	struct dax_info *d, *found = NULL;
+
+	spin_lock(&dax_lock);
+	list_for_each_entry(d, &daxfiles, list)
+		if (d->daxfile->f_mapping == mapping) {
+			list_del(&d->list);
+			found = d;
+			break;
+		}
+	spin_unlock(&dax_lock);
+	if (!found)
+		return -EINVAL;
+
+	mapping->host->i_flags &= ~(S_SWAPFILE | S_DAXFILE);
+	/* close the file that daxfile_enable() left open on the list ... */
+	filp_close(found->daxfile, NULL);
+	/* ... and free the unlinked entry that daxfile_enable() allocated */
+	kfree(found);
+
+	return 0;
+}
+
+/* Gate daxfile_enable(): 0 if @inode may be claimed, else -EINVAL/-EBUSY. */
+static int claim_daxfile_checks(struct inode *inode)
+{
+	/* must be a regular file that already has S_DAX set */
+	if (!(S_ISREG(inode->i_mode) && IS_DAX(inode)))
+		return -EINVAL;
+
+	/* refuse inodes already marked as a swapfile or a daxfile */
+	if (IS_DAXFILE(inode) || IS_SWAPFILE(inode))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Put @daxfile's inode into static-dax mode. Called from sys_daxctl()
+ * with the inode lock held. On success the list entry keeps @daxfile,
+ * so the caller must not close it. @align is forwarded unchanged to
+ * daxfile_activate(). Returns 0 or a negative errno.
+ */
+static int daxfile_enable(struct file *daxfile, int align)
+{
+	struct inode *inode = daxfile->f_mapping->host;
+	struct dax_info *d;
+	int rc;
+
+	if (align < 0)
+		return -EINVAL;
+
+	rc = claim_daxfile_checks(inode);
+	if (rc)
+		return rc;
+
+	rc = daxfile_activate(daxfile, align);
+	if (rc)
+		return rc;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+	d->daxfile = daxfile;
+
+	spin_lock(&dax_lock);
+	list_add(&d->list, &daxfiles);
+	spin_unlock(&dax_lock);
+
+	/*
+	 * S_SWAPFILE gives "no truncate" / static block allocation
+	 * semantics; S_DAXFILE tells these apart from real swapfiles
+	 * so the dax mmap path can assume static block mappings.
+	 */
+	inode->i_flags |= S_SWAPFILE | S_DAXFILE;
+	return 0;
+}
+
+/*
+ * daxctl(2): query or change the static-dax state of the file at @path.
+ * DAXCTL_F_GET returns DAXCTL_F_STATIC or 0; otherwise DAXCTL_F_STATIC
+ * set in @flags enables the mode and clear disables it. Changing state
+ * needs CAP_LINUX_IMMUTABLE. @align is passed to daxfile_enable().
+ */
+SYSCALL_DEFINE3(daxctl, const char __user *, path, int, flags, int, align)
+{
+	struct file *daxfile = NULL;
+	struct filename *name;
+	struct inode *inode;
+	bool is_static, want_static;
+	int rc = 0;
+
+	if (flags & ~DAXCTL_VALID_FLAGS)
+		return -EINVAL;
+
+	name = getname(path);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	daxfile = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(daxfile)) {
+		rc = PTR_ERR(daxfile);
+		daxfile = NULL;
+		goto out;
+	}
+
+	inode = daxfile->f_mapping->host;
+
+	if (flags & DAXCTL_F_GET) {
+		/*
+		 * Only DAXCTL_F_STATIC is reported: applications have no
+		 * action to take based on S_DAX today.
+		 *
+		 * TODO: revisit reporting DAXCTL_F_DAX for IS_DAX() if
+		 * this interface is ever used for toggling S_DAX, since
+		 * userspace would presumably want to see that flag.
+		 */
+		rc = IS_DAXFILE(inode) ? DAXCTL_F_STATIC : 0;
+		goto out;
+	}
+
+	/*
+	 * TODO: Should unprivileged users be allowed to control daxfile
+	 * behavior? Perhaps a mount flag... is -o dax that flag?
+	 */
+	if (!capable(CAP_LINUX_IMMUTABLE)) {
+		rc = -EPERM;
+		goto out;
+	}
+
+	want_static = !!(flags & DAXCTL_F_STATIC);
+	inode_lock(inode);
+	is_static = !!IS_DAXFILE(inode);
+	if (want_static && !is_static) {
+		rc = daxfile_enable(daxfile, align);
+		/* on success the file stays open, so skip the close below */
+		if (rc == 0)
+			daxfile = NULL;
+	} else if (is_static && !want_static) {
+		rc = daxfile_disable(daxfile);
+	}
+	inode_unlock(inode);
+
+out:
+	if (daxfile)
+		filp_close(daxfile, NULL);
+	putname(name);
+	return rc;
+}
diff --git a/mm/page_io.c b/mm/page_io.c
index 5cec9a3d49f2..35160ad9c51f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -244,6 +244,37 @@ static int bmap_walk(struct file *file, const unsigned page_size,
 	goto out;
 }
 
+/*
+ * bmap_walk() callback used by daxfile_activate().
+ *
+ * Unlike the swapfile case, fail the walk if any file extent is not
+ * page aligned: only BMAP_WALK_FULLPAGE and BMAP_WALK_DONE are
+ * accepted, every other walk type yields -EINVAL.
+ */
+static int daxfile_check(sector_t block, unsigned long page_no,
+		enum bmap_check type, void *none)
+{
+	const bool ok = type == BMAP_WALK_DONE || type == BMAP_WALK_FULLPAGE;
+
+	return ok ? 0 : -EINVAL;
+}
+
+/* Validate @align (0 selects PAGE_SIZE) and walk @daxfile's block map. */
+int daxfile_activate(struct file *daxfile, unsigned align)
+{
+	const unsigned step = align ? align : PAGE_SIZE;
+	int rc;
+
+	/* anything smaller than a page, or not a power of 2, is rejected */
+	if (step < PAGE_SIZE || !is_power_of_2(step))
+		return -EINVAL;
+
+	rc = bmap_walk(daxfile, step, ULONG_MAX, NULL, daxfile_check, NULL);
+	if (rc)
+		pr_debug("daxctl: daxfile has holes\n");
+	return rc;
+}
+
 static int swapfile_check(sector_t block, unsigned long page_no,
 		enum bmap_check type, void *_sis)
 {