Skip to content

Instantly share code, notes, and snippets.

@dvdhrm
Created February 28, 2014 18:20
Show Gist options
  • Save dvdhrm/9276657 to your computer and use it in GitHub Desktop.
Save dvdhrm/9276657 to your computer and use it in GitHub Desktop.
memfd_create() syscall
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 96bc506..c943b8a 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -359,3 +359,4 @@
350 i386 finit_module sys_finit_module
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
+353 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index a12bddc..e9d56a8 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -322,6 +322,7 @@
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
+316 common memfd_create sys_memfd_create
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ef68665..8751511 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/shmem_fs.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -326,6 +327,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
+ case MEMFD_SET_SEALS:
+ case MEMFD_GET_SEALS:
+ err = shmem_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 9d55438..a980939 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -21,6 +21,7 @@ struct shmem_inode_info {
struct list_head swaplist; /* chain of maybes on swap */
struct simple_xattrs xattrs; /* list of xattrs */
+	u32			seals;		/* memfd seals; keep before vfs_inode */
 	struct inode		vfs_inode;
};
struct shmem_sb_info {
@@ -56,6 +57,7 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
extern int shmem_unuse(swp_entry_t entry, struct page *page);
+extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a747a77..a344673 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -791,6 +791,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
+asmlinkage long sys_memfd_create(const char __user *uname_ptr, u64 size, u64 seals, u64 flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 074b886..761abbb 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -28,6 +28,19 @@
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
/*
+ * fcntl() commands to set/get the seals of a memfd (shmem file)
+ */
+#define MEMFD_SET_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define MEMFD_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+/*
+ * Types of memfd seals (bitmask values for MEMFD_SET_SEALS/MEMFD_GET_SEALS)
+ */
+#define MEMFD_SEAL_SHRINK 0x0001 /* prevent file from shrinking */
+#define MEMFD_SEAL_WRITE 0x0002 /* prevent any writes */
+#define MEMFD_SEAL_EXCLFD 0x0004 /* require excl-FD for sealing */
+
+/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052..64f5e29 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -193,6 +193,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f18c9d..0968de8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,6 +66,8 @@ static struct vfsmount *shm_mnt;
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -596,16 +598,21 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
int error;
error = inode_change_ok(inode, attr);
if (error)
return error;
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- loff_t oldsize = inode->i_size;
- loff_t newsize = attr->ia_size;
+ /* protected by i_mutex */
+ if ((attr->ia_valid & ATTR_SIZE) && newsize < oldsize &&
+ (info->seals & MEMFD_SEAL_SHRINK))
+ return -EPERM;
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
if (newsize != oldsize) {
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -1354,6 +1361,13 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct inode *inode = file_inode(file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ /* protected by mmap_sem and owns additional file-reference */
+ if ((info->seals & MEMFD_SEAL_WRITE) && vma->vm_flags & VM_WRITE)
+ return -EPERM;
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
return 0;
@@ -1433,7 +1447,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
+ struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+
+ /* i_mutex is held by caller */
+ if (info->seals & MEMFD_SEAL_WRITE)
+ return -EPERM;
+
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
@@ -1802,11 +1822,84 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
+/*
+ * shmem_fcntl - handle MEMFD_SET_SEALS / MEMFD_GET_SEALS on a shmem file
+ * @file: file the fcntl() was issued on
+ * @cmd: MEMFD_SET_SEALS or MEMFD_GET_SEALS
+ * @arg: new seal bitmask for MEMFD_SET_SEALS, unused otherwise
+ *
+ * Sealing lets mutually untrusted parties share a shmem-file under a
+ * well-defined policy by restricting the file operations allowed on it.
+ * Seals always affect the whole underlying inode:
+ *   SHRINK: Prevent the file from shrinking
+ *   WRITE:  Prevent write access to the file
+ *   EXCLFD: Require exclusive FD for sealing
+ *
+ * Parties need not trust each other, so seals can only be changed by the
+ * owner of an exclusive reference to the shmem-file. This is checked per
+ * file by default and per FD if EXCLFD is set. mmap() also keeps a
+ * file-reference, so seals cannot be changed with EXCLFD while mapped.
+ *
+ * Returns the seals (GET), 0 (SET), -EBADF for non-shmem files, -EINVAL for
+ * unknown commands or seal bits, or -EPERM if the file is shared.
+ */
+long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	bool shared;
+	long r;
+
+	if (file->f_op != &shmem_file_operations)
+		return -EBADF;
+
+	switch (cmd) {
+	case MEMFD_SET_SEALS:
+		/* reject unknown bits so they never end up in info->seals */
+		if (arg & ~(unsigned long)(MEMFD_SEAL_SHRINK | MEMFD_SEAL_WRITE |
+					   MEMFD_SEAL_EXCLFD))
+			return -EINVAL;
+
+		/*
+		 * - i_lock prevents racing open() calls and new inode-refs
+		 * - mmap_sem prevents racing mmap() calls
+		 * - i_mutex prevents racing write/ftruncate/fallocate/..
+		 */
+		mutex_lock(&inode->i_mutex);
+		down_read(&current->mm->mmap_sem);
+		spin_lock(&inode->i_lock);
+
+		shared = d_count(dentry) > 1 ||
+			 atomic_read(&inode->i_count) > 1;
+		if (info->seals & MEMFD_SEAL_EXCLFD)
+			shared = shared || file_count(file) > 1;
+
+		r = shared ? -EPERM : 0;
+		if (!shared)
+			info->seals = (u32)arg;
+
+		spin_unlock(&inode->i_lock);
+		up_read(&current->mm->mmap_sem);
+		mutex_unlock(&inode->i_mutex);
+		break;
+	case MEMFD_GET_SEALS:
+		r = info->seals;
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
pgoff_t start, index, end;
int error;
@@ -1818,6 +1911,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ /* protected by i_mutex */
+ if (info->seals & MEMFD_SEAL_WRITE) {
+ error = -EPERM;
+ goto out;
+ }
+
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
@@ -3048,3 +3147,72 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
+
+#define MEMFD_ALL_SEALS (MEMFD_SEAL_SHRINK | \
+ MEMFD_SEAL_WRITE | \
+ MEMFD_SEAL_EXCLFD)
+
+/* maximum length of memfd names */
+#define MEMFD_MAX_NAMELEN 256
+
+/*
+ * memfd_create - create an anonymous shmem file and return an FD for it
+ * @uname: user-space pointer to the NUL-terminated name of the file
+ * @size: initial file size in bytes; must fit into a non-negative loff_t
+ * @seals: initial seals, any combination of MEMFD_ALL_SEALS bits
+ * @flags: 0 or O_CLOEXEC
+ *
+ * Returns the new FD on success or a negative error code.
+ */
+SYSCALL_DEFINE4(memfd_create,
+		const char __user *, uname, u64, size,
+		u64, seals, u64, flags)
+{
+	struct file *shm;
+	char *name;
+	int fd, r;
+	long len;
+
+	if ((seals & ~(u64)MEMFD_ALL_SEALS) || (flags & ~(u64)O_CLOEXEC) ||
+	    (u64)(loff_t)size != size || (loff_t)size < 0)
+		return -EINVAL;
+
+	/* length includes terminating zero */
+	len = strnlen_user(uname, MEMFD_MAX_NAMELEN);
+	if (len <= 0)
+		return -EFAULT;
+	if (len > MEMFD_MAX_NAMELEN)
+		return -EINVAL;
+
+	name = memdup_user(uname, len);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	/* name may have changed after strnlen_user() returned */
+	name[len - 1] = 0;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0) {
+		r = fd;
+		goto err_name;
+	}
+
+	shm = shmem_file_setup(name, size, 0);
+	if (IS_ERR(shm)) {
+		r = PTR_ERR(shm);
+		goto err_fd;
+	}
+
+	/* file is not exposed until fd_install(), so sealing is safe */
+	SHMEM_I(file_inode(shm))->seals = seals;
+
+	fd_install(fd, shm);
+	kfree(name);
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_name:
+	kfree(name);
+	return r;
+}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment