Created
February 28, 2014 18:20
-
-
Save dvdhrm/9276657 to your computer and use it in GitHub Desktop.
memfd_create() syscall
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl | |
index 96bc506..c943b8a 100644 | |
--- a/arch/x86/syscalls/syscall_32.tbl | |
+++ b/arch/x86/syscalls/syscall_32.tbl | |
@@ -359,3 +359,4 @@ | |
350 i386 finit_module sys_finit_module | |
351 i386 sched_setattr sys_sched_setattr | |
352 i386 sched_getattr sys_sched_getattr | |
+353 i386 memfd_create sys_memfd_create | |
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl | |
index a12bddc..e9d56a8 100644 | |
--- a/arch/x86/syscalls/syscall_64.tbl | |
+++ b/arch/x86/syscalls/syscall_64.tbl | |
@@ -322,6 +322,7 @@ | |
313 common finit_module sys_finit_module | |
314 common sched_setattr sys_sched_setattr | |
315 common sched_getattr sys_sched_getattr | |
+316 common memfd_create sys_memfd_create | |
# | |
# x32-specific system call numbers start at 512 to avoid cache impact | |
diff --git a/fs/fcntl.c b/fs/fcntl.c | |
index ef68665..8751511 100644 | |
--- a/fs/fcntl.c | |
+++ b/fs/fcntl.c | |
@@ -21,6 +21,7 @@ | |
#include <linux/rcupdate.h> | |
#include <linux/pid_namespace.h> | |
#include <linux/user_namespace.h> | |
+#include <linux/shmem_fs.h> | |
#include <asm/poll.h> | |
#include <asm/siginfo.h> | |
@@ -326,6 +327,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, | |
case F_GETPIPE_SZ: | |
err = pipe_fcntl(filp, cmd, arg); | |
break; | |
+ case MEMFD_SET_SEALS: | |
+ case MEMFD_GET_SEALS: | |
+ err = shmem_fcntl(filp, cmd, arg); | |
+ break; | |
default: | |
break; | |
} | |
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h | |
index 9d55438..a980939 100644 | |
--- a/include/linux/shmem_fs.h | |
+++ b/include/linux/shmem_fs.h | |
@@ -21,6 +21,7 @@ struct shmem_inode_info { | |
struct list_head swaplist; /* chain of maybes on swap */ | |
struct simple_xattrs xattrs; /* list of xattrs */ | |
struct inode vfs_inode; | |
+ u32 seals; /* memfd seals */ | |
}; | |
struct shmem_sb_info { | |
@@ -56,6 +57,7 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | |
pgoff_t index, gfp_t gfp_mask); | |
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); | |
extern int shmem_unuse(swp_entry_t entry, struct page *page); | |
+extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); | |
static inline struct page *shmem_read_mapping_page( | |
struct address_space *mapping, pgoff_t index) | |
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h | |
index a747a77..a344673 100644 | |
--- a/include/linux/syscalls.h | |
+++ b/include/linux/syscalls.h | |
@@ -791,6 +791,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, | |
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); | |
asmlinkage long sys_eventfd(unsigned int count); | |
asmlinkage long sys_eventfd2(unsigned int count, int flags); | |
+/* uname_ptr refers to user-space memory, hence the __user annotation. */
+asmlinkage long sys_memfd_create(const char __user *uname_ptr, u64 size, u64 seals, u64 flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); | |
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); | |
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, | |
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h | |
index 074b886..761abbb 100644 | |
--- a/include/uapi/linux/fcntl.h | |
+++ b/include/uapi/linux/fcntl.h | |
@@ -28,6 +28,19 @@ | |
#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) | |
/* | |
+ * Set/Get seals of memfds | |
+ */ | |
+#define MEMFD_SET_SEALS (F_LINUX_SPECIFIC_BASE + 9) | |
+#define MEMFD_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) | |
+ | |
+/* | |
+ * Types of memfd seals | |
+ */ | |
+#define MEMFD_SEAL_SHRINK 0x0001 /* prevent file from shrinking */ | |
+#define MEMFD_SEAL_WRITE 0x0002 /* prevent any writes */ | |
+#define MEMFD_SEAL_EXCLFD 0x0004 /* require excl-FD for sealing */ | |
+ | |
+/* | |
* Types of directory notifications that may be requested. | |
*/ | |
#define DN_ACCESS 0x00000001 /* File accessed */ | |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c | |
index 7078052..64f5e29 100644 | |
--- a/kernel/sys_ni.c | |
+++ b/kernel/sys_ni.c | |
@@ -193,6 +193,7 @@ cond_syscall(compat_sys_timerfd_settime); | |
cond_syscall(compat_sys_timerfd_gettime); | |
cond_syscall(sys_eventfd); | |
cond_syscall(sys_eventfd2); | |
+cond_syscall(sys_memfd_create);
/* performance counters: */ | |
cond_syscall(sys_perf_event_open); | |
diff --git a/mm/shmem.c b/mm/shmem.c | |
index 1f18c9d..0968de8 100644 | |
--- a/mm/shmem.c | |
+++ b/mm/shmem.c | |
@@ -66,6 +66,8 @@ static struct vfsmount *shm_mnt; | |
#include <linux/highmem.h> | |
#include <linux/seq_file.h> | |
#include <linux/magic.h> | |
+#include <linux/syscalls.h> | |
+#include <linux/fcntl.h> | |
#include <asm/uaccess.h> | |
#include <asm/pgtable.h> | |
@@ -596,16 +598,21 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); | |
static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |
{ | |
struct inode *inode = dentry->d_inode; | |
+ struct shmem_inode_info *info = SHMEM_I(inode); | |
+ loff_t oldsize = inode->i_size; | |
+ loff_t newsize = attr->ia_size; | |
int error; | |
error = inode_change_ok(inode, attr); | |
if (error) | |
return error; | |
- if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | |
- loff_t oldsize = inode->i_size; | |
- loff_t newsize = attr->ia_size; | |
+ /* protected by i_mutex */ | |
+ if ((attr->ia_valid & ATTR_SIZE) && newsize < oldsize && | |
+ (info->seals & MEMFD_SEAL_SHRINK)) | |
+ return -EPERM; | |
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | |
if (newsize != oldsize) { | |
i_size_write(inode, newsize); | |
inode->i_ctime = inode->i_mtime = CURRENT_TIME; | |
@@ -1354,6 +1361,13 @@ out_nomem: | |
static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |
{ | |
+ struct inode *inode = file_inode(file); | |
+ struct shmem_inode_info *info = SHMEM_I(inode); | |
+ | |
+ /* protected by mmap_sem and owns additional file-reference */ | |
+ if ((info->seals & MEMFD_SEAL_WRITE) && vma->vm_flags & VM_WRITE) | |
+ return -EPERM; | |
+ | |
file_accessed(file); | |
vma->vm_ops = &shmem_vm_ops; | |
return 0; | |
@@ -1433,7 +1447,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |
struct page **pagep, void **fsdata) | |
{ | |
struct inode *inode = mapping->host; | |
+ struct shmem_inode_info *info = SHMEM_I(inode); | |
pgoff_t index = pos >> PAGE_CACHE_SHIFT; | |
+ | |
+ /* i_mutex is held by caller */ | |
+ if (info->seals & MEMFD_SEAL_WRITE) | |
+ return -EPERM; | |
+ | |
return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | |
} | |
@@ -1802,11 +1822,84 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) | |
return offset; | |
} | |
+/**
+ * shmem_fcntl - handle the memfd sealing fcntl() commands on a shmem file
+ * @file: file the fcntl() was issued on
+ * @cmd:  MEMFD_SET_SEALS or MEMFD_GET_SEALS
+ * @arg:  for MEMFD_SET_SEALS, the new seal mask (MEMFD_SEAL_* bits)
+ *
+ * Returns 0 after a successful MEMFD_SET_SEALS and the current seal mask
+ * for MEMFD_GET_SEALS. Returns -EBADF if @file is not a shmem file,
+ * -EINVAL for an unknown @cmd or unknown seal bits in @arg, and -EPERM if
+ * the caller does not hold an exclusive reference (see below).
+ */
+long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	bool shared;
+	long r;
+
+	/*
+	 * SHMEM SEALING
+	 * Sealing allows multiple parties to share a shmem-file but restrict
+	 * access to a specific subset of file operations as long as more than
+	 * one party has access to the inode. This way, mutually untrusted
+	 * parties can share common memory regions with a well-defined policy.
+	 *
+	 * Seals can be set on any shmem-file, but always affect the whole
+	 * underlying inode. Once a seal is set, it may prevent some kinds of
+	 * access to the file. Currently, the following seals are defined:
+	 *   SHRINK: Prevent the file from shrinking
+	 *   WRITE:  Prevent write access to the file
+	 *   EXCLFD: Require exclusive FD for sealing
+	 *
+	 * As we don't require any trust relationship between two parties, we
+	 * cannot allow asynchronous sealing. Instead, sealing is *only* allowed
+	 * if you own an exclusive reference to the shmem-file. This access
+	 * restriction is done per-file by default. If EXCLFD is set, it is done
+	 * on a per-FD level, so you have to own the exclusive FD to the file.
+	 * Note that mmap() also keeps a file-reference, so you cannot change
+	 * seals with EXCLFD if the file is mapped somewhere.
+	 */
+
+	if (file->f_op != &shmem_file_operations)
+		return -EBADF;
+
+	switch (cmd) {
+	case MEMFD_SET_SEALS:
+		/*
+		 * Reject seal bits that are not defined, matching the check
+		 * done on the initial seals in memfd_create(). No lock is
+		 * held yet, so returning directly is safe.
+		 */
+		if (arg & ~(unsigned long)(MEMFD_SEAL_SHRINK |
+					   MEMFD_SEAL_WRITE |
+					   MEMFD_SEAL_EXCLFD))
+			return -EINVAL;
+
+		/*
+		 * - i_lock prevents racing open() calls and new inode-refs
+		 * - mmap_sem prevents racing mmap() calls
+		 * - i_mutex prevents racing write/ftruncate/fallocate/..
+		 */
+		mutex_lock(&inode->i_mutex);
+		down_read(&current->mm->mmap_sem);
+		spin_lock(&inode->i_lock);
+
+		/* any additional dentry or inode reference means "shared" */
+		shared = d_count(dentry) > 1 ||
+			 atomic_read(&inode->i_count) > 1;
+		/* with EXCLFD the struct file itself must be exclusive too */
+		if (info->seals & MEMFD_SEAL_EXCLFD)
+			shared = shared || file_count(file) > 1;
+
+		if (shared) {
+			r = -EPERM;
+		} else {
+			info->seals = (u32)arg;
+			r = 0;
+		}
+
+		/* release in reverse order of acquisition */
+		spin_unlock(&inode->i_lock);
+		up_read(&current->mm->mmap_sem);
+		mutex_unlock(&inode->i_mutex);
+		break;
+	case MEMFD_GET_SEALS:
+		/* read without taking any of the locks used by SET_SEALS */
+		r = info->seals;
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+ | |
static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |
loff_t len) | |
{ | |
struct inode *inode = file_inode(file); | |
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | |
+ struct shmem_inode_info *info = SHMEM_I(inode); | |
struct shmem_falloc shmem_falloc; | |
pgoff_t start, index, end; | |
int error; | |
@@ -1818,6 +1911,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |
loff_t unmap_start = round_up(offset, PAGE_SIZE); | |
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | |
+ /* protected by i_mutex */ | |
+ if (info->seals & MEMFD_SEAL_WRITE) { | |
+ error = -EPERM; | |
+ goto out; | |
+ } | |
+ | |
if ((u64)unmap_end > (u64)unmap_start) | |
unmap_mapping_range(mapping, unmap_start, | |
1 + unmap_end - unmap_start, 0); | |
@@ -3048,3 +3147,72 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | |
#endif | |
} | |
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | |
+ | |
+#define MEMFD_ALL_SEALS (MEMFD_SEAL_SHRINK | \ | |
+ MEMFD_SEAL_WRITE | \ | |
+ MEMFD_SEAL_EXCLFD) | |
+ | |
+/* maximum length of memfd names */ | |
+#define MEMFD_MAX_NAMELEN 256 | |
+ | |
+/**
+ * sys_memfd_create - create a shmem-backed file and return an fd for it
+ * @uname: user-space pointer to the NUL-terminated name of the new file
+ * @size:  size passed to shmem_file_setup(); must round-trip through a
+ *         non-negative loff_t
+ * @seals: initial seal mask; only bits in MEMFD_ALL_SEALS are accepted
+ * @flags: only O_CLOEXEC is accepted
+ *
+ * Returns the new file descriptor on success. Returns -EINVAL for unknown
+ * seals or flags, an out-of-range size, or a name exceeding
+ * MEMFD_MAX_NAMELEN (terminator included); -EFAULT if strnlen_user()
+ * reports no valid length; otherwise the error reported by memdup_user(),
+ * get_unused_fd_flags() or shmem_file_setup().
+ *
+ * NOTE(review): this takes three u64 arguments and is wired up directly in
+ * the i386 syscall table; confirm that 64-bit arguments are passed
+ * correctly on 32-bit ABIs or narrow the argument types.
+ */
+SYSCALL_DEFINE4(memfd_create,
+		const char __user *, uname,
+		u64, size,
+		u64, seals,
+		u64, flags)
+{
+	struct shmem_inode_info *info;
+	struct inode *inode;
+	struct file *shm;
+	char *name;
+	int fd, r;
+	long len;
+
+	if (seals & ~(u64)MEMFD_ALL_SEALS)
+		return -EINVAL;
+	if (flags & ~(u64)O_CLOEXEC)
+		return -EINVAL;
+	/* size must survive the conversion to loff_t and be non-negative */
+	if ((u64)(loff_t)size != size || (loff_t)size < 0)
+		return -EINVAL;
+
+	/* length includes terminating zero */
+	len = strnlen_user(uname, MEMFD_MAX_NAMELEN);
+	if (len <= 0)
+		return -EFAULT;
+	else if (len > MEMFD_MAX_NAMELEN)
+		return -EINVAL;
+
+	name = memdup_user(uname, len);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	/* name may have changed after strnlen_user() returned */
+	name[len - 1] = '\0';
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0) {
+		r = fd;
+		goto err_name;
+	}
+
+	shm = shmem_file_setup(name, size, 0);
+	if (IS_ERR(shm)) {
+		r = PTR_ERR(shm);
+		goto err_fd;
+	}
+
+	/* file is not exposed until fd_install(), so sealing is safe */
+	inode = file_inode(shm);
+	info = SHMEM_I(inode);
+	info->seals = seals;
+
+	fd_install(fd, shm);
+	/* the local copy of the name is freed on success as well */
+	kfree(name);
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_name:
+	kfree(name);
+	return r;
+}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment