From e4493a4b483f819d3fabc77184212001586bc3d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Arriaga=20Garc=C3=ADa?= Date: Thu, 30 Apr 2020 00:12:38 +0000 Subject: [PATCH] BACKPORT: mm/madvise: introduce process_madvise() syscall: an external memory hinting API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is usecase that System Management Software(SMS) want to give a memory hint like MADV_[COLD|PAGEEOUT] to other processes and in the case of Android, it is the ActivityManagerService. It's similar in spirit to madvise(MADV_WONTNEED), but the information required to make the reclaim decision is not known to the app. Instead, it is known to the centralized userspace daemon(ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the issue, this patch introduces a new syscall process_madvise(2). It uses pidfd of an external process to give the hint. int process_madvise(int pidfd, void *addr, size_t length, int advise, unsigned long flag); Since it could affect other process's address range, only privileged process(CAP_SYS_PTRACE) or something else(e.g., being the same UID) gives it the right to ptrace the process could use it successfully. The flag argument is reserved for future use if we need to extend the API. I think supporting all hints madvise has/will supported/support to process_madvise is rather risky. Because we are not sure all hints make sense from external process and implementation for the hint may rely on the caller being in the current context so it could be error-prone. Thus, I just limited hints as MADV_[COLD|PAGEOUT] in this patch. If someone want to add other hints, we could hear hear the usecase and review it for each hint. It's safer for maintenance rather than introducing a buggy syscall but hard to fix it later. [1] https://developer.android.com/topic/performance/memory" [2] process_getinfo for getting the cookie which is updated whenever vma of process address layout are changed - Daniel Colascione - https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224 [3] anonymous fd which is used for the object(i.e., address range) validation - Michal Hocko - https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/ Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Alexander Duyck Cc: Brian Geffon Cc: Christian Brauner Cc: Daniel Colascione Cc: Jann Horn Cc: Jens Axboe Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Signed-off-by: Andrew Morton (cherry picked from commit ecb8ac8b1f146915aa6b96449b66dd48984caacc) Conflicts: arch/alpha/kernel/syscalls/syscall.tbl arch/ia64/kernel/syscalls/syscall.tbl arch/m68k/kernel/syscalls/syscall.tbl arch/microblaze/kernel/syscalls/syscall.tbl arch/mips/kernel/syscalls/syscall_n32.tbl arch/mips/kernel/syscalls/syscall_n64.tbl arch/parisc/kernel/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl arch/sh/kernel/syscalls/syscall.tbl arch/sparc/kernel/syscalls/syscall.tbl mm/madvise.c 1. __NR_compat_syscalls in arch/arm64/include/asm/unistd.h modified to match latest version to avoid clobbering old number. 2. Dropped syscall.tbl, syscall_n32, syscall_n64 files for architectures not present in current kernel. 3. __NR_process_madvise in arch/arm64/include/asm/unistd32.h modified to match latest mm tree. 4. Added include for uio.h lib which is needed for UIO_FASTIOV and iovec Bug: 153444106 Test: Built kernel Signed-off-by: Edgar Arriaga GarcĂ­a Change-Id: Icfff940abebcf290c3111239989ed40a407cf2a6 Signed-off-by: azrim --- arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 2 + include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 1 + mm/madvise.c | 94 +++++++++++++++++++++++++- 9 files changed, 104 insertions(+), 4 deletions(-) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 98a1e28bc13b..ba46276ff838 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -414,3 +414,4 @@ 397 common statx sys_statx 424 common pidfd_send_signal sys_pidfd_send_signal 434 common pidfd_open sys_pidfd_open +440 common process_madvise sys_process_madvise diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b52bd9696386..4ed8a7dd7b17 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -43,7 +43,7 @@ #define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5) -#define __NR_compat_syscalls 435 +#define __NR_compat_syscalls 441 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index b81507012f45..108bc92f6d5a 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -821,6 +821,8 @@ __SYSCALL(__NR_statx, sys_statx) __SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #define __NR_pidfd_open 434 __SYSCALL(__NR_pidfd_open, sys_pidfd_open) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) /* * Please add new compat syscalls above this comment and update diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 1b6fd1fdb255..f5fd16d16b5d 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -393,3 +393,4 @@ 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl 424 i386 pidfd_send_signal sys_pidfd_send_signal 434 i386 pidfd_open sys_pidfd_open +440 i386 process_madvise sys_process_madvise sys_process_madvise \ No newline at end of file diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9a1523f53bac..6cd0bb788734 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -341,6 +341,7 @@ 332 common statx sys_statx 424 common pidfd_send_signal sys_pidfd_send_signal 434 common pidfd_open sys_pidfd_open +440 common process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e87cc59c817d..7454828397d1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -501,6 +501,8 @@ asmlinkage long sys_munlock(unsigned long start, size_t len); asmlinkage long sys_mlockall(int flags); asmlinkage long sys_munlockall(void); asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); +asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 119661ff7e94..d247846ef612 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -736,9 +736,11 @@ __SYSCALL(__NR_statx, sys_statx) __SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) #define __NR_pidfd_open 434 __SYSCALL(__NR_pidfd_open, sys_pidfd_open) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) #undef __NR_syscalls -#define __NR_syscalls 435 +#define __NR_syscalls 441 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b5189762d275..ffea02950d54 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -202,6 +202,7 @@ cond_syscall(sys_munlockall); cond_syscall(sys_mlock2); cond_syscall(sys_mincore); cond_syscall(sys_madvise); +cond_syscall(process_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); cond_syscall(compat_sys_move_pages); diff --git a/mm/madvise.c b/mm/madvise.c index 039469377c8a..d9b9f988d77f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,8 +27,7 @@ #include #include #include -#include - +#include #include #include "internal.h" @@ -1013,6 +1013,18 @@ madvise_behavior_valid(int behavior) } } +static bool +process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + return true; + default: + return false; + } +} + /* * The madvise(2) system call. * @@ -1060,6 +1072,11 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure hanppens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately. * * return values: * zero - success @@ -1193,3 +1210,76 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } + +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + size_t, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec *iov = iovstack; + struct iov_iter iter; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len; + unsigned int f_flags; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + goto out; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto free_iov; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + if (task->mm != current->mm && + !process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } + + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + + mmput(mm); + return ret; + +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +free_iov: + kfree(iov); +out: + return ret; +} \ No newline at end of file