mirror of
https://github.com/rd-stuffs/msm-4.14.git
synced 2025-02-20 11:45:48 +08:00
BACKPORT: mm/madvise: introduce process_madvise() syscall: an external memory hinting API
There is a use case where System Management Software (SMS) wants to give a memory hint like MADV_[COLD|PAGEOUT] to other processes; in the case of Android, it is the ActivityManagerService. It's similar in spirit to madvise(MADV_DONTNEED), but the information required to make the reclaim decision is not known to the app. Instead, it is known to the centralized userspace daemon (ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the issue, this patch introduces a new syscall, process_madvise(2). It uses the pidfd of an external process to give the hint. ssize_t process_madvise(int pidfd, const struct iovec *vec, size_t vlen, int behavior, unsigned int flags); Since it could affect another process's address range, only a privileged process (CAP_SYS_PTRACE) or one that otherwise has the right to ptrace the target (e.g., by being the same UID) can use it successfully. The flags argument is reserved for future use if we need to extend the API. I think supporting every hint that madvise supports, or will support, in process_madvise is rather risky, because we are not sure all hints make sense from an external process, and the implementation of a hint may rely on the caller being in the current context, so it could be error-prone. Thus, I just limited the hints to MADV_[COLD|PAGEOUT] in this patch. If someone wants to add other hints, we can hear the use case and review it for each hint. That is safer for maintenance than introducing a buggy syscall that is hard to fix later.
[1] https://developer.android.com/topic/performance/memory" [2] process_getinfo for getting the cookie which is updated whenever vma of process address layout are changed - Daniel Colascione - https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224 [3] anonymous fd which is used for the object(i.e., address range) validation - Michal Hocko - https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/ Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org Signed-off-by: Minchan Kim <minchan@kernel.org> Reviewed-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com> Cc: Brian Geffon <bgeffon@google.com> Cc: Christian Brauner <christian@brauner.io> Cc: Daniel Colascione <dancol@google.com> Cc: Jann Horn <jannh@google.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: John Dias <joaodias@google.com> Cc: Kirill Tkhai <ktkhai@virtuozzo.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Oleksandr Natalenko <oleksandr@redhat.com> Cc: Sandeep Patil <sspatil@google.com> Cc: SeongJae Park <sj38.park@gmail.com> Cc: SeongJae Park <sjpark@amazon.de> Cc: Shakeel Butt <shakeelb@google.com> Cc: Sonny Rao <sonnyrao@google.com> Cc: Tim Murray <timmurray@google.com> Cc: <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> (cherry picked from commit ecb8ac8b1f146915aa6b96449b66dd48984caacc) Conflicts: arch/alpha/kernel/syscalls/syscall.tbl arch/ia64/kernel/syscalls/syscall.tbl arch/m68k/kernel/syscalls/syscall.tbl arch/microblaze/kernel/syscalls/syscall.tbl arch/mips/kernel/syscalls/syscall_n32.tbl arch/mips/kernel/syscalls/syscall_n64.tbl arch/parisc/kernel/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl arch/sh/kernel/syscalls/syscall.tbl 
arch/sparc/kernel/syscalls/syscall.tbl mm/madvise.c 1. __NR_compat_syscalls in arch/arm64/include/asm/unistd.h modified to match latest version to avoid clobbering old number. 2. Dropped syscall.tbl, syscall_n32, syscall_n64 files for architectures not present in current kernel. 3. __NR_process_madvise in arch/arm64/include/asm/unistd32.h modified to match latest mm tree. 4. Added include for uio.h lib which is needed for UIO_FASTIOV and iovec Bug: 153444106 Test: Built kernel Signed-off-by: Edgar Arriaga García <edgararriaga@google.com> Change-Id: Icfff940abebcf290c3111239989ed40a407cf2a6 Signed-off-by: azrim <mirzaspc@gmail.com>
This commit is contained in:
parent
fbd0bf9615
commit
e4493a4b48
@ -414,3 +414,4 @@
|
||||
397 common statx sys_statx
|
||||
424 common pidfd_send_signal sys_pidfd_send_signal
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
440 common process_madvise sys_process_madvise
|
||||
|
@ -43,7 +43,7 @@
|
||||
#define __ARM_NR_compat_cacheflush (__ARM_NR_COMPAT_BASE+2)
|
||||
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE+5)
|
||||
|
||||
#define __NR_compat_syscalls 435
|
||||
#define __NR_compat_syscalls 441
|
||||
#endif
|
||||
|
||||
#define __ARCH_WANT_SYS_CLONE
|
||||
|
@ -821,6 +821,8 @@ __SYSCALL(__NR_statx, sys_statx)
|
||||
__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal)
|
||||
#define __NR_pidfd_open 434
|
||||
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
|
||||
#define __NR_process_madvise 440
|
||||
__SYSCALL(__NR_process_madvise, sys_process_madvise)
|
||||
|
||||
/*
|
||||
* Please add new compat syscalls above this comment and update
|
||||
|
@ -393,3 +393,4 @@
|
||||
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
|
||||
424 i386 pidfd_send_signal sys_pidfd_send_signal
|
||||
434 i386 pidfd_open sys_pidfd_open
|
||||
440 i386 process_madvise sys_process_madvise sys_process_madvise
|
@ -341,6 +341,7 @@
|
||||
332 common statx sys_statx
|
||||
424 common pidfd_send_signal sys_pidfd_send_signal
|
||||
434 common pidfd_open sys_pidfd_open
|
||||
440 common process_madvise sys_process_madvise
|
||||
|
||||
#
|
||||
# x32-specific system call numbers start at 512 to avoid cache impact
|
||||
|
@ -501,6 +501,8 @@ asmlinkage long sys_munlock(unsigned long start, size_t len);
|
||||
asmlinkage long sys_mlockall(int flags);
|
||||
asmlinkage long sys_munlockall(void);
|
||||
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
|
||||
asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
|
||||
size_t vlen, int behavior, unsigned int flags);
|
||||
asmlinkage long sys_mincore(unsigned long start, size_t len,
|
||||
unsigned char __user * vec);
|
||||
|
||||
|
@ -736,9 +736,11 @@ __SYSCALL(__NR_statx, sys_statx)
|
||||
__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal)
|
||||
#define __NR_pidfd_open 434
|
||||
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
|
||||
#define __NR_process_madvise 440
|
||||
__SYSCALL(__NR_process_madvise, sys_process_madvise)
|
||||
|
||||
#undef __NR_syscalls
|
||||
#define __NR_syscalls 435
|
||||
#define __NR_syscalls 441
|
||||
|
||||
/*
|
||||
* All syscalls below here should go away really,
|
||||
|
@ -202,6 +202,7 @@ cond_syscall(sys_munlockall);
|
||||
cond_syscall(sys_mlock2);
|
||||
cond_syscall(sys_mincore);
|
||||
cond_syscall(sys_madvise);
|
||||
/* Name the sys_-prefixed entry point, as every other entry in this list does. */
cond_syscall(sys_process_madvise);
|
||||
cond_syscall(sys_mremap);
|
||||
cond_syscall(sys_remap_file_pages);
|
||||
cond_syscall(compat_sys_move_pages);
|
||||
|
94
mm/madvise.c
94
mm/madvise.c
@ -17,6 +17,7 @@
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/fadvise.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
@ -26,8 +27,7 @@
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include <linux/uio.h>
|
||||
#include <asm/tlb.h>
|
||||
|
||||
#include "internal.h"
|
||||
@ -1013,6 +1013,18 @@ madvise_behavior_valid(int behavior)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Report whether @behavior is one of the hints accepted for a foreign
 * address space: only MADV_COLD and MADV_PAGEOUT qualify, every other
 * value is rejected.
 */
static bool
process_madvise_behavior_valid(int behavior)
{
	return behavior == MADV_COLD || behavior == MADV_PAGEOUT;
}
|
||||
|
||||
/*
|
||||
* The madvise(2) system call.
|
||||
*
|
||||
@ -1060,6 +1072,11 @@ madvise_behavior_valid(int behavior)
|
||||
* MADV_DONTDUMP - the application wants to prevent pages in the given range
|
||||
* from being included in its core dump.
|
||||
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
|
||||
* MADV_COLD - the application is not expected to use this memory soon,
|
||||
* deactivate pages in this range so that they can be reclaimed
|
||||
* easily if memory pressure happens.
|
||||
* MADV_PAGEOUT - the application is not expected to use this memory soon,
|
||||
* page out the pages in this range immediately.
|
||||
*
|
||||
* return values:
|
||||
* zero - success
|
||||
@ -1193,3 +1210,76 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
||||
{
|
||||
return do_madvise(current->mm, start, len_in, behavior);
|
||||
}
|
||||
|
||||
/*
 * The process_madvise(2) system call.
 *
 * Applies the madvise hint @behavior to every address range described by the
 * user-supplied iovec array @vec (@vlen entries) in the address space of the
 * process referred to by @pidfd.
 *
 * @flags is reserved for future extension and must be zero.
 *
 * When the target's mm is not the caller's own mm, only the hints accepted by
 * process_madvise_behavior_valid() are permitted.  Access to the target mm is
 * obtained through mm_access() with PTRACE_MODE_ATTACH_FSCREDS.
 *
 * return values:
 *  >= 0    - success: total iovec length minus whatever was left unprocessed
 *  -EINVAL - @flags is non-zero, or @behavior is not allowed for a foreign mm
 *  -ESRCH  - no task is attached to the pid, or mm_access() returned NULL
 *  other   - negative error propagated from import_iovec(), pidfd_get_pid(),
 *            mm_access() or do_madvise()
 *
 * If do_madvise() fails on any range the loop stops and that error is
 * returned, even if earlier ranges were advised successfully.
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	/* Hints are only restricted when acting on somebody else's mm. */
	if (task->mm != current->mm &&
			!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	total_len = iov_iter_count(&iter);

	/* Advise one iovec segment at a time; stop at the first failure. */
	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	if (ret == 0)
		ret = total_len - iov_iter_count(&iter);

	mmput(mm);
	/*
	 * Fall through: the success path must drop the task and pid
	 * references and free the iovec exactly like the error paths do.
	 * Returning here instead would leak all three.
	 */
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	/*
	 * NOTE(review): assumes import_iovec() leaves iov either NULL or
	 * pointing at a heap allocation (never at iovstack) - confirm.
	 */
	kfree(iov);
out:
	return ret;
}
|
Loading…
x
Reference in New Issue
Block a user