Merge branch 'linus' into sched/urgent, to resolve conflicts

Conflicts:
	arch/arm64/kernel/entry.S
	arch/x86/Kconfig
	include/linux/sched/mm.h
	kernel/fork.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc6..c7c1123 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
+#include <linux/sched/mm.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@
void thread_stack_cache_init(void)
{
- thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
- THREAD_SIZE, 0, NULL);
+ thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
+ THREAD_SIZE, THREAD_SIZE, 0, 0,
+ THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
# endif
@@ -390,210 +392,6 @@
}
EXPORT_SYMBOL(free_task);
-static inline void free_signal_struct(struct signal_struct *sig)
-{
- taskstats_tgid_free(sig);
- sched_autogroup_exit(sig);
- /*
- * __mmdrop is not safe to call from softirq context on x86 due to
- * pgd_dtor so postpone it to the async context
- */
- if (sig->oom_mm)
- mmdrop_async(sig->oom_mm);
- kmem_cache_free(signal_cachep, sig);
-}
-
-static inline void put_signal_struct(struct signal_struct *sig)
-{
- if (atomic_dec_and_test(&sig->sigcnt))
- free_signal_struct(sig);
-}
-
-void __put_task_struct(struct task_struct *tsk)
-{
- WARN_ON(!tsk->exit_state);
- WARN_ON(atomic_read(&tsk->usage));
- WARN_ON(tsk == current);
-
- cgroup_free(tsk);
- task_numa_free(tsk);
- security_task_free(tsk);
- exit_creds(tsk);
- delayacct_tsk_free(tsk);
- put_signal_struct(tsk->signal);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
-}
-EXPORT_SYMBOL_GPL(__put_task_struct);
-
-void __init __weak arch_task_cache_init(void) { }
-
-/*
- * set_max_threads
- */
-static void set_max_threads(unsigned int max_threads_suggested)
-{
- u64 threads;
-
- /*
- * The number of threads shall be limited such that the thread
- * structures may only consume a small part of the available memory.
- */
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
- threads = MAX_THREADS;
- else
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
- (u64) THREAD_SIZE * 8UL);
-
- if (threads > max_threads_suggested)
- threads = max_threads_suggested;
-
- max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
-}
-
-#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
-/* Initialized by the architecture: */
-int arch_task_struct_size __read_mostly;
-#endif
-
-void __init fork_init(void)
-{
- int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
-#ifndef ARCH_MIN_TASKALIGN
-#define ARCH_MIN_TASKALIGN 0
-#endif
- int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
-
- /* create a slab on which task_structs can be allocated */
- task_struct_cachep = kmem_cache_create("task_struct",
- arch_task_struct_size, align,
- SLAB_PANIC|SLAB_ACCOUNT, NULL);
-#endif
-
- /* do the arch specific task caches init */
- arch_task_cache_init();
-
- set_max_threads(MAX_THREADS);
-
- init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
- init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
- init_task.signal->rlim[RLIMIT_SIGPENDING] =
- init_task.signal->rlim[RLIMIT_NPROC];
-
- for (i = 0; i < UCOUNT_COUNTS; i++) {
- init_user_ns.ucount_max[i] = max_threads/2;
- }
-
-#ifdef CONFIG_VMAP_STACK
- cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
- NULL, free_vm_stack_cache);
-#endif
-
- lockdep_init_task(&init_task);
-}
-
-int __weak arch_dup_task_struct(struct task_struct *dst,
- struct task_struct *src)
-{
- *dst = *src;
- return 0;
-}
-
-void set_task_stack_end_magic(struct task_struct *tsk)
-{
- unsigned long *stackend;
-
- stackend = end_of_stack(tsk);
- *stackend = STACK_END_MAGIC; /* for overflow detection */
-}
-
-static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
-{
- struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area;
- int err;
-
- if (node == NUMA_NO_NODE)
- node = tsk_fork_get_node(orig);
- tsk = alloc_task_struct_node(node);
- if (!tsk)
- return NULL;
-
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
- goto free_tsk;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
-
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- atomic_set(&tsk->stack_refcount, 1);
-#endif
-
- if (err)
- goto free_stack;
-
-#ifdef CONFIG_SECCOMP
- /*
- * We must handle setting up seccomp filters once we're under
- * the sighand lock in case orig has changed between now and
- * then. Until then, filter must be NULL to avoid messing up
- * the usage counts on the error path calling free_task.
- */
- tsk->seccomp.filter = NULL;
-#endif
-
- setup_thread_stack(tsk, orig);
- clear_user_return_notifier(tsk);
- clear_tsk_need_resched(tsk);
- set_task_stack_end_magic(tsk);
-
-#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_canary();
-#endif
-
- /*
- * One for us, one for whoever does the "release_task()" (usually
- * parent)
- */
- atomic_set(&tsk->usage, 2);
-#ifdef CONFIG_BLK_DEV_IO_TRACE
- tsk->btrace_seq = 0;
-#endif
- tsk->splice_pipe = NULL;
- tsk->task_frag.page = NULL;
- tsk->wake_q.next = NULL;
-
- account_kernel_stack(tsk, 1);
-
- kcov_task_init(tsk);
-
-#ifdef CONFIG_FAULT_INJECTION
- tsk->fail_nth = 0;
-#endif
-
- return tsk;
-
-free_stack:
- free_thread_stack(tsk);
-free_tsk:
- free_task_struct(tsk);
- return NULL;
-}
-
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
@@ -721,7 +519,8 @@
goto out;
}
/* a new mm has just been created */
- retval = arch_dup_mmap(oldmm, mm);
+ arch_dup_mmap(oldmm, mm);
+ retval = 0;
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
@@ -764,11 +563,299 @@
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+ if (unlikely(x))
+ printk(KERN_ALERT "BUG: Bad rss-counter state "
+ "mm:%p idx:%d val:%ld\n", mm, i, x);
+ }
+
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
+ hmm_mm_destroy(mm);
+ mmu_notifier_mm_destroy(mm);
+ check_mm(mm);
+ put_user_ns(mm->user_ns);
+ free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+ /*
+ * The implicit full barrier implied by atomic_dec_and_test() is
+ * required by the membarrier system call before returning to
+ * user-space, after storing to rq->curr.
+ */
+ if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+ __mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm;
+
+ mm = container_of(work, struct mm_struct, async_put_work);
+ __mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+ if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+ INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+ taskstats_tgid_free(sig);
+ sched_autogroup_exit(sig);
+ /*
+ * __mmdrop is not safe to call from softirq context on x86 due to
+ * pgd_dtor so postpone it to the async context
+ */
+ if (sig->oom_mm)
+ mmdrop_async(sig->oom_mm);
+ kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+ if (atomic_dec_and_test(&sig->sigcnt))
+ free_signal_struct(sig);
+}
+
+void __put_task_struct(struct task_struct *tsk)
+{
+ WARN_ON(!tsk->exit_state);
+ WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
+ cgroup_free(tsk);
+ task_numa_free(tsk);
+ security_task_free(tsk);
+ exit_creds(tsk);
+ delayacct_tsk_free(tsk);
+ put_signal_struct(tsk->signal);
+
+ if (!profile_handoff_task(tsk))
+ free_task(tsk);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct);
+
+void __init __weak arch_task_cache_init(void) { }
+
+/*
+ * set_max_threads
+ */
+static void set_max_threads(unsigned int max_threads_suggested)
+{
+ u64 threads;
+
+ /*
+ * The number of threads shall be limited such that the thread
+ * structures may only consume a small part of the available memory.
+ */
+ if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ threads = MAX_THREADS;
+ else
+ threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ (u64) THREAD_SIZE * 8UL);
+
+ if (threads > max_threads_suggested)
+ threads = max_threads_suggested;
+
+ max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
+}
+
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
+static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ /* Fetch thread_struct whitelist for the architecture. */
+ arch_thread_struct_whitelist(offset, size);
+
+ /*
+ * Handle zero-sized whitelist or empty thread_struct, otherwise
+ * adjust offset to position of thread_struct in task_struct.
+ */
+ if (unlikely(*size == 0))
+ *offset = 0;
+ else
+ *offset += offsetof(struct task_struct, thread);
+}
+
+void __init fork_init(void)
+{
+ int i;
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN 0
+#endif
+ int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+ unsigned long useroffset, usersize;
+
+ /* create a slab on which task_structs can be allocated */
+ task_struct_whitelist(&useroffset, &usersize);
+ task_struct_cachep = kmem_cache_create_usercopy("task_struct",
+ arch_task_struct_size, align,
+ SLAB_PANIC|SLAB_ACCOUNT,
+ useroffset, usersize, NULL);
+#endif
+
+ /* do the arch specific task caches init */
+ arch_task_cache_init();
+
+ set_max_threads(MAX_THREADS);
+
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+ init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+ init_task.signal->rlim[RLIMIT_SIGPENDING] =
+ init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++) {
+ init_user_ns.ucount_max[i] = max_threads/2;
+ }
+
+#ifdef CONFIG_VMAP_STACK
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
+ NULL, free_vm_stack_cache);
+#endif
+
+ lockdep_init_task(&init_task);
+}
+
+int __weak arch_dup_task_struct(struct task_struct *dst,
+ struct task_struct *src)
+{
+ *dst = *src;
+ return 0;
+}
+
+void set_task_stack_end_magic(struct task_struct *tsk)
+{
+ unsigned long *stackend;
+
+ stackend = end_of_stack(tsk);
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
+}
+
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
+{
+ struct task_struct *tsk;
+ unsigned long *stack;
+ struct vm_struct *stack_vm_area;
+ int err;
+
+ if (node == NUMA_NO_NODE)
+ node = tsk_fork_get_node(orig);
+ tsk = alloc_task_struct_node(node);
+ if (!tsk)
+ return NULL;
+
+ stack = alloc_thread_stack_node(tsk, node);
+ if (!stack)
+ goto free_tsk;
+
+ stack_vm_area = task_stack_vm_area(tsk);
+
+ err = arch_dup_task_struct(tsk, orig);
+
+ /*
+ * arch_dup_task_struct() clobbers the stack-related fields. Make
+ * sure they're properly initialized before using any stack-related
+ * functions again.
+ */
+ tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ atomic_set(&tsk->stack_refcount, 1);
+#endif
+
+ if (err)
+ goto free_stack;
+
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
+
+ setup_thread_stack(tsk, orig);
+ clear_user_return_notifier(tsk);
+ clear_tsk_need_resched(tsk);
+ set_task_stack_end_magic(tsk);
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ tsk->stack_canary = get_random_canary();
+#endif
+
+ /*
+ * One for us, one for whoever does the "release_task()" (usually
+ * parent)
+ */
+ atomic_set(&tsk->usage, 2);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ tsk->btrace_seq = 0;
+#endif
+ tsk->splice_pipe = NULL;
+ tsk->task_frag.page = NULL;
+ tsk->wake_q.next = NULL;
+
+ account_kernel_stack(tsk, 1);
+
+ kcov_task_init(tsk);
+
+#ifdef CONFIG_FAULT_INJECTION
+ tsk->fail_nth = 0;
+#endif
+
+ return tsk;
+
+free_stack:
+ free_thread_stack(tsk);
+free_tsk:
+ free_task_struct(tsk);
+ return NULL;
+}
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
@@ -858,27 +945,6 @@
return NULL;
}
-static void check_mm(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
-
- if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
- }
-
- if (mm_pgtables_bytes(mm))
- pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
- mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
/*
* Allocate and initialize an mm_struct.
*/
@@ -894,24 +960,6 @@
return mm_init(mm, current, current_user_ns());
}
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
- BUG_ON(mm == &init_mm);
- mm_free_pgd(mm);
- destroy_context(mm);
- hmm_mm_destroy(mm);
- mmu_notifier_mm_destroy(mm);
- check_mm(mm);
- put_user_ns(mm->user_ns);
- free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
static inline void __mmput(struct mm_struct *mm)
{
VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -2224,9 +2272,11 @@
* maximum number of CPU's we can ever have. The cpumask_allocation
* is at the end of the structure, exactly for that reason.
*/
- mm_cachep = kmem_cache_create("mm_struct",
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();