/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/intel_rdt.h>

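/*
 * Per-CPU scratch slot: the 64-bit SYSCALL entry code stashes the user
 * stack pointer here while it switches to the kernel stack (see
 * entry_64.S).
 */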
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
		(void *)regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
		regs->sp, regs->flags);
	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

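	/*
	 * Segment selectors can be read with a plain MOV; the 64-bit
	 * FS/GS bases live in MSRs and are fetched separately below.
	 */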
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
	       es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
	       cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
		       d0, d1, d2);
		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
		       d3, d6, d7);
	}

	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}

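/*
 * Called when the last reference to a dead task goes away.  Nothing is
 * actually freed here on x86-64; this only sanity-checks that the dead
 * task's LDT is gone, since a leftover LDT would mean a teardown bug.
 */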
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		if (dead_task->mm->context.ldt) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt->entries,
				dead_task->mm->context.ldt->size);
			BUG();
		}
#endif
	}
}

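/*
 * Set up the kernel stack of a freshly forked task.  A fork_frame is
 * built at the top of the child's stack so that the first __switch_to()
 * to the child "returns" into ret_from_fork, which either calls the
 * requested kernel-thread function or drops into the child's user-mode
 * pt_regs.
 */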
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p, unsigned long tls)
{
	int err;
	struct pt_regs *childregs;
	struct fork_frame *fork_frame;
	struct inactive_task_frame *frame;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;
	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
	savesegment(fs, p->thread.fsindex);
	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		frame->bx = sp;		/* function */
		frame->r12 = arg;
		return 0;
	}
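	/*
	 * User thread: start from a copy of the parent's registers.
	 * ax is zeroed below so that fork() returns 0 in the child.
	 */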
	frame->bx = 0;
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (in_ia32_syscall())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)tls, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, tls);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	regs->cs = _cs;
	regs->ss = _ss;
	regs->flags = X86_EFLAGS_IF;
	force_iret();
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	unsigned prev_fsindex, prev_gsindex;

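	/*
	 * Save the outgoing task's FPU registers if it used them; the
	 * matching switch_fpu_finish() below loads the incoming task's
	 * state once the rest of the context has been switched.
	 */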
	switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, prev_fsindex);
	savesegment(gs, prev_gsindex);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them, and it must be
	 * done before fpu__restore(), so the TS bit is up to date.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * Switch FS and GS.
	 *
	 * These are even more complicated than DS and ES: they have
	 * 64-bit bases that are controlled by arch_prctl.  The bases
	 * don't necessarily match the selectors, as user code can do
	 * any number of things to cause them to be inconsistent.
	 *
	 * We don't promise to preserve the bases if the selectors are
	 * nonzero.  We also don't promise to preserve the base if the
	 * selector is zero and the base doesn't match whatever was
	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
	 * FSGSBASE instructions are enabled, we'll need to offer
	 * stronger guarantees.)
	 *
	 * As an invariant,
	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
	 * impossible.
	 */
	if (next->fsindex) {
		/* Loading a nonzero value into FS sets the index and base. */
		loadsegment(fs, next->fsindex);
	} else {
		if (next->fsbase) {
			/* Next index is zero but next base is nonzero. */
			if (prev_fsindex)
				loadsegment(fs, 0);
			wrmsrl(MSR_FS_BASE, next->fsbase);
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 */
				loadsegment(fs, __USER_DS);
				loadsegment(fs, 0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_FS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->fsbase || prev_fsindex)
					loadsegment(fs, 0);
			}
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_fsindex)
		prev->fsbase = 0;
	prev->fsindex = prev_fsindex;

	if (next->gsindex) {
		/* Loading a nonzero value into GS sets the index and base. */
		load_gs_index(next->gsindex);
	} else {
		if (next->gsbase) {
			/* Next index is zero but next base is nonzero. */
			if (prev_gsindex)
				load_gs_index(0);
			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
		} else {
			/* Next base and index are both zero. */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				/*
				 * We don't know the previous base and can't
				 * find out without RDMSR.  Forcibly clear it.
				 *
				 * This contains a pointless SWAPGS pair.
				 * Fixing it would involve an explicit check
				 * for Xen or a new pvop.
				 */
				load_gs_index(__USER_DS);
				load_gs_index(0);
			} else {
				/*
				 * If the previous index is zero and ARCH_SET_GS
				 * didn't change the base, then the base is
				 * also zero and we don't need to do anything.
				 */
				if (prev->gsbase || prev_gsindex)
					load_gs_index(0);
			}
		}
	}
	/*
	 * Save the old state and preserve the invariant.
	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
	 * without RDMSR because Intel user code can zero it without telling
	 * us and AMD user code can program any 32-bit value without telling
	 * us.
	 */
	if (prev_gsindex)
		prev->gsbase = 0;
	prev->gsindex = prev_gsindex;

	switch_fpu_finish(next_fpu, cpu);

	/*
	 * Switch the per-CPU current task pointer.
	 */
	this_cpu_write(current_task, next_p);

	/* Reload sp0: the stack the CPU switches to on entry from user mode. */
	load_sp0(tss, next);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

#ifdef CONFIG_XEN
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	intel_rdt_sched_in();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: this overwrites the user's setup.  It should really use
	   two bits, but 64-bit processes have always behaved this way,
	   so it's not too bad.  The main problem is just that 32-bit
	   children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_X32;
		current->personality &= ~READ_IMPLIES_EXEC;
		/* in_compat_syscall() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current->thread.status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_IA32;
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current->thread.status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

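/*
 * Map the given vDSO image at a caller-supplied address.  On success
 * the image size is returned, so callers such as CRIU restore know how
 * much address space the mapping took.
 */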
#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

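/*
 * arch_prctl(2) backend: ARCH_SET_FS/ARCH_SET_GS write a 64-bit segment
 * base (clearing the selector so __switch_to() treats the saved base as
 * authoritative), ARCH_GET_FS/ARCH_GET_GS read it back, and the
 * ARCH_MAP_VDSO_* codes map a vDSO image for checkpoint/restore.
 *
 * Illustrative userspace call (raw syscall, no libc wrapper assumed):
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)base);
 */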
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_MAX)
			return -EPERM;
		cpu = get_cpu();
		task->thread.gsindex = 0;
		task->thread.gsbase = addr;
		if (doit) {
			load_gs_index(0);
			ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_MAX)
			return -EPERM;
		cpu = get_cpu();
		task->thread.fsindex = 0;
		task->thread.fsbase = addr;
		if (doit) {
			/* set the selector to 0 to not confuse __switch_to */
			loadsegment(fs, 0);
			ret = wrmsrl_safe(MSR_FS_BASE, addr);
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fsbase;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		if (doit)
			rdmsrl(MSR_KERNEL_GS_BASE, base);
		else
			base = task->thread.gsbase;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, addr);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, addr);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, addr);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

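/*
 * Report the task's saved user stack pointer (from its pt_regs); used
 * by /proc/<pid>/stat and similar consumers.
 */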
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}