blob: 77854b192fef669e8546cf59d43991dbbef3a253 [file] [log] [blame]
Thomas Gleixner40b0b3f2019-06-03 07:44:46 +02001// SPDX-License-Identifier: GPL-2.0-only
Eric W. Biederman5033cba2005-06-25 14:57:56 -07002/*
Dave Jones835c34a2007-10-12 21:10:53 -04003 * handle transition of Linux booting another kernel
Eric W. Biederman5033cba2005-06-25 14:57:56 -07004 * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
Eric W. Biederman5033cba2005-06-25 14:57:56 -07005 */
6
7#include <linux/mm.h>
8#include <linux/kexec.h>
9#include <linux/delay.h>
Ken'ichi Ohmichifd59d232007-10-16 23:27:27 -070010#include <linux/numa.h>
Ingo Molnarf43fdad2008-05-12 21:20:43 +020011#include <linux/ftrace.h>
Huang Ying3122c332008-08-15 00:40:26 -070012#include <linux/suspend.h>
Huang Ying92be3d62008-10-31 09:48:08 +080013#include <linux/gfp.h>
Huang Yingfef3a7a2009-03-10 10:56:57 +080014#include <linux/io.h>
Ingo Molnarf43fdad2008-05-12 21:20:43 +020015
Eric W. Biederman5033cba2005-06-25 14:57:56 -070016#include <asm/pgtable.h>
17#include <asm/pgalloc.h>
18#include <asm/tlbflush.h>
19#include <asm/mmu_context.h>
Eric W. Biederman5033cba2005-06-25 14:57:56 -070020#include <asm/apic.h>
Jiang Liu8643e282014-10-27 16:12:04 +080021#include <asm/io_apic.h>
Eric W. Biederman5033cba2005-06-25 14:57:56 -070022#include <asm/cpufeature.h>
Eric W. Biedermane7b47cc2005-07-29 13:01:18 -060023#include <asm/desc.h>
Laura Abbottd1163652017-05-08 15:58:11 -070024#include <asm/set_memory.h>
K.Prasad17f557e2009-06-01 23:46:03 +053025#include <asm/debugreg.h>
Eric W. Biederman5033cba2005-06-25 14:57:56 -070026
Eric W. Biederman5033cba2005-06-25 14:57:56 -070027static void set_gdt(void *newgdt, __u16 limit)
28{
Glauber de Oliveira Costa6b68f012008-01-30 13:31:12 +010029 struct desc_ptr curgdt;
Eric W. Biederman5033cba2005-06-25 14:57:56 -070030
31 /* ia32 supports unaligned loads & stores */
Eric W. Biedermane7b47cc2005-07-29 13:01:18 -060032 curgdt.size = limit;
33 curgdt.address = (unsigned long)newgdt;
Eric W. Biederman5033cba2005-06-25 14:57:56 -070034
Zachary Amsdenf2ab44612005-09-03 15:56:42 -070035 load_gdt(&curgdt);
WANG Cong378fc6e2008-06-24 16:21:18 +010036}
Eric W. Biederman5033cba2005-06-25 14:57:56 -070037
/*
 * Force-reload every segment register with known-good flat kernel
 * selectors.  This refreshes the hidden (descriptor-cache) part of each
 * segment register from the current GDT, so the GDT itself can safely be
 * zapped afterwards (see the comment in machine_kexec()).
 */
static void load_segments(void)
{
/* Stringify the __KERNEL_CS/__KERNEL_DS selector constants for the asm. */
#define __STR(X) #X
#define STR(X) __STR(X)

	__asm__ __volatile__ (
		/* Far jump reloads %cs; "1f" is just the next line. */
		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
		"\t1:\n"
		/* Point all data segment registers at __KERNEL_DS. */
		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%ss\n"
		: : : "eax", "memory");
#undef STR
#undef __STR
}
54
Huang Ying92be3d62008-10-31 09:48:08 +080055static void machine_kexec_free_page_tables(struct kimage *image)
56{
Joerg Roedelca38dc82018-07-25 17:48:03 +020057 free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
Tetsuo Handaa466ef72018-05-09 19:42:20 +090058 image->arch.pgd = NULL;
Huang Ying92be3d62008-10-31 09:48:08 +080059#ifdef CONFIG_X86_PAE
60 free_page((unsigned long)image->arch.pmd0);
Tetsuo Handaa466ef72018-05-09 19:42:20 +090061 image->arch.pmd0 = NULL;
Huang Ying92be3d62008-10-31 09:48:08 +080062 free_page((unsigned long)image->arch.pmd1);
Tetsuo Handaa466ef72018-05-09 19:42:20 +090063 image->arch.pmd1 = NULL;
Huang Ying92be3d62008-10-31 09:48:08 +080064#endif
65 free_page((unsigned long)image->arch.pte0);
Tetsuo Handaa466ef72018-05-09 19:42:20 +090066 image->arch.pte0 = NULL;
Huang Ying92be3d62008-10-31 09:48:08 +080067 free_page((unsigned long)image->arch.pte1);
Tetsuo Handaa466ef72018-05-09 19:42:20 +090068 image->arch.pte1 = NULL;
Huang Ying92be3d62008-10-31 09:48:08 +080069}
70
71static int machine_kexec_alloc_page_tables(struct kimage *image)
72{
Joerg Roedelca38dc82018-07-25 17:48:03 +020073 image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
74 PGD_ALLOCATION_ORDER);
Huang Ying92be3d62008-10-31 09:48:08 +080075#ifdef CONFIG_X86_PAE
76 image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
77 image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
78#endif
79 image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
80 image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
81 if (!image->arch.pgd ||
82#ifdef CONFIG_X86_PAE
83 !image->arch.pmd0 || !image->arch.pmd1 ||
84#endif
85 !image->arch.pte0 || !image->arch.pte1) {
Huang Ying92be3d62008-10-31 09:48:08 +080086 return -ENOMEM;
87 }
88 return 0;
89}
90
/*
 * Install a single 4K mapping of @paddr at @vaddr in the kexec
 * transition page tables rooted at @pgd, wiring in the pre-allocated
 * @pmd (PAE only) and @pte pages for any level not yet present.
 */
static void machine_kexec_page_table_set_one(
	pgd_t *pgd, pmd_t *pmd, pte_t *pte,
	unsigned long vaddr, unsigned long paddr)
{
	p4d_t *p4d;
	pud_t *pud;

	pgd += pgd_index(vaddr);
#ifdef CONFIG_X86_PAE
	/* With PAE, hook the caller-supplied pmd page into the pgd entry. */
	if (!(pgd_val(*pgd) & _PAGE_PRESENT))
		set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
	/* p4d and pud levels are folded on 32-bit; walk down to the pmd. */
	p4d = p4d_offset(pgd, vaddr);
	pud = pud_offset(p4d, vaddr);
	pmd = pmd_offset(pud, vaddr);
	if (!(pmd_val(*pmd) & _PAGE_PRESENT))
		set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
	pte = pte_offset_kernel(pmd, vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
}
111
/*
 * Map the control page twice in the transition page tables: once at its
 * kernel virtual address and once at its physical address (identity
 * mapped), so the relocation code keeps executing across the switch to
 * these page tables.
 */
static void machine_kexec_prepare_page_tables(struct kimage *image)
{
	void *control_page;
	pmd_t *pmd = NULL;	/* stays NULL and is unused without PAE */

	control_page = page_address(image->control_code_page);
#ifdef CONFIG_X86_PAE
	pmd = image->arch.pmd0;
#endif
	/* Mapping 1: kernel virtual address -> physical address. */
	machine_kexec_page_table_set_one(
		image->arch.pgd, pmd, image->arch.pte0,
		(unsigned long)control_page, __pa(control_page));
#ifdef CONFIG_X86_PAE
	pmd = image->arch.pmd1;
#endif
	/* Mapping 2: identity map of the physical address. */
	machine_kexec_page_table_set_one(
		image->arch.pgd, pmd, image->arch.pte1,
		__pa(control_page), __pa(control_page));
}
131
/*
 * An architecture hook called to validate the
 * proposed image and prepare the control pages
 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
 * have been allocated, but the segments have not
 * yet been copied into the kernel.
 *
 * Do whatever setup is needed on the image and the
 * reboot code buffer to allow us to avoid allocations
 * later.
 *
 * - Make control page executable.
 * - Allocate page tables
 * - Setup page tables
 */
147int machine_kexec_prepare(struct kimage *image)
148{
Huang Ying9868ee62008-10-31 09:48:15 +0800149 int error;
150
H. Peter Anvin583140a2009-11-13 15:28:15 -0800151 set_pages_x(image->control_code_page, 1);
Huang Ying9868ee62008-10-31 09:48:15 +0800152 error = machine_kexec_alloc_page_tables(image);
153 if (error)
154 return error;
155 machine_kexec_prepare_page_tables(image);
156 return 0;
Eric W. Biederman5033cba2005-06-25 14:57:56 -0700157}
158
/*
 * Undo anything left over by machine_kexec_prepare()
 * when an image is freed.
 */
163void machine_kexec_cleanup(struct kimage *image)
164{
H. Peter Anvin583140a2009-11-13 15:28:15 -0800165 set_pages_nx(image->control_code_page, 1);
Huang Ying92be3d62008-10-31 09:48:08 +0800166 machine_kexec_free_page_tables(image);
Eric W. Biederman5033cba2005-06-25 14:57:56 -0700167}
168
169/*
170 * Do not allocate memory (or fail in any way) in machine_kexec().
171 * We are past the point of no return, committed to rebooting now.
172 */
/*
 * Switch to the new kernel: copy the relocation trampoline into the
 * control page, tear down interrupts/descriptors, and jump to it.
 * With CONFIG_KEXEC_JUMP and image->preserve_context, CPU state is
 * saved so execution can resume here afterwards.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;
	/* Trampoline entry point; called via its copy in the control page. */
	asmlinkage unsigned long
		(*relocate_kernel_ptr)(unsigned long indirection_page,
				       unsigned long control_page,
				       unsigned long start_address,
				       unsigned int has_pae,
				       unsigned int preserve_context);

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in second kernel. kexec/kdump
		 * paths already have calls to restore_boot_irq_mode()
		 * in one form or other. kexec jump path also need one.
		 */
		clear_IO_APIC();
		restore_boot_irq_mode();
#endif
	}

	/* Copy the relocation trampoline into the control page. */
	control_page = page_address(image->control_code_page);
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	relocate_kernel_ptr = control_page;
	page_list[PA_CONTROL_PAGE] = __pa(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_PGD] = __pa(image->arch.pgd);

	/* The swap page is only needed for the default (non-crash) path. */
	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things, they have both a
	 * visible and an invisible part. Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * with from a table in memory. At no other time is the
	 * descriptor table in memory accessed.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/*
	 * The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	idt_invalidate(phys_to_virt(0));
	set_gdt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel_ptr((unsigned long)image->head,
					   (unsigned long)page_list,
					   image->start,
					   boot_cpu_has(X86_FEATURE_PAE),
					   image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
	/* Only reached on kexec-jump return with preserved context. */
	if (image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}
Rusty Russell1a3f2392006-09-26 10:52:32 +0200253
/*
 * Record x86_32-specific symbols and config options in the vmcoreinfo
 * note so crash-dump analysis tools can interpret the memory image.
 */
void arch_crash_save_vmcoreinfo(void)
{
#ifdef CONFIG_NUMA
	/* Per-node data array and its maximum length. */
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
#ifdef CONFIG_X86_PAE
	/* Tools need to know the page-table format (PAE vs. classic). */
	VMCOREINFO_CONFIG(X86_PAE);
#endif
}
264