author    Michal Marek <mmarek@suse.cz>    2013-08-27 10:44:18 +0200
committer Michal Marek <mmarek@suse.cz>    2013-08-27 10:44:18 +0200
commit    a649fedb28fa48cf81c2f8df40f90fddcdf9a3d1 (patch)
tree      53d552f037b8fb77d2907399e75c58fc81d9c7fa
parent    0e6c421a1eeac8988240e8f6a9a40917ea910ff5 (diff)
parent    960447c5ab289c8d4c09f6d017931fcbed650431 (diff)
Merge branch 'SLE11-SP2' into SLE11-SP3    (rpm-3.0.93-0.8)

Conflicts:
    kernel-source.changes
    series.conf

suse-commit: 70ed288609c90cf64be74c96dac72f13e2067a46
-rw-r--r--    arch/x86/kernel/entry_64.S    209
-rw-r--r--    kernel/printk.c               299
2 files changed, 405 insertions, 103 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2bf18ba02b83..b11e7f43db75 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1558,11 +1558,166 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
+/*
+ * Test if a given stack is an NMI stack or not.
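+ *
+ * \reg must hold the top of the NMI stack and is clobbered by the test;
+ * \stack is the stack pointer to check. Jumps to \nmi_ret if \stack lies
+ * within [\reg - EXCEPTION_STKSZ, \reg], to \normal_ret otherwise.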
+ */
+ .macro test_in_nmi reg stack nmi_ret normal_ret
+ cmpq %\reg, \stack
+ ja \normal_ret
+ subq $EXCEPTION_STKSZ, %\reg
+ cmpq %\reg, \stack
+ jb \normal_ret
+ jmp \nmi_ret
+ .endm
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to repeat_nmi
+ * o Return to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable.
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as our temp variable throughout */
+ pushq_cfi %rdx
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmp $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
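+ /*
+ * 6*8(%rsp) is the top of the NMI stack (just above the 5-word iret
+ * frame and the saved %rdx); 4*8(%rsp) is the interrupted %rsp taken
+ * from that iret frame.
+ */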
+ lea 6*8(%rsp), %rdx
+ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
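+ /* 8(%rsp) is the interrupted RIP (just above the saved %rdx) */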
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMI's stack to jump to repeat_nmi */
+ leaq -6*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 6*8
+ pushq_cfi $__KERNEL_DS
+ pushq_cfi %rdx
+ pushfq_cfi
+ pushq_cfi $__KERNEL_CS
+ pushq_cfi $repeat_nmi
+
+ /* Put stack back */
+ addq $(11*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+ popq_cfi %rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved RIP is used to fix up the copied RIP that a nested
+ * NMI may zero out. The original stack frame and the temp storage
+ * are also used by nested NMIs and cannot be trusted on exit.
+ */
+ /* Set the NMI executing variable on the stack. */
+ pushq_cfi $1
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq_cfi 6*8(%rsp)
+ .endr
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ /* Do not pop rdx, nested NMIs will corrupt it */
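+ /* (the temp %rdx slot is 11 words up: 5 copied + 5 saved + 1 variable) */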
+ movq 11*8(%rsp), %rdx
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception. Repeated NMIs
+ * caused by an exception and nested NMI will start here, and
+ * can still be preempted by another NMI.
+ */
+restart_nmi:
pushq_cfi $-1
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1572,48 +1727,38 @@ ENTRY(nmi)
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
+ /* Clear the NMI executing stack variable */
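+ /* (the variable sits 10 words above %rsp: 5 copied + 5 saved words) */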
+ movq $0, 10*8(%rsp)
jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
END(nmi)
+ /*
+ * If an NMI hit an iret because of an exception or breakpoint,
+ * it can lose its NMI context, and a nested NMI may come in.
+ * In that case, the nested NMI will change the preempted NMI's
+ * stack to jump to here when it does the final iret.
+ */
+repeat_nmi:
+ INTR_FRAME
+ /* Update the stack variable to say we are still in NMI */
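+ /* (%rsp points at the saved iret frame here; the variable is 5 words up) */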
+ movq $1, 5*8(%rsp)
+
+ /* Copy the saved stack frame back to the copy stack */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ jmp restart_nmi
+ CFI_ENDPROC
+end_repeat_nmi:
+
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
diff --git a/kernel/printk.c b/kernel/printk.c
index 4e22436d7c6e..a3595ce39b77 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -107,8 +107,22 @@ static int console_locked, console_suspended;
*/
static DEFINE_SPINLOCK(logbuf_lock);
+/*
+ * nmi_logbuf_lock protects nmi_log_buf, nmi_log_start and nmi_log_end in NMI
+ * context when logbuf_lock is already held, to synchronize NMI contexts which
+ * try to printk at the same time. NEVER EVER take this lock outside of NMI
+ * context. A non-NMI consumer of nmi_log_buf has to take logbuf_lock and be
+ * careful about racing with NMI context (see handle_nmi_delayed_printk).
+ */
+static DEFINE_SPINLOCK(nmi_logbuf_lock);
+
#define LOG_BUF_MASK (log_buf_len-1)
-#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+#define __LOG_BUF(buf, len, idx) (buf[(idx) & ((len)-1)])
+#define LOG_BUF(idx) (__LOG_BUF(log_buf, log_buf_len, idx))
+
+/* Worker to print accumulated data to console when there's too much of it */
+static void printk_worker(struct work_struct *work);
+static DECLARE_WORK(printk_work, printk_worker);
/*
* The indices into log_buf are not constrained to log_buf_len - they
@@ -118,9 +132,8 @@ static unsigned log_start; /* Index into log_buf: next char to be read by syslog
static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
-/* Worker to print accumulated data to console when there's too much of it */
-static void printk_worker(struct work_struct *work);
-static DECLARE_WORK(printk_work, printk_worker);
+static unsigned nmi_log_start; /* Index into nmi_log_buf: next char to be copied to printk ringbuf */
+static unsigned nmi_log_end; /* Index into nmi_log_buf: most-recently-written-char + 1 */
/*
* If exclusive_console is non-NULL then only this console is to be printed to.
@@ -155,7 +168,9 @@ static int console_may_schedule;
static char __log_buf[__LOG_BUF_LEN];
static char *log_buf = __log_buf;
+static char *nmi_log_buf = NULL;
static int log_buf_len = __LOG_BUF_LEN;
+static int nmi_log_buf_len = __LOG_BUF_LEN;
static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
static int saved_console_loglevel = -1;
@@ -179,6 +194,7 @@ void log_buf_kexec_setup(void)
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
+static unsigned long __initdata new_nmi_log_buf_len;
/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
@@ -194,6 +210,33 @@ static int __init log_buf_len_setup(char *str)
}
early_param("log_buf_len", log_buf_len_setup);
+static int __init nmi_log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ if (size)
+ size = roundup_pow_of_two(size);
+ if (size > nmi_log_buf_len)
+ new_nmi_log_buf_len = size;
+
+ return 0;
+}
+early_param("nmi_log_buf_len", nmi_log_buf_len_setup);
+
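+/*
+ * Allocate a buffer either from memblock (when called early, before bootmem
+ * is available) or from bootmem. Returns NULL on allocation failure.
+ */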
+char * __init alloc_log_buf(int early, unsigned len)
+{
+ if (early) {
+ unsigned long mem;
+
+ mem = memblock_alloc(len, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
+ return NULL;
+ return __va(mem);
+ }
+
+ return alloc_bootmem_nopanic(len);
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
@@ -201,20 +244,26 @@ void __init setup_log_buf(int early)
char *new_log_buf;
int free;
+ if (!nmi_log_buf) {
+ unsigned len = (nmi_log_buf_len > new_nmi_log_buf_len) ?
+ nmi_log_buf_len: new_nmi_log_buf_len;
+
+ if (len) {
+ nmi_log_buf = alloc_log_buf(early, len);
+ if (!nmi_log_buf)
+ pr_err("%ld bytes not available for nmi ring buffer\n",
+ len);
+ else {
+ nmi_log_buf_len = len;
+ pr_info("nmi ring buffer: %d\n", len);
+ }
+ }
+ }
+
if (!new_log_buf_len)
return;
- if (early) {
- unsigned long mem;
-
- mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
- if (mem == MEMBLOCK_ERROR)
- return;
- new_log_buf = __va(mem);
- } else {
- new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
- }
-
+ new_log_buf = alloc_log_buf(early, new_log_buf_len);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %ld bytes not available\n",
new_log_buf_len);
@@ -703,6 +752,18 @@ static void emit_log_char(char c)
logged_chars++;
}
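+
+/*
+ * NMI-context variant of emit_log_char(): writes into nmi_log_buf and
+ * publishes the new end index with a write barrier, paired with the
+ * smp_rmb() in handle_nmi_delayed_printk().
+ */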
+static void emit_nmi_log_char(char c)
+{
+ __LOG_BUF(nmi_log_buf, nmi_log_buf_len, nmi_log_end) = c;
+ /*
+ * Make sure that the buffer content is visible before nmi_log_end
+ * is updated, so that a reader accessing the buffer outside of the
+ * lock and seeing the new nmi_log_end also sees up-to-date content.
+ */
+ smp_wmb();
+ nmi_log_end++;
+}
+
/*
* Zap console related locks when oopsing. Only zap at most once
* every 10 seconds, to leave time for slow consoles to print a
@@ -834,7 +895,6 @@ static int console_trylock_for_printk(unsigned int cpu)
retval = 0;
}
}
- printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
return retval;
}
@@ -843,6 +903,7 @@ static const char recursion_bug_msg [] =
static int recursion_bug;
static int new_text_line = 1;
static char printk_buf[1024];
+static char nmi_printk_buf[1024];
int printk_delay_msec __read_mostly;
@@ -858,61 +919,57 @@ static inline void printk_delay(void)
}
}
-asmlinkage int vprintk(const char *fmt, va_list args)
+/*
+ * Called from non-NMI context to move the nmi ring buffer contents into the
+ * regular printk ring buffer.
+ */
+static void handle_nmi_delayed_printk(void)
{
- int printed_len = 0;
- int current_log_level = default_message_loglevel;
- unsigned long flags;
- int this_cpu;
- char *p;
- size_t plen;
- char special;
+ unsigned end_idx, start_idx, idx;
- boot_delay_msec();
- printk_delay();
+ end_idx = ACCESS_ONCE(nmi_log_end);
+ start_idx = ACCESS_ONCE(nmi_log_start);
- preempt_disable();
- /* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
- this_cpu = smp_processor_id();
+ if (likely(end_idx == start_idx))
+ return;
- /*
- * Ouch, printk recursed into itself!
- */
- if (unlikely(printk_cpu == this_cpu)) {
+ spin_lock(&logbuf_lock);
+ for (idx = nmi_log_start; ; idx++) {
/*
- * If a crash is occurring during printk() on this CPU,
- * then try to get the crash message out but make sure
- * we can't deadlock. Otherwise just return to avoid the
- * recursion and return - but flag the recursion so that
- * it can be printed at the next appropriate moment:
+ * nmi_log_end might be updated from NMI context. Make
+ * sure we refetch a fresh value on every loop iteration.
*/
- if (!oops_in_progress) {
- recursion_bug = 1;
- goto out_restore_irqs;
- }
- zap_locks();
- }
+ end_idx = ACCESS_ONCE(nmi_log_end);
+ if (idx == end_idx)
+ break;
- lockdep_off();
- spin_lock(&logbuf_lock);
- printk_cpu = this_cpu;
+ /* Make sure the ring buffer doesn't overflow */
+ if (end_idx - idx > nmi_log_buf_len)
+ idx = end_idx - nmi_log_buf_len;
- if (recursion_bug) {
- recursion_bug = 0;
- strcpy(printk_buf, recursion_bug_msg);
- printed_len = strlen(recursion_bug_msg);
+ smp_rmb();
+ emit_log_char(__LOG_BUF(nmi_log_buf, nmi_log_buf_len, idx));
}
- /* Emit the output into the temporary buffer */
- printed_len += vscnprintf(printk_buf + printed_len,
- sizeof(printk_buf) - printed_len, fmt, args);
+ /* Nobody touches nmi_log_buf except for us and we are locked */
+ nmi_log_start = idx;
+ if (console_trylock_for_printk(smp_processor_id()))
+ console_unlock();
+}
- p = printk_buf;
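+
+/*
+ * Emit an already-formatted message into a ring buffer: the regular log_buf,
+ * or nmi_log_buf when nmi_ring is true (printk from NMI context that could
+ * not take logbuf_lock). Handles the log level prefix and the timestamp.
+ */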
+static int finish_printk(char *msg, int printed_len, bool nmi_ring)
+{
+ int current_log_level = default_message_loglevel;
+ char *msg_start = msg;
+ size_t plen;
+ char special;
+ void (*emit_char)(char c) = (nmi_ring) ? emit_nmi_log_char : emit_log_char;
+
+ /* TODO: new_text_line needs special handling for nmi_ring */
/* Read log level and handle special printk prefix */
- plen = log_prefix(p, &current_log_level, &special);
+ plen = log_prefix(msg, &current_log_level, &special);
if (plen) {
- p += plen;
+ msg += plen;
switch (special) {
case 'c': /* Strip <c> KERN_CONT, continue line */
@@ -922,7 +979,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
plen = 0;
default:
if (!new_text_line) {
- emit_log_char('\n');
+ emit_char('\n');
new_text_line = 1;
}
}
@@ -932,7 +989,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Copy the output into log_buf. If the caller didn't provide
* the appropriate log prefix, we insert them here
*/
- for (; *p; p++) {
+ for (; *msg; msg++) {
if (new_text_line) {
new_text_line = 0;
@@ -941,13 +998,13 @@ asmlinkage int vprintk(const char *fmt, va_list args)
int i;
for (i = 0; i < plen; i++)
- emit_log_char(printk_buf[i]);
+ emit_char(msg_start[i]);
printed_len += plen;
} else {
/* Add log prefix */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
+ emit_char('<');
+ emit_char(current_log_level + '0');
+ emit_char('>');
printed_len += 3;
}
@@ -958,26 +1015,103 @@ asmlinkage int vprintk(const char *fmt, va_list args)
unsigned long long t;
unsigned long nanosec_rem;
- t = cpu_clock(printk_cpu);
+ t = cpu_clock(smp_processor_id());
nanosec_rem = do_div(t, 1000000000);
tlen = sprintf(tbuf, "[%5lu.%06lu] ",
(unsigned long) t,
nanosec_rem / 1000);
for (tp = tbuf; tp < tbuf + tlen; tp++)
- emit_log_char(*tp);
+ emit_char(*tp);
printed_len += tlen;
}
- if (!*p)
+ if (!*msg)
break;
}
- emit_log_char(*p);
- if (*p == '\n')
+ emit_char(*msg);
+ if (*msg == '\n')
new_text_line = 1;
}
+ return printed_len;
+}
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ int printed_len = 0;
+ unsigned long flags;
+ int this_cpu;
+ char *buf = printk_buf;
+ unsigned buf_len = sizeof(printk_buf);
+ bool in_nmi_delayed_printk = false;
+
+ boot_delay_msec();
+ printk_delay();
+
+ preempt_disable();
+ /* This stops the holder of console_sem just where we want him */
+ raw_local_irq_save(flags);
+ this_cpu = smp_processor_id();
+
+ /*
+ * Ouch, printk recursed into itself!
+ */
+ if (!in_nmi() && unlikely(printk_cpu == this_cpu)) {
+ /*
+ * If a crash is occurring during printk() on this CPU,
+ * then try to get the crash message out but make sure
+ * we can't deadlock. Otherwise just return to avoid the
+ * recursion and return - but flag the recursion so that
+ * it can be printed at the next appropriate moment:
+ */
+ if (!oops_in_progress) {
+ recursion_bug = 1;
+ goto out_restore_irqs;
+ }
+ zap_locks();
+ }
+
+ lockdep_off();
+ /*
+ * Make sure we are not going to deadlock when we managed to preempt the
+ * currently running printk from NMI. Copy the current message into the
+ * nmi ring buffer and let the current lock owner print the message once
+ * it is back on the CPU.
+ */
+ if (!spin_trylock(&logbuf_lock)) {
+ if (!in_nmi()) {
+ spin_lock(&logbuf_lock);
+ } else {
+ if (!nmi_log_buf) {
+ lockdep_on();
+ goto out_restore_irqs;
+ }
+ /*
+ * The lock is allowed to be taken only from NMI context
+ * to synchronize NMI printk callers.
+ */
+ spin_lock(&nmi_logbuf_lock);
+ buf = nmi_printk_buf;
+ buf_len = sizeof(nmi_printk_buf);
+ in_nmi_delayed_printk = true;
+ }
+ }
+ if (!in_nmi_delayed_printk) {
+ printk_cpu = this_cpu;
+ if (recursion_bug) {
+ recursion_bug = 0;
+ strcpy(buf, recursion_bug_msg);
+ printed_len = strlen(recursion_bug_msg);
+ }
+ }
+
+ /* Emit the output into the temporary buffer */
+ printed_len += vscnprintf(buf + printed_len,
+ buf_len - printed_len, fmt, args);
+ printed_len = finish_printk(buf, printed_len, in_nmi_delayed_printk);
+
/*
* Try to acquire and then immediately release the
* console semaphore. The release will do all the
@@ -987,9 +1121,32 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
+ *
+ * This whole magic is not allowed from NMI context, as
+ * console_unlock re-takes logbuf_lock and other locks
+ * in its follow-up paths.
*/
- if (console_trylock_for_printk(this_cpu))
- console_unlock();
+ if (!in_nmi_delayed_printk) {
+ printk_cpu = UINT_MAX;
+ if (in_nmi()) {
+ spin_unlock(&logbuf_lock);
+ } else {
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
+
+ /*
+ * We are calling this outside of the lock just to make
+ * sure that the printk which raced with NMI had a
+ * chance to make some progress after it was
+ * interrupted.
+ * Do not try to handle pending NMI messages from NMI as
+ * we would need to take logbuf_lock and we could
+ * deadlock.
+ */
+ handle_nmi_delayed_printk();
+ }
+ } else
+ spin_unlock(&nmi_logbuf_lock);
lockdep_on();
out_restore_irqs: