[packages/kernel] RT variant added

jajcus jajcus at pld-linux.org
Fri Nov 11 18:05:59 CET 2016


commit 1a6e0f06202fd47f1c81421d3927476047c3477a
Author: Jacek Konieczny <j.konieczny at eggsoft.pl>
Date:   Fri Oct 21 14:45:12 2016 +0200

    RT variant added
    
    CONFIG_PREEMPT_RT patch for a fully preemptible kernel, required for
    low latencies.
    
    Note: some patches are disabled when building the rt kernel, as they
    fail to compile with -Werror=incompatible-pointer-types. Most likely
    all of those errors are bugs in the patches themselves and should be
    fixed for the regular kernel as well.
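
    For reference, rt is declared with %bcond_with, so it stays off by
    default and follows the usual rpm conditional-build convention; an RT
    build should be possible with something like:

        rpmbuild -bb --with rt kernel.spec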

 kernel-rt.config |    27 +
 kernel-rt.patch  | 25915 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel.spec      |    26 +-
 3 files changed, 25967 insertions(+), 1 deletion(-)
---
diff --git a/kernel.spec b/kernel.spec
index 23c38d7..a2b628e 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -31,6 +31,8 @@
 
 %bcond_with	vserver		# support for VServer
 
+%bcond_with	rt		# real-time kernel (CONFIG_PREEMPT_RT) for low latencies
+
 %bcond_with	vanilla		# don't include any patches
 %bcond_with	rescuecd	# build kernel for our rescue
 %bcond_with	myown		# build with your own config (kernel-myown.config)
@@ -96,6 +98,9 @@
 %if %{without pae}
 %define		alt_kernel	nopae
 %endif
+%if %{with rt}
+%define		alt_kernel	rt
+%endif
 
 # kernel release (used in filesystem and eventually in uname -r)
 # modules will be looked from /lib/modules/%{kernel_release}
@@ -140,6 +145,7 @@ Source25:	kernel-ia64.config
 
 Source41:	kernel-patches.config
 Source43:	kernel-vserver.config
+Source44:	kernel-rt.config
 
 Source55:	kernel-imq.config
 
@@ -205,6 +211,10 @@ Patch146:	kernel-aufs4+vserver.patch
 # Show normal colors in menuconfig with ncurses ABI 6
 Patch250:	kernel-fix_256colors_menuconfig.patch
 
+# https://rt.wiki.kernel.org/
+# https://www.kernel.org/pub/linux/kernel/projects/rt/4.8/patch-4.8.6-rt5.patch.xz
+Patch500:	kernel-rt.patch
+
 Patch2000:	kernel-small_fixes.patch
 Patch2001:	kernel-pwc-uncompress.patch
 Patch2003:	kernel-regressions.patch
@@ -382,6 +392,7 @@ BuildRoot:	%{tmpdir}/%{name}-%{version}-root-%(id -u -n)
 %{?with_fbcondecor:Fbsplash/fbcondecor - enabled }\
 %{?with_nfsroot:Root on NFS - enabled}\
 %{?with_vserver:Linux-VServer - %{vserver_patch}}\
+%{?with_rt:CONFIG_PREEMPT_RT - enabled}\
 
 %define Features %(echo "%{__features}" | sed '/^$/d')
 
@@ -649,7 +660,10 @@ cd linux-%{basever}
 #
 
 # kernel-pom-ng-IPV4OPTSSTRIP.patch
+%if %{without rt}
+# fails on -Werror=incompatible-pointer-types
 %patch10 -p1
+%endif
 
 # kernel-owner-xid.patch
 %if %{with vserver}
@@ -666,8 +680,11 @@ cd linux-%{basever}
 %patch50 -p1
 %endif
 
+%if %{without rt}
+# fails on -Werror=incompatible-pointer-types
 %patch55 -p1
 %patch56 -p1
+%endif
 
 # kernel-rndis_host-wm5.patch
 %patch59 -p1
@@ -691,6 +708,11 @@ cd linux-%{basever}
 %patch7000 -p1
 %endif
 
+%if %{with rt}
+%patch500 -p1
+rm -f localversion-rt
+%endif
+
 # apparmor
 %patch5000 -p1
 
@@ -729,7 +751,6 @@ EOF
 
 RescueConfig() {
 	set -x
-	cat <<-EOCONFIG > $1
 		# CONFIG_SOUND is not set
 		# CONFIG_AUDIT is not set
 		# CONFIG_TR is not set
@@ -895,6 +916,9 @@ EOCONFIG
 %if %{with vserver}
 		%{SOURCE43} \
 %endif
+%if %{with rt}
+		%{SOURCE44} \
+%endif
 		%{SOURCE41} %{?0:patches} \
 %endif
 		%{SOURCE20} \
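
A quick sanity check on a booted rt kernel (a sketch only; the exact
release string depends on kernel_release, and /proc/config.gz requires
CONFIG_IKCONFIG_PROC):

	uname -r					# release should carry the "rt" alt_kernel suffix
	zgrep CONFIG_PREEMPT_RT_FULL= /proc/config.gz	# expect CONFIG_PREEMPT_RT_FULL=y
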
diff --git a/kernel-rt.config b/kernel-rt.config
new file mode 100644
index 0000000..98633d5
--- /dev/null
+++ b/kernel-rt.config
@@ -0,0 +1,27 @@
+AUFS_FS all=n
+DEBUG_PREEMPT all=y
+HWLAT_DETECTOR all=m
+HZ_1000 all=y
+HZ_100 all=n
+HZ_250 all=n
+HZ_300 all=n
+HZ all=1000
+MISSED_TIMER_OFFSETS_HIST all=y
+PREEMPT all=y
+PREEMPT_COUNT all=y
+PREEMPT_LAZY all=y
+PREEMPT__LL all=n
+PREEMPT_NONE all=n
+PREEMPT_NOTIFIERS all=y
+PREEMPT_OFF_HIST all=y
+PREEMPT_RCU all=y
+PREEMPT_RTB all=n
+PREEMPT_RT_BASE all=y
+PREEMPT_RT_FULL all=y
+PREEMPT_TRACER all=y
+PREEMPT_VOLUNTARY all=n
+TASKS_RCU all=y
+TRACER_SNAPSHOT_PER_CPU_SWAP all=y
+TRANSPARENT_HUGEPAGE_ALWAYS all=n
+TREE_RCU all=n
+WAKEUP_LATENCY_HIST all=y
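
These entries use the PLD per-arch config format ("OPTION arch=value",
where "all" applies to every architecture). Assuming the usual config
merge done by kernel.spec, they should end up in the generated .config
roughly as:

	PREEMPT_RT_FULL all=y	->	CONFIG_PREEMPT_RT_FULL=y
	PREEMPT_NONE all=n	->	# CONFIG_PREEMPT_NONE is not set
	HZ all=1000		->	CONFIG_HZ=1000
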
diff --git a/kernel-rt.patch b/kernel-rt.patch
new file mode 100644
index 0000000..b19eb6e
--- /dev/null
+++ b/kernel-rt.patch
@@ -0,0 +1,25915 @@
+diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
+new file mode 100644
+index 000000000000..cb61516483d3
+--- /dev/null
++++ b/Documentation/hwlat_detector.txt
+@@ -0,0 +1,64 @@
++Introduction:
++-------------
++
++The module hwlat_detector is a special purpose kernel module that is used to
++detect large system latencies induced by the behavior of certain underlying
++hardware or firmware, independent of Linux itself. The code was developed
++originally to detect SMIs (System Management Interrupts) on x86 systems,
++however there is nothing x86 specific about this patchset. It was
++originally written for use by the "RT" patch since the Real Time
++kernel is highly latency sensitive.
++
++SMIs are usually not serviced by the Linux kernel, which typically does not
++even know that they are occurring. SMIs are instead set up by BIOS code
++and are serviced by BIOS code, usually for "critical" events such as
++management of thermal sensors and fans. Sometimes though, SMIs are used for
++other tasks and those tasks can spend an inordinate amount of time in the
++handler (sometimes measured in milliseconds). Obviously this is a problem if
++you are trying to keep event service latencies down in the microsecond range.
++
++The hardware latency detector works by hogging all of the cpus for configurable
++amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
++for some period, then looking for gaps in the TSC data. Any gap indicates a
++time when the polling was interrupted and since the machine is stopped and
++interrupts turned off the only thing that could do that would be an SMI.
++
++Note that the SMI detector should *NEVER* be used in a production environment.
++It is intended to be run manually to determine if the hardware platform has a
++problem with long system firmware service routines.
++
++Usage:
++------
++
++Loading the module hwlat_detector passing the parameter "enabled=1" (or by
++setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only
++step required to start the hwlat_detector. It is possible to redefine the
++threshold in microseconds (us) above which latency spikes will be taken
++into account (parameter "threshold=").
++
++Example:
++
++	# modprobe hwlat_detector enabled=1 threshold=100
++
++After the module is loaded, it creates a directory named "hwlat_detector" under
++the debugfs mountpoint, "/debug/hwlat_detector" for this text. It is necessary
++to have debugfs mounted, which might be on /sys/debug on your system.
++
++The /debug/hwlat_detector interface contains the following files:
++
++count			- number of latency spikes observed since last reset
++enable			- a global enable/disable toggle (0/1), resets count
++max			- maximum hardware latency actually observed (usecs)
++sample			- a pipe from which to read current raw sample data
++			  in the format <timestamp> <latency observed usecs>
++			  (can be opened O_NONBLOCK for a single sample)
++threshold		- minimum latency value to be considered (usecs)
++width			- time period to sample with CPUs held (usecs)
++			  must be less than the total window size (enforced)
++window			- total period of sampling, width being inside (usecs)
++
++By default we will set width to 500,000 and window to 1,000,000, meaning that
++we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
++observe any latencies that exceed the threshold (initially 100 usecs),
++then we write to a global sample ring buffer of 8K samples, which is
++consumed by reading from the "sample" (pipe) debugfs file interface.
+diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
+index 3a3b30ac2a75..9e0745cafbd8 100644
+--- a/Documentation/sysrq.txt
++++ b/Documentation/sysrq.txt
+@@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
+ On other - If you know of the key combos for other architectures, please
+            let me know so I can add them to this section.
+ 
+-On all -  write a character to /proc/sysrq-trigger.  e.g.:
+-
++On all -  write a character to /proc/sysrq-trigger, e.g.:
+ 		echo t > /proc/sysrq-trigger
+ 
++On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
++		echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
++	 Send an ICMP echo request with this pattern plus the particular
++	 SysRq command key. Example:
++		# ping -c1 -s57 -p0102030468
++	 will trigger the SysRq-H (help) command.
++
++
+ *  What are the 'command' keys?
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 'b'     - Will immediately reboot the system without syncing or unmounting
+diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
+new file mode 100644
+index 000000000000..6f2aeabf7faa
+--- /dev/null
++++ b/Documentation/trace/histograms.txt
+@@ -0,0 +1,186 @@
++		Using the Linux Kernel Latency Histograms
++
++
++This document gives a short explanation of how to enable, configure and use
++latency histograms. Latency histograms are primarily relevant in the
++context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
++and are used in the quality management of the Linux real-time
++capabilities.
++
++
++* Purpose of latency histograms
++
++A latency histogram continuously accumulates the frequencies of latency
++data. There are two types of histograms
++- potential sources of latencies
++- effective latencies
++
++
++* Potential sources of latencies
++
++Potential sources of latencies are code segments where interrupts,
++preemption or both are disabled (aka critical sections). To create
++histograms of potential sources of latency, the kernel stores the time
++stamp at the start of a critical section, determines the time elapsed
++when the end of the section is reached, and increments the frequency
++counter of that latency value - irrespective of whether any concurrently
++running process is affected by latency or not.
++- Configuration items (in the Kernel hacking/Tracers submenu)
++  CONFIG_INTERRUPT_OFF_LATENCY
++  CONFIG_PREEMPT_OFF_LATENCY
++
++
++* Effective latencies
++
++Effective latencies are actually occurring during wakeup of a process. To
++determine effective latencies, the kernel stores the time stamp when a
++process is scheduled to be woken up, and determines the duration of the
++wakeup time shortly before control is passed over to this process. Note
++that the apparent latency in user space may be somewhat longer, since the
++process may be interrupted after control is passed over to it but before
++the execution in user space takes place. Simply measuring the interval
++between enqueuing and wakeup may also not be appropriate in cases when a
++process is scheduled as a result of a timer expiration. The timer may have
++missed its deadline, e.g. due to disabled interrupts, but this latency
++would not be registered. Therefore, the offsets of missed timers are
++recorded in a separate histogram. If both wakeup latency and missed timer
++offsets are configured and enabled, a third histogram may be enabled that
++records the overall latency as a sum of the timer latency, if any, and the
++wakeup latency. This histogram is called "timerandwakeup".
++- Configuration items (in the Kernel hacking/Tracers submenu)
++  CONFIG_WAKEUP_LATENCY
++  CONFIG_MISSED_TIMER_OFSETS
++
++
++* Usage
++
++The interface to the administration of the latency histograms is located
++in the debugfs file system. To mount it, either enter
++
++mount -t sysfs nodev /sys
++mount -t debugfs nodev /sys/kernel/debug
++
++from shell command line level, or add
++
++nodev	/sys			sysfs	defaults	0 0
++nodev	/sys/kernel/debug	debugfs	defaults	0 0
++
++to the file /etc/fstab. All latency histogram related files are then
++available in the directory /sys/kernel/debug/tracing/latency_hist. A
++particular histogram type is enabled by writing non-zero to the related
++variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
++Select "preemptirqsoff" for the histograms of potential sources of
++latencies and "wakeup" for histograms of effective latencies etc. The
++histogram data - one per CPU - are available in the files
++
++/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
++/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
++/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
++/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
++/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
++/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
++/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
++
++The histograms are reset by writing non-zero to the file "reset" in a
++particular latency directory. To reset all latency data, use
++
++#!/bin/sh
++
++TRACINGDIR=/sys/kernel/debug/tracing
++HISTDIR=$TRACINGDIR/latency_hist
++
++if test -d $HISTDIR
++then
++  cd $HISTDIR
++  for i in `find . | grep /reset$`
++  do
++    echo 1 >$i
++  done
++fi
++
++
++* Data format
++
++Latency data are stored with a resolution of one microsecond. The
++maximum latency is 10,240 microseconds. The data are only valid, if the
++overflow register is empty. Every output line contains the latency in
++microseconds in the first row and the number of samples in the second
++row. To display only lines with a positive latency count, use, for
++example,
++
++grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
++
++#Minimum latency: 0 microseconds.
++#Average latency: 0 microseconds.
++#Maximum latency: 25 microseconds.
++#Total samples: 3104770694
++#There are 0 samples greater or equal than 10240 microseconds
++#usecs	         samples
++    0	      2984486876
++    1	        49843506
++    2	        58219047
++    3	         5348126
++    4	         2187960
++    5	         3388262
++    6	          959289
++    7	          208294
++    8	           40420
++    9	            4485
++   10	           14918
++   11	           18340
++   12	           25052
++   13	           19455
++   14	            5602
++   15	             969
++   16	              47
++   17	              18
++   18	              14
++   19	               1
++   20	               3
++   21	               2
++   22	               5
++   23	               2
++   25	               1
++
++
++* Wakeup latency of a selected process
++
++To only collect wakeup latency data of a particular process, write the
++PID of the requested process to
++
++/sys/kernel/debug/tracing/latency_hist/wakeup/pid
++
++PIDs are not considered, if this variable is set to 0.
++
++
++* Details of the process with the highest wakeup latency so far
++
++Selected data of the process that suffered from the highest wakeup
++latency that occurred in a particular CPU are available in the file
++
++/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
++
++In addition, other relevant system data at the time when the
++latency occurred are given.
++
++The format of the data is (all in one line):
++<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
++<- <PID> <Priority> <Command> <Timestamp>
++
++The value of <Timeroffset> is only relevant in the combined timer
++and wakeup latency recording. In the wakeup recording, it is
++always 0, in the missed_timer_offsets recording, it is the same
++as <Latency>.
++
++When retrospectively searching for the origin of a latency and
++tracing was not enabled, it may be helpful to know the name and
++some basic data of the task that (finally) was switching to the
++late real-time task. In addition to the victim's data, also the
++data of the possible culprit are therefore displayed after the
++"<-" symbol.
++
++Finally, the timestamp of the time when the latency occurred
++in <seconds>.<microseconds> after the most recent system boot
++is provided.
++
++These data are also reset when the wakeup histogram is reset.
+diff --git a/Makefile b/Makefile
+index b249529204cd..5d699d055995 100644
+--- a/Makefile
++++ b/Makefile
+@@ -398,12 +398,12 @@ KBUILD_CPPFLAGS := -D__KERNEL__
+ KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
+ 		   -fno-strict-aliasing -fno-common \
+ 		   -Werror-implicit-function-declaration \
+-		   -Wno-format-security \
++		   -Wno-format-security -fno-PIE \
+ 		   -std=gnu89
+ 
+ KBUILD_AFLAGS_KERNEL :=
+ KBUILD_CFLAGS_KERNEL :=
+-KBUILD_AFLAGS   := -D__ASSEMBLY__
++KBUILD_AFLAGS   := -D__ASSEMBLY__ -fno-PIE
+ KBUILD_AFLAGS_MODULE  := -DMODULE
+ KBUILD_CFLAGS_MODULE  := -DMODULE
+ KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
+diff --git a/arch/Kconfig b/arch/Kconfig
+index fd6e9712af81..085134ee13e9 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -9,6 +9,7 @@ config OPROFILE
+ 	tristate "OProfile system profiling"
+ 	depends on PROFILING
+ 	depends on HAVE_OPROFILE
++	depends on !PREEMPT_RT_FULL
+ 	select RING_BUFFER
+ 	select RING_BUFFER_ALLOW_SWAP
+ 	help
+@@ -52,6 +53,7 @@ config KPROBES
+ config JUMP_LABEL
+        bool "Optimize very unlikely/likely branches"
+        depends on HAVE_ARCH_JUMP_LABEL
++       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
+        help
+          This option enables a transparent branch optimization that
+ 	 makes certain almost-always-true or almost-always-false branch
+diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
+index a9c4e48bb7ec..6eefe4f32302 100644
+--- a/arch/arm/Kconfig
++++ b/arch/arm/Kconfig
+@@ -36,7 +36,7 @@ config ARM
+ 	select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
+ 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
+ 	select HAVE_ARCH_HARDENED_USERCOPY
+-	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
++	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
+ 	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
+ 	select HAVE_ARCH_MMAP_RND_BITS if MMU
+ 	select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
+@@ -75,6 +75,7 @@ config ARM
+ 	select HAVE_PERF_EVENTS
+ 	select HAVE_PERF_REGS
+ 	select HAVE_PERF_USER_STACK_DUMP
++	select HAVE_PREEMPT_LAZY
+ 	select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
+ 	select HAVE_REGS_AND_STACK_ACCESS_API
+ 	select HAVE_SYSCALL_TRACEPOINTS
+diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
+index 12ebfcc1d539..c962084605bc 100644
+--- a/arch/arm/include/asm/switch_to.h
++++ b/arch/arm/include/asm/switch_to.h
+@@ -3,6 +3,13 @@
+ 
+ #include <linux/thread_info.h>
+ 
++#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
++void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
++#else
++static inline void
++switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
++#endif
++
+ /*
+  * For v7 SMP cores running a preemptible kernel we may be pre-empted
+  * during a TLB maintenance operation, so execute an inner-shareable dsb
+@@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
+ #define switch_to(prev,next,last)					\
+ do {									\
+ 	__complete_pending_tlbi();					\
++	switch_kmaps(prev, next);					\
+ 	last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));	\
+ } while (0)
+ 
+diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
+index 776757d1604a..1f36a4eccc72 100644
+--- a/arch/arm/include/asm/thread_info.h
++++ b/arch/arm/include/asm/thread_info.h
+@@ -49,6 +49,7 @@ struct cpu_context_save {
+ struct thread_info {
+ 	unsigned long		flags;		/* low level flags */
+ 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
++	int			preempt_lazy_count; /* 0 => preemptable, <0 => bug */
+ 	mm_segment_t		addr_limit;	/* address limit */
+ 	struct task_struct	*task;		/* main task structure */
+ 	__u32			cpu;		/* cpu */
+@@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
+ #define TIF_SYSCALL_TRACE	4	/* syscall trace active */
+ #define TIF_SYSCALL_AUDIT	5	/* syscall auditing active */
+ #define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
+-#define TIF_SECCOMP		7	/* seccomp syscall filtering active */
++#define TIF_SECCOMP		8	/* seccomp syscall filtering active */
++#define TIF_NEED_RESCHED_LAZY	7
+ 
+ #define TIF_NOHZ		12	/* in adaptive nohz mode */
+ #define TIF_USING_IWMMXT	17
+@@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
+ #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
+ #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+ #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
++#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
+ #define _TIF_UPROBE		(1 << TIF_UPROBE)
+ #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+ #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
+@@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
+  * Change these and you break ASM code in entry-common.S
+  */
+ #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+-				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
++				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
++				 _TIF_NEED_RESCHED_LAZY)
+ 
+ #endif /* __KERNEL__ */
+ #endif /* __ASM_ARM_THREAD_INFO_H */
+diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
+index 608008229c7d..3866da3f7bb7 100644
+--- a/arch/arm/kernel/asm-offsets.c
++++ b/arch/arm/kernel/asm-offsets.c
+@@ -65,6 +65,7 @@ int main(void)
+   BLANK();
+   DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
+   DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
++  DEFINE(TI_PREEMPT_LAZY,	offsetof(struct thread_info, preempt_lazy_count));
+   DEFINE(TI_ADDR_LIMIT,		offsetof(struct thread_info, addr_limit));
+   DEFINE(TI_TASK,		offsetof(struct thread_info, task));
+   DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
+diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
+index 9f157e7c51e7..468e224d76aa 100644
+--- a/arch/arm/kernel/entry-armv.S
++++ b/arch/arm/kernel/entry-armv.S
+@@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
+ 
+ #ifdef CONFIG_PREEMPT
+ 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
+-	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
+ 	teq	r8, #0				@ if preempt count != 0
++	bne	1f				@ return from exeption
++	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
++	tst	r0, #_TIF_NEED_RESCHED		@ if NEED_RESCHED is set
++	blne	svc_preempt			@ preempt!
++
++	ldr	r8, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
++	teq	r8, #0				@ if preempt lazy count != 0
+ 	movne	r0, #0				@ force flags to 0
+-	tst	r0, #_TIF_NEED_RESCHED
++	tst	r0, #_TIF_NEED_RESCHED_LAZY
+ 	blne	svc_preempt
++1:
+ #endif
+ 
+ 	svc_exit r5, irq = 1			@ return from exception
+@@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
+ 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
+ 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
+ 	tst	r0, #_TIF_NEED_RESCHED
++	bne	1b
++	tst	r0, #_TIF_NEED_RESCHED_LAZY
+ 	reteq	r8				@ go again
+-	b	1b
++	ldr	r0, [tsk, #TI_PREEMPT_LAZY]	@ get preempt lazy count
++	teq	r0, #0				@ if preempt lazy count != 0
++	beq	1b
++	ret	r8				@ go again
++
+ #endif
+ 
+ __und_fault:
+diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
+index 10c3283d6c19..8872937862cc 100644
+--- a/arch/arm/kernel/entry-common.S
++++ b/arch/arm/kernel/entry-common.S
+@@ -36,7 +36,9 @@
+  UNWIND(.cantunwind	)
+ 	disable_irq_notrace			@ disable interrupts
+ 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
+-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
++	tst	r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
++	bne	fast_work_pending
++	tst	r1, #_TIF_SECCOMP
+ 	bne	fast_work_pending
+ 
+ 	/* perform architecture specific actions before user return */
+@@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
+ 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
+ 	disable_irq_notrace			@ disable interrupts
+ 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
+-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
++	tst	r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
++	bne 	do_slower_path
++	tst	r1, #_TIF_SECCOMP
+ 	beq	no_work_pending
++do_slower_path:
+  UNWIND(.fnend		)
+ ENDPROC(ret_fast_syscall)
+ 
+diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
+index 612eb530f33f..cd3006dc1fd3 100644
+--- a/arch/arm/kernel/process.c
++++ b/arch/arm/kernel/process.c
+@@ -323,6 +323,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
+ }
+ 
+ #ifdef CONFIG_MMU
++/*
++ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
++ * initialized by pgtable_page_ctor() then a coredump of the vector page will
++ * fail.
++ */
++static int __init vectors_user_mapping_init_page(void)
++{
++	struct page *page;
++	unsigned long addr = 0xffff0000;
++	pgd_t *pgd;
++	pud_t *pud;
++	pmd_t *pmd;
++
++	pgd = pgd_offset_k(addr);
++	pud = pud_offset(pgd, addr);
++	pmd = pmd_offset(pud, addr);
++	page = pmd_page(*(pmd));
++
++	pgtable_page_ctor(page);
++
++	return 0;
++}
++late_initcall(vectors_user_mapping_init_page);
++
+ #ifdef CONFIG_KUSER_HELPERS
+ /*
+  * The vectors page is always readable from user space for the
+diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
+index 7b8f2141427b..96541e00b74a 100644
+--- a/arch/arm/kernel/signal.c
++++ b/arch/arm/kernel/signal.c
+@@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
+ 	 */
+ 	trace_hardirqs_off();
+ 	do {
+-		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
++		if (likely(thread_flags & (_TIF_NEED_RESCHED |
++					   _TIF_NEED_RESCHED_LAZY))) {
+ 			schedule();
+ 		} else {
+ 			if (unlikely(!user_mode(regs)))
+diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
+index 861521606c6d..e5ca865d321b 100644
+--- a/arch/arm/kernel/smp.c
++++ b/arch/arm/kernel/smp.c
+@@ -234,8 +234,6 @@ int __cpu_disable(void)
+ 	flush_cache_louis();
+ 	local_flush_tlb_all();
+ 
+-	clear_tasks_mm_cpumask(cpu);
+-
+ 	return 0;
+ }
+ 
+@@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
+ 		pr_err("CPU%u: cpu didn't die\n", cpu);
+ 		return;
+ 	}
++
++	clear_tasks_mm_cpumask(cpu);
++
+ 	pr_notice("CPU%u: shutdown\n", cpu);
+ 
+ 	/*
+diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
+index 0bee233fef9a..314cfb232a63 100644
+--- a/arch/arm/kernel/unwind.c
++++ b/arch/arm/kernel/unwind.c
+@@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
+ static const struct unwind_idx *__origin_unwind_idx;
+ extern const struct unwind_idx __stop_unwind_idx[];
+ 
+-static DEFINE_SPINLOCK(unwind_lock);
++static DEFINE_RAW_SPINLOCK(unwind_lock);
+ static LIST_HEAD(unwind_tables);
+ 
+ /* Convert a prel31 symbol to an absolute address */
+@@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
+ 		/* module unwind tables */
+ 		struct unwind_table *table;
+ 
+-		spin_lock_irqsave(&unwind_lock, flags);
++		raw_spin_lock_irqsave(&unwind_lock, flags);
+ 		list_for_each_entry(table, &unwind_tables, list) {
+ 			if (addr >= table->begin_addr &&
+ 			    addr < table->end_addr) {
+@@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
+ 				break;
+ 			}
+ 		}
+-		spin_unlock_irqrestore(&unwind_lock, flags);
++		raw_spin_unlock_irqrestore(&unwind_lock, flags);
+ 	}
+ 
+ 	pr_debug("%s: idx = %p\n", __func__, idx);
+@@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
+ 	tab->begin_addr = text_addr;
+ 	tab->end_addr = text_addr + text_size;
+ 
+-	spin_lock_irqsave(&unwind_lock, flags);
++	raw_spin_lock_irqsave(&unwind_lock, flags);
+ 	list_add_tail(&tab->list, &unwind_tables);
+-	spin_unlock_irqrestore(&unwind_lock, flags);
++	raw_spin_unlock_irqrestore(&unwind_lock, flags);
+ 
+ 	return tab;
+ }
+@@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
+ 	if (!tab)
+ 		return;
+ 
+-	spin_lock_irqsave(&unwind_lock, flags);
++	raw_spin_lock_irqsave(&unwind_lock, flags);
+ 	list_del(&tab->list);
+-	spin_unlock_irqrestore(&unwind_lock, flags);
++	raw_spin_unlock_irqrestore(&unwind_lock, flags);
+ 
+ 	kfree(tab);
+ }
+diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
+index c94b90d43772..244dde72018a 100644
+--- a/arch/arm/kvm/arm.c
++++ b/arch/arm/kvm/arm.c
+@@ -584,7 +584,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ 		 * involves poking the GIC, which must be done in a
+ 		 * non-preemptible context.
+ 		 */
+-		preempt_disable();
++		migrate_disable();
+ 		kvm_pmu_flush_hwstate(vcpu);
+ 		kvm_timer_flush_hwstate(vcpu);
+ 		kvm_vgic_flush_hwstate(vcpu);
+@@ -605,7 +605,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ 			kvm_pmu_sync_hwstate(vcpu);
+ 			kvm_timer_sync_hwstate(vcpu);
+ 			kvm_vgic_sync_hwstate(vcpu);
+-			preempt_enable();
++			migrate_enable();
+ 			continue;
+ 		}
+ 
+@@ -661,7 +661,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ 
+ 		kvm_vgic_sync_hwstate(vcpu);
+ 
+-		preempt_enable();
++		migrate_enable();
+ 
+ 		ret = handle_exit(vcpu, run, ret);
+ 	}
+diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
+index 98ffe1e62ad5..df9769ddece5 100644
+--- a/arch/arm/mach-exynos/platsmp.c
++++ b/arch/arm/mach-exynos/platsmp.c
+@@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
+ 	return (void __iomem *)(S5P_VA_SCU);
+ }
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ static void exynos_secondary_init(unsigned int cpu)
+ {
+@@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
+@@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * Set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * The secondary processor is waiting to be released from
+@@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 
+ 		if (timeout == 0) {
+ 			printk(KERN_ERR "cpu1 power enable failed");
+-			spin_unlock(&boot_lock);
++			raw_spin_unlock(&boot_lock);
+ 			return -ETIMEDOUT;
+ 		}
+ 	}
+@@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * calibrations, then wait for it to finish
+ 	 */
+ fail:
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return pen_release != -1 ? ret : 0;
+ }
+diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
+index 4b653a8cb75c..b03d5a922cb1 100644
+--- a/arch/arm/mach-hisi/platmcpm.c
++++ b/arch/arm/mach-hisi/platmcpm.c
+@@ -61,7 +61,7 @@
+ 
+ static void __iomem *sysctrl, *fabric;
+ static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ static u32 fabric_phys_addr;
+ /*
+  * [0]: bootwrapper physical address
+@@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
+ 	if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
+ 		return -EINVAL;
+ 
+-	spin_lock_irq(&boot_lock);
++	raw_spin_lock_irq(&boot_lock);
+ 
+ 	if (hip04_cpu_table[cluster][cpu])
+ 		goto out;
+@@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
+ 
+ out:
+ 	hip04_cpu_table[cluster][cpu]++;
+-	spin_unlock_irq(&boot_lock);
++	raw_spin_unlock_irq(&boot_lock);
+ 
+ 	return 0;
+ }
+@@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
+ 	cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+ 	cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+ 
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 	hip04_cpu_table[cluster][cpu]--;
+ 	if (hip04_cpu_table[cluster][cpu] == 1) {
+ 		/* A power_up request went ahead of us. */
+-		spin_unlock(&boot_lock);
++		raw_spin_unlock(&boot_lock);
+ 		return;
+ 	} else if (hip04_cpu_table[cluster][cpu] > 1) {
+ 		pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
+@@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
+ 	}
+ 
+ 	last_man = hip04_cluster_is_down(cluster);
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 	if (last_man) {
+ 		/* Since it's Cortex A15, disable L2 prefetching. */
+ 		asm volatile(
+@@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
+ 	       cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
+ 
+ 	count = TIMEOUT_MSEC / POLL_MSEC;
+-	spin_lock_irq(&boot_lock);
++	raw_spin_lock_irq(&boot_lock);
+ 	for (tries = 0; tries < count; tries++) {
+ 		if (hip04_cpu_table[cluster][cpu])
+ 			goto err;
+@@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
+ 		data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
+ 		if (data & CORE_WFI_STATUS(cpu))
+ 			break;
+-		spin_unlock_irq(&boot_lock);
++		raw_spin_unlock_irq(&boot_lock);
+ 		/* Wait for clean L2 when the whole cluster is down. */
+ 		msleep(POLL_MSEC);
+-		spin_lock_irq(&boot_lock);
++		raw_spin_lock_irq(&boot_lock);
+ 	}
+ 	if (tries >= count)
+ 		goto err;
+@@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
+ 		goto err;
+ 	if (hip04_cluster_is_down(cluster))
+ 		hip04_set_snoop_filter(cluster, 0);
+-	spin_unlock_irq(&boot_lock);
++	raw_spin_unlock_irq(&boot_lock);
+ 	return 1;
+ err:
+-	spin_unlock_irq(&boot_lock);
++	raw_spin_unlock_irq(&boot_lock);
+ 	return 0;
+ }
+ #endif
+diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
+index b4de3da6dffa..b52893319d75 100644
+--- a/arch/arm/mach-omap2/omap-smp.c
++++ b/arch/arm/mach-omap2/omap-smp.c
+@@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
+ 	.startup_addr = omap5_secondary_startup,
+ };
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ void __iomem *omap4_get_scu_base(void)
+ {
+@@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
+@@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * Set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * Update the AuxCoreBoot0 with boot state for secondary core.
+@@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * Now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return 0;
+ }
+diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
+index 0875b99add18..18b6d98d2581 100644
+--- a/arch/arm/mach-prima2/platsmp.c
++++ b/arch/arm/mach-prima2/platsmp.c
+@@ -22,7 +22,7 @@
+ 
+ static void __iomem *clk_base;
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ static void sirfsoc_secondary_init(unsigned int cpu)
+ {
+@@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ static const struct of_device_id clk_ids[]  = {
+@@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	/* make sure write buffer is drained */
+ 	mb();
+ 
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * The secondary processor is waiting to be released from
+@@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return pen_release != -1 ? -ENOSYS : 0;
+ }
+diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
+index 5494c9e0c909..e8ce157d3548 100644
+--- a/arch/arm/mach-qcom/platsmp.c
++++ b/arch/arm/mach-qcom/platsmp.c
+@@ -46,7 +46,7 @@
+ 
+ extern void secondary_startup_arm(void);
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+ static void qcom_cpu_die(unsigned int cpu)
+@@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ static int scss_release_secondary(unsigned int cpu)
+@@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
+ 	 * set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * Send the secondary CPU a soft interrupt, thereby causing
+@@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
+ 	 * now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return ret;
+ }
+diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
+index 8d1e2d551786..7fa56cc78118 100644
+--- a/arch/arm/mach-spear/platsmp.c
++++ b/arch/arm/mach-spear/platsmp.c
+@@ -32,7 +32,7 @@ static void write_pen_release(int val)
+ 	sync_cache_w(&pen_release);
+ }
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
+ 
+@@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
+@@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * The secondary processor is waiting to be released from
+@@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return pen_release != -1 ? -ENOSYS : 0;
+ }
+diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
+index ea5a2277ee46..b988e081ac79 100644
+--- a/arch/arm/mach-sti/platsmp.c
++++ b/arch/arm/mach-sti/platsmp.c
+@@ -35,7 +35,7 @@ static void write_pen_release(int val)
+ 	sync_cache_w(&pen_release);
+ }
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ static void sti_secondary_init(unsigned int cpu)
+ {
+@@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
+@@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * The secondary processor is waiting to be released from
+@@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return pen_release != -1 ? -ENOSYS : 0;
+ }
+diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
+index 3a2e678b8d30..3ed1e9ba6a01 100644
+--- a/arch/arm/mm/fault.c
++++ b/arch/arm/mm/fault.c
+@@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
+ 	if (addr < TASK_SIZE)
+ 		return do_page_fault(addr, fsr, regs);
+ 
++	if (interrupts_enabled(regs))
++		local_irq_enable();
++
+ 	if (user_mode(regs))
+ 		goto bad_area;
+ 
+@@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
+ static int
+ do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
+ {
++	if (interrupts_enabled(regs))
++		local_irq_enable();
++
+ 	do_bad_area(addr, fsr, regs);
+ 	return 0;
+ }
+diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
+index d02f8187b1cc..542692dbd40a 100644
+--- a/arch/arm/mm/highmem.c
++++ b/arch/arm/mm/highmem.c
+@@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
+ 	return *ptep;
+ }
+ 
++static unsigned int fixmap_idx(int type)
++{
++	return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
++}
++
+ void *kmap(struct page *page)
+ {
+ 	might_sleep();
+@@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
+ 
+ void *kmap_atomic(struct page *page)
+ {
++	pte_t pte = mk_pte(page, kmap_prot);
+ 	unsigned int idx;
+ 	unsigned long vaddr;
+ 	void *kmap;
+ 	int type;
+ 
+-	preempt_disable();
++	preempt_disable_nort();
+ 	pagefault_disable();
+ 	if (!PageHighMem(page))
+ 		return page_address(page);
+@@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
+ 
+ 	type = kmap_atomic_idx_push();
+ 
+-	idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
++	idx = fixmap_idx(type);
+ 	vaddr = __fix_to_virt(idx);
+ #ifdef CONFIG_DEBUG_HIGHMEM
+ 	/*
+@@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
+ 	 * in place, so the contained TLB flush ensures the TLB is updated
+ 	 * with the new mapping.
+ 	 */
+-	set_fixmap_pte(idx, mk_pte(page, kmap_prot));
++#ifdef CONFIG_PREEMPT_RT_FULL
++	current->kmap_pte[type] = pte;
++#endif
++	set_fixmap_pte(idx, pte);
+ 
+ 	return (void *)vaddr;
+ }
+@@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
+ 
+ 	if (kvaddr >= (void *)FIXADDR_START) {
+ 		type = kmap_atomic_idx();
+-		idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
++		idx = fixmap_idx(type);
+ 
+ 		if (cache_is_vivt())
+ 			__cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
++#ifdef CONFIG_PREEMPT_RT_FULL
++		current->kmap_pte[type] = __pte(0);
++#endif
+ #ifdef CONFIG_DEBUG_HIGHMEM
+ 		BUG_ON(vaddr != __fix_to_virt(idx));
+-		set_fixmap_pte(idx, __pte(0));
+ #else
+ 		(void) idx;  /* to kill a warning */
+ #endif
++		set_fixmap_pte(idx, __pte(0));
+ 		kmap_atomic_idx_pop();
+ 	} else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
+ 		/* this address was obtained through kmap_high_get() */
+ 		kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ 	}
+ 	pagefault_enable();
+-	preempt_enable();
++	preempt_enable_nort();
+ }
+ EXPORT_SYMBOL(__kunmap_atomic);
+ 
+ void *kmap_atomic_pfn(unsigned long pfn)
+ {
++	pte_t pte = pfn_pte(pfn, kmap_prot);
+ 	unsigned long vaddr;
+ 	int idx, type;
+ 	struct page *page = pfn_to_page(pfn);
+ 
+-	preempt_disable();
++	preempt_disable_nort();
+ 	pagefault_disable();
+ 	if (!PageHighMem(page))
+ 		return page_address(page);
+ 
+ 	type = kmap_atomic_idx_push();
+-	idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
++	idx = fixmap_idx(type);
+ 	vaddr = __fix_to_virt(idx);
+ #ifdef CONFIG_DEBUG_HIGHMEM
+ 	BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
+ #endif
+-	set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
++#ifdef CONFIG_PREEMPT_RT_FULL
++	current->kmap_pte[type] = pte;
++#endif
++	set_fixmap_pte(idx, pte);
+ 
+ 	return (void *)vaddr;
+ }
++#if defined CONFIG_PREEMPT_RT_FULL
++void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
++{
++	int i;
++
++	/*
++	 * Clear @prev's kmap_atomic mappings
++	 */
++	for (i = 0; i < prev_p->kmap_idx; i++) {
++		int idx = fixmap_idx(i);
++
++		set_fixmap_pte(idx, __pte(0));
++	}
++	/*
++	 * Restore @next_p's kmap_atomic mappings
++	 */
++	for (i = 0; i < next_p->kmap_idx; i++) {
++		int idx = fixmap_idx(i);
++
++		if (!pte_none(next_p->kmap_pte[i]))
++			set_fixmap_pte(idx, next_p->kmap_pte[i]);
++	}
++}
++#endif
+diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
+index c2366510187a..6b60f582b738 100644
+--- a/arch/arm/plat-versatile/platsmp.c
++++ b/arch/arm/plat-versatile/platsmp.c
+@@ -32,7 +32,7 @@ static void write_pen_release(int val)
+ 	sync_cache_w(&pen_release);
+ }
+ 
+-static DEFINE_SPINLOCK(boot_lock);
++static DEFINE_RAW_SPINLOCK(boot_lock);
+ 
+ void versatile_secondary_init(unsigned int cpu)
+ {
+@@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
+ 	/*
+ 	 * Synchronise with the boot thread.
+ 	 */
+-	spin_lock(&boot_lock);
+-	spin_unlock(&boot_lock);
++	raw_spin_lock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ }
+ 
+ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
+@@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * Set synchronisation state between this boot processor
+ 	 * and the secondary one
+ 	 */
+-	spin_lock(&boot_lock);
++	raw_spin_lock(&boot_lock);
+ 
+ 	/*
+ 	 * This is really belt and braces; we hold unintended secondary
+@@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
+ 	 * now the secondary core is starting up let it run its
+ 	 * calibrations, then wait for it to finish
+ 	 */
+-	spin_unlock(&boot_lock);
++	raw_spin_unlock(&boot_lock);
+ 
+ 	return pen_release != -1 ? -ENOSYS : 0;
+ }
+diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
+index bc3f00f586f1..0f3df6d5154a 100644
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -90,6 +90,7 @@ config ARM64
+ 	select HAVE_PERF_EVENTS
+ 	select HAVE_PERF_REGS
+ 	select HAVE_PERF_USER_STACK_DUMP
++	select HAVE_PREEMPT_LAZY
+ 	select HAVE_REGS_AND_STACK_ACCESS_API
+ 	select HAVE_RCU_TABLE_FREE
+ 	select HAVE_SYSCALL_TRACEPOINTS
+@@ -689,7 +690,7 @@ config XEN_DOM0
+ 
+ config XEN
+ 	bool "Xen guest support on ARM64"
+-	depends on ARM64 && OF
++	depends on ARM64 && OF && !PREEMPT_RT_FULL
+ 	select SWIOTLB_XEN
+ 	select PARAVIRT
+ 	help
+diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
+index abd64bd1f6d9..9170788ffa37 100644
+--- a/arch/arm64/include/asm/thread_info.h
++++ b/arch/arm64/include/asm/thread_info.h
+@@ -49,6 +49,7 @@ struct thread_info {
+ 	mm_segment_t		addr_limit;	/* address limit */
+ 	struct task_struct	*task;		/* main task structure */
+ 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
++	int			preempt_lazy_count; /* 0 => preemptable, <0 => bug */
+ 	int			cpu;		/* cpu */
+ };
+ 
+@@ -109,6 +110,7 @@ static inline struct thread_info *current_thread_info(void)
+ #define TIF_NEED_RESCHED	1
+ #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
+ #define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
++#define TIF_NEED_RESCHED_LAZY	4
+ #define TIF_NOHZ		7
+ #define TIF_SYSCALL_TRACE	8
+ #define TIF_SYSCALL_AUDIT	9
+@@ -124,6 +126,7 @@ static inline struct thread_info *current_thread_info(void)
+ #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+ #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
+ #define _TIF_FOREIGN_FPSTATE	(1 << TIF_FOREIGN_FPSTATE)
++#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
+ #define _TIF_NOHZ		(1 << TIF_NOHZ)
+ #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+ #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
+@@ -132,7 +135,8 @@ static inline struct thread_info *current_thread_info(void)
+ #define _TIF_32BIT		(1 << TIF_32BIT)
+ 
+ #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
+-				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
++				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
++				 _TIF_NEED_RESCHED_LAZY)
+ 
+ #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+ 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
+diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
+index 05070b72fc28..acfeddb1283a 100644
+--- a/arch/arm64/kernel/asm-offsets.c
++++ b/arch/arm64/kernel/asm-offsets.c
+@@ -37,6 +37,7 @@ int main(void)
+   BLANK();
+   DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
+   DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
++  DEFINE(TI_PREEMPT_LAZY,	offsetof(struct thread_info, preempt_lazy_count));
+   DEFINE(TI_ADDR_LIMIT,		offsetof(struct thread_info, addr_limit));
+   DEFINE(TI_TASK,		offsetof(struct thread_info, task));
+   DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
+diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
+index 441420ca7d08..404792bdca99 100644
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -434,11 +434,16 @@ ENDPROC(el1_sync)
+ 
+ #ifdef CONFIG_PREEMPT
+ 	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
+-	cbnz	w24, 1f				// preempt count != 0
++	cbnz	w24, 2f				// preempt count != 0
+ 	ldr	x0, [tsk, #TI_FLAGS]		// get flags
+-	tbz	x0, #TIF_NEED_RESCHED, 1f	// needs rescheduling?
+-	bl	el1_preempt
++	tbnz	x0, #TIF_NEED_RESCHED, 1f	// needs rescheduling?
++
++	ldr	w24, [tsk, #TI_PREEMPT_LAZY]	// get preempt lazy count
++	cbnz	w24, 2f				// preempt lazy count != 0
++	tbz	x0, #TIF_NEED_RESCHED_LAZY, 2f	// needs rescheduling?
+ 1:
++	bl	el1_preempt
++2:
+ #endif
+ #ifdef CONFIG_TRACE_IRQFLAGS
+ 	bl	trace_hardirqs_on
+@@ -452,6 +457,7 @@ ENDPROC(el1_irq)
+ 1:	bl	preempt_schedule_irq		// irq en/disable is done inside
+ 	ldr	x0, [tsk, #TI_FLAGS]		// get new tasks TI_FLAGS
+ 	tbnz	x0, #TIF_NEED_RESCHED, 1b	// needs rescheduling?
++	tbnz	x0, #TIF_NEED_RESCHED_LAZY, 1b	// needs rescheduling?
+ 	ret	x24
+ #endif
+ 
+@@ -708,6 +714,7 @@ ENDPROC(cpu_switch_to)
+  */
+ work_pending:
+ 	tbnz	x1, #TIF_NEED_RESCHED, work_resched
++	tbnz	x1, #TIF_NEED_RESCHED_LAZY, work_resched
+ 	/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
+ 	mov	x0, sp				// 'regs'
+ 	enable_irq				// enable interrupts for do_notify_resume()
+diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
+index 212ff92920d2..71ad38d3d76b 100644
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -2480,7 +2480,7 @@ config MIPS_ASID_BITS_VARIABLE
+ #
+ config HIGHMEM
+ 	bool "High Memory Support"
+-	depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
++	depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
+ 
+ config CPU_SUPPORTS_HIGHMEM
+ 	bool
+diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
+index 792cb1768c8f..ddf5a0fdb25a 100644
+--- a/arch/powerpc/Kconfig
++++ b/arch/powerpc/Kconfig
+@@ -57,10 +57,11 @@ config LOCKDEP_SUPPORT
+ 
+ config RWSEM_GENERIC_SPINLOCK
+ 	bool
++	default y if PREEMPT_RT_FULL
+ 
+ config RWSEM_XCHGADD_ALGORITHM
+ 	bool
+-	default y
++	default y if !PREEMPT_RT_FULL
+ 
+ config GENERIC_LOCKBREAK
+ 	bool
+@@ -140,6 +141,7 @@ config PPC
+ 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ 	select GENERIC_STRNCPY_FROM_USER
+ 	select GENERIC_STRNLEN_USER
++	select HAVE_PREEMPT_LAZY
+ 	select HAVE_MOD_ARCH_SPECIFIC
+ 	select MODULES_USE_ELF_RELA
+ 	select CLONE_BACKWARDS
+@@ -326,7 +328,7 @@ menu "Kernel options"
+ 
+ config HIGHMEM
+ 	bool "High memory support"
+-	depends on PPC32
++	depends on PPC32 && !PREEMPT_RT_FULL
+ 
+ source kernel/Kconfig.hz
+ source kernel/Kconfig.preempt
+diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
+index 87e4b2d8dcd4..981e501a4359 100644
+--- a/arch/powerpc/include/asm/thread_info.h
++++ b/arch/powerpc/include/asm/thread_info.h
+@@ -43,6 +43,8 @@ struct thread_info {
+ 	int		cpu;			/* cpu we're on */
+ 	int		preempt_count;		/* 0 => preemptable,
+ 						   <0 => BUG */
++	int		preempt_lazy_count;	/* 0 => preemptable,
++						   <0 => BUG */
+ 	unsigned long	local_flags;		/* private flags for thread */
+ #ifdef CONFIG_LIVEPATCH
+ 	unsigned long *livepatch_sp;
+@@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
+ #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
+ #define TIF_SIGPENDING		1	/* signal pending */
+ #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
+-#define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
+-					   TIF_NEED_RESCHED */
++#define TIF_NEED_RESCHED_LAZY	3	/* lazy rescheduling necessary */
+ #define TIF_32BIT		4	/* 32 bit binary */
+ #define TIF_RESTORE_TM		5	/* need to restore TM FP/VEC/VSX */
+ #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+@@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
+ #if defined(CONFIG_PPC64)
+ #define TIF_ELF2ABI		18	/* function descriptors must die! */
+ #endif
++#define TIF_POLLING_NRFLAG	19	/* true if poll_idle() is polling
++					   TIF_NEED_RESCHED */
+ 
+ /* as above, but as bit values */
+ #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
+@@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
+ #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
+ #define _TIF_EMULATE_STACK_STORE	(1<<TIF_EMULATE_STACK_STORE)
+ #define _TIF_NOHZ		(1<<TIF_NOHZ)
++#define _TIF_NEED_RESCHED_LAZY	(1<<TIF_NEED_RESCHED_LAZY)
+ #define _TIF_SYSCALL_DOTRACE	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+ 				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
+ 				 _TIF_NOHZ)
+ 
+ #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
+ 				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+-				 _TIF_RESTORE_TM)
++				 _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
+ #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
++#define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
+ 
+ /* Bits in local_flags */
+ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
+diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
+index b89d14c0352c..81ae8f4c88f6 100644
+--- a/arch/powerpc/kernel/asm-offsets.c
++++ b/arch/powerpc/kernel/asm-offsets.c
+@@ -156,6 +156,7 @@ int main(void)
+ 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
+ 	DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
+ 	DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
++	DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
+ 	DEFINE(TI_TASK, offsetof(struct thread_info, task));
+ 	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
+ 
+diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
+index 9899032230b4..f95b93f46c47 100644
+--- a/arch/powerpc/kernel/entry_32.S
++++ b/arch/powerpc/kernel/entry_32.S
+@@ -835,7 +835,14 @@ user_exc_return:		/* r10 contains MSR_KERNEL here */
+ 	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
+ 	bne	restore
+ 	andi.	r8,r8,_TIF_NEED_RESCHED
++	bne+	1f
++	lwz	r0,TI_PREEMPT_LAZY(r9)
++	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
++	bne	restore
++	lwz	r0,TI_FLAGS(r9)
++	andi.	r0,r0,_TIF_NEED_RESCHED_LAZY
+ 	beq+	restore
++1:
+ 	lwz	r3,_MSR(r1)
+ 	andi.	r0,r3,MSR_EE	/* interrupts off? */
+ 	beq	restore		/* don't schedule if so */
+@@ -846,11 +853,11 @@ user_exc_return:		/* r10 contains MSR_KERNEL here */
+ 	 */
+ 	bl	trace_hardirqs_off
+ #endif
+-1:	bl	preempt_schedule_irq
++2:	bl	preempt_schedule_irq
+ 	CURRENT_THREAD_INFO(r9, r1)
+ 	lwz	r3,TI_FLAGS(r9)
+-	andi.	r0,r3,_TIF_NEED_RESCHED
+-	bne-	1b
++	andi.	r0,r3,_TIF_NEED_RESCHED_MASK
++	bne-	2b
+ #ifdef CONFIG_TRACE_IRQFLAGS
+ 	/* And now, to properly rebalance the above, we tell lockdep they
+ 	 * are being turned back on, which will happen when we return
+@@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
+ #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
+ 
+ do_work:			/* r10 contains MSR_KERNEL here */
+-	andi.	r0,r9,_TIF_NEED_RESCHED
++	andi.	r0,r9,_TIF_NEED_RESCHED_MASK
+ 	beq	do_user_signal
+ 
+ do_resched:			/* r10 contains MSR_KERNEL here */
+@@ -1192,7 +1199,7 @@ do_resched:			/* r10 contains MSR_KERNEL here */
+ 	MTMSRD(r10)		/* disable interrupts */
+ 	CURRENT_THREAD_INFO(r9, r1)
+ 	lwz	r9,TI_FLAGS(r9)
+-	andi.	r0,r9,_TIF_NEED_RESCHED
++	andi.	r0,r9,_TIF_NEED_RESCHED_MASK
+ 	bne-	do_resched
+ 	andi.	r0,r9,_TIF_USER_WORK_MASK
+ 	beq	restore_user
+diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
+index 5afd03e5e8b8..f5d4c2a033ef 100644
+--- a/arch/powerpc/kernel/entry_64.S
++++ b/arch/powerpc/kernel/entry_64.S
+@@ -657,7 +657,7 @@ _GLOBAL(ret_from_except_lite)
+ 	bl	restore_math
+ 	b	restore
+ #endif
+-1:	andi.	r0,r4,_TIF_NEED_RESCHED
++1:	andi.	r0,r4,_TIF_NEED_RESCHED_MASK
+ 	beq	2f
+ 	bl	restore_interrupts
+ 	SCHEDULE_USER
+@@ -719,10 +719,18 @@ _GLOBAL(ret_from_except_lite)
+ 
+ #ifdef CONFIG_PREEMPT
+ 	/* Check if we need to preempt */
+-	andi.	r0,r4,_TIF_NEED_RESCHED
+-	beq+	restore
+-	/* Check that preempt_count() == 0 and interrupts are enabled */
+ 	lwz	r8,TI_PREEMPT(r9)
++	cmpwi	0,r8,0		/* if non-zero, just restore regs and return */
++	bne	restore
++	andi.	r0,r4,_TIF_NEED_RESCHED
++	bne+	check_count
++
++	andi.	r0,r4,_TIF_NEED_RESCHED_LAZY
++	beq+	restore
++	lwz	r8,TI_PREEMPT_LAZY(r9)
++
++	/* Check that preempt_count() == 0 and interrupts are enabled */
++check_count:
+ 	cmpwi	cr1,r8,0
+ 	ld	r0,SOFTE(r1)
+ 	cmpdi	r0,0
+@@ -739,7 +747,7 @@ _GLOBAL(ret_from_except_lite)
+ 	/* Re-test flags and eventually loop */
+ 	CURRENT_THREAD_INFO(r9, r1)
+ 	ld	r4,TI_FLAGS(r9)
+-	andi.	r0,r4,_TIF_NEED_RESCHED
++	andi.	r0,r4,_TIF_NEED_RESCHED_MASK
+ 	bne	1b
+ 
+ 	/*
+diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
+index 08887cf2b20e..f1770ea2d094 100644
+--- a/arch/powerpc/kernel/irq.c
++++ b/arch/powerpc/kernel/irq.c
+@@ -633,6 +633,7 @@ void irq_ctx_init(void)
+ 	}
+ }
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ void do_softirq_own_stack(void)
+ {
+ 	struct thread_info *curtp, *irqtp;
+@@ -650,6 +651,7 @@ void do_softirq_own_stack(void)
+ 	if (irqtp->flags)
+ 		set_bits(irqtp->flags, &curtp->flags);
+ }
++#endif
+ 
+ irq_hw_number_t virq_to_hw(unsigned int virq)
+ {
+diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
+index d9c912b6e632..7b2e997a5083 100644
+--- a/arch/powerpc/kernel/misc_32.S
++++ b/arch/powerpc/kernel/misc_32.S
+@@ -40,6 +40,7 @@
+  * We store the saved ksp_limit in the unused part
+  * of the STACK_FRAME_OVERHEAD
+  */
++#ifndef CONFIG_PREEMPT_RT_FULL
+ _GLOBAL(call_do_softirq)
+ 	mflr	r0
+ 	stw	r0,4(r1)
+@@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
+ 	stw	r10,THREAD+KSP_LIMIT(r2)
+ 	mtlr	r0
+ 	blr
++#endif
+ 
+ /*
+  * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
+index cb195157b318..c919a2bfd0ca 100644
+--- a/arch/powerpc/kernel/misc_64.S
++++ b/arch/powerpc/kernel/misc_64.S
+@@ -30,6 +30,7 @@
+ 
+ 	.text
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ _GLOBAL(call_do_softirq)
+ 	mflr	r0
+ 	std	r0,16(r1)
+@@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
+ 	ld	r0,16(r1)
+ 	mtlr	r0
+ 	blr
++#endif
+ 
+ _GLOBAL(call_do_irq)
+ 	mflr	r0
+diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
+index c2024ac9d4e8..2303788da7e1 100644
+--- a/arch/powerpc/kvm/Kconfig
++++ b/arch/powerpc/kvm/Kconfig
+@@ -172,6 +172,7 @@ config KVM_E500MC
+ config KVM_MPIC
+ 	bool "KVM in-kernel MPIC emulation"
+ 	depends on KVM && E500
++	depends on !PREEMPT_RT_FULL
+ 	select HAVE_KVM_IRQCHIP
+ 	select HAVE_KVM_IRQFD
+ 	select HAVE_KVM_IRQ_ROUTING
+diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
+index 57caaf11a83f..030c9bfe52e3 100644
+--- a/arch/powerpc/platforms/ps3/device-init.c
++++ b/arch/powerpc/platforms/ps3/device-init.c
+@@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
+ 	}
+ 	pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
+ 
+-	res = wait_event_interruptible(dev->done.wait,
++	res = swait_event_interruptible(dev->done.wait,
+ 				       dev->done.done || kthread_should_stop());
+ 	if (kthread_should_stop())
+ 		res = -EINTR;
+diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
+index 6c0378c0b8b5..abd58b4dff97 100644
+--- a/arch/sh/kernel/irq.c
++++ b/arch/sh/kernel/irq.c
+@@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
+ 	hardirq_ctx[cpu] = NULL;
+ }
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ void do_softirq_own_stack(void)
+ {
+ 	struct thread_info *curctx;
+@@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
+ 		  "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
+ 	);
+ }
++#endif
+ #else
+ static inline void handle_one_irq(unsigned int irq)
+ {
+diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
+index 59b09600dd32..1b073eb3dc2a 100644
+--- a/arch/sparc/Kconfig
++++ b/arch/sparc/Kconfig
+@@ -187,12 +187,10 @@ config NR_CPUS
+ source kernel/Kconfig.hz
+ 
+ config RWSEM_GENERIC_SPINLOCK
+-	bool
+-	default y if SPARC32
++	def_bool PREEMPT_RT_FULL
+ 
+ config RWSEM_XCHGADD_ALGORITHM
+-	bool
+-	default y if SPARC64
++	def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
+ 
+ config GENERIC_HWEIGHT
+ 	bool
+diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
+index 34a7930b76ef..773740521008 100644
+--- a/arch/sparc/kernel/irq_64.c
++++ b/arch/sparc/kernel/irq_64.c
+@@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
+ 	set_irq_regs(old_regs);
+ }
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ void do_softirq_own_stack(void)
+ {
+ 	void *orig_sp, *sp = softirq_stack[smp_processor_id()];
+@@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
+ 	__asm__ __volatile__("mov %0, %%sp"
+ 			     : : "r" (orig_sp));
+ }
++#endif
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+ void fixup_irqs(void)
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index 2a1f0ce7c59a..bd4ab87efb31 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -17,6 +17,7 @@ config X86_64
+ ### Arch settings
+ config X86
+ 	def_bool y
++	select HAVE_PREEMPT_LAZY
+ 	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
+ 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+ 	select ANON_INODES
+@@ -231,8 +232,11 @@ config ARCH_MAY_HAVE_PC_FDC
+ 	def_bool y
+ 	depends on ISA_DMA_API
+ 
++config RWSEM_GENERIC_SPINLOCK
++	def_bool PREEMPT_RT_FULL
++
+ config RWSEM_XCHGADD_ALGORITHM
+-	def_bool y
++	def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
+ 
+ config GENERIC_CALIBRATE_DELAY
+ 	def_bool y
+@@ -885,7 +889,7 @@ config IOMMU_HELPER
+ config MAXSMP
+ 	bool "Enable Maximum number of SMP Processors and NUMA Nodes"
+ 	depends on X86_64 && SMP && DEBUG_KERNEL
+-	select CPUMASK_OFFSTACK
++	select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
+ 	---help---
+ 	  Enable maximum number of CPUS and NUMA Nodes for this architecture.
+ 	  If unsure, say N.
+diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
+index 0ab5ee1c26af..fff8f6f1f90c 100644
+--- a/arch/x86/crypto/aesni-intel_glue.c
++++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
+ 	err = blkcipher_walk_virt(desc, &walk);
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+-	kernel_fpu_begin();
+ 	while ((nbytes = walk.nbytes)) {
++		kernel_fpu_begin();
+ 		aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+-			      nbytes & AES_BLOCK_MASK);
++				nbytes & AES_BLOCK_MASK);
++		kernel_fpu_end();
+ 		nbytes &= AES_BLOCK_SIZE - 1;
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+-	kernel_fpu_end();
+ 
+ 	return err;
+ }
+@@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
+ 	err = blkcipher_walk_virt(desc, &walk);
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+-	kernel_fpu_begin();
+ 	while ((nbytes = walk.nbytes)) {
++		kernel_fpu_begin();
+ 		aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ 			      nbytes & AES_BLOCK_MASK);
++		kernel_fpu_end();
+ 		nbytes &= AES_BLOCK_SIZE - 1;
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+-	kernel_fpu_end();
+ 
+ 	return err;
+ }
+@@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
+ 	err = blkcipher_walk_virt(desc, &walk);
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+-	kernel_fpu_begin();
+ 	while ((nbytes = walk.nbytes)) {
++		kernel_fpu_begin();
+ 		aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ 			      nbytes & AES_BLOCK_MASK, walk.iv);
++		kernel_fpu_end();
+ 		nbytes &= AES_BLOCK_SIZE - 1;
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+-	kernel_fpu_end();
+ 
+ 	return err;
+ }
+@@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
+ 	err = blkcipher_walk_virt(desc, &walk);
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+-	kernel_fpu_begin();
+ 	while ((nbytes = walk.nbytes)) {
++		kernel_fpu_begin();
+ 		aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ 			      nbytes & AES_BLOCK_MASK, walk.iv);
++		kernel_fpu_end();
+ 		nbytes &= AES_BLOCK_SIZE - 1;
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+-	kernel_fpu_end();
+ 
+ 	return err;
+ }
+@@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
+ 	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+-	kernel_fpu_begin();
+ 	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
++		kernel_fpu_begin();
+ 		aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ 			              nbytes & AES_BLOCK_MASK, walk.iv);
++		kernel_fpu_end();
+ 		nbytes &= AES_BLOCK_SIZE - 1;
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+ 	if (walk.nbytes) {
++		kernel_fpu_begin();
+ 		ctr_crypt_final(ctx, &walk);
++		kernel_fpu_end();
+ 		err = blkcipher_walk_done(desc, &walk, 0);
+ 	}
+-	kernel_fpu_end();
+ 
+ 	return err;
+ }
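+
+ The aesni hunks above, and the cast5/glue_helper hunks that follow, all apply one pattern: rather than holding the FPU (and with it preemption) across an entire blkcipher walk, kernel_fpu_begin()/kernel_fpu_end() move inside the loop so preemption is only disabled for a single block batch. A condensed sketch of the before/after shape, where enc_one_batch() is a made-up stand-in for the cipher-specific helper:
+
+     /* before: FPU (and preemption) held for the whole walk */
+     kernel_fpu_begin();
+     while ((nbytes = walk.nbytes)) {
+         enc_one_batch(ctx, &walk);              /* hypothetical helper */
+         nbytes &= AES_BLOCK_SIZE - 1;
+         err = blkcipher_walk_done(desc, &walk, nbytes);
+     }
+     kernel_fpu_end();
+
+     /* after: the FPU section covers one batch, so -rt can preempt between batches */
+     while ((nbytes = walk.nbytes)) {
+         kernel_fpu_begin();
+         enc_one_batch(ctx, &walk);              /* hypothetical helper */
+         kernel_fpu_end();
+         nbytes &= AES_BLOCK_SIZE - 1;
+         err = blkcipher_walk_done(desc, &walk, nbytes);
+     }
+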
+diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
+index 8648158f3916..d7699130ee36 100644
+--- a/arch/x86/crypto/cast5_avx_glue.c
++++ b/arch/x86/crypto/cast5_avx_glue.c
+@@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
+ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ 		     bool enc)
+ {
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ 	const unsigned int bsize = CAST5_BLOCK_SIZE;
+ 	unsigned int nbytes;
+@@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ 		u8 *wsrc = walk->src.virt.addr;
+ 		u8 *wdst = walk->dst.virt.addr;
+ 
+-		fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
++		fpu_enabled = cast5_fpu_begin(false, nbytes);
+ 
+ 		/* Process multi-block batch */
+ 		if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+@@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ 		} while (nbytes >= bsize);
+ 
+ done:
++		cast5_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, walk, nbytes);
+ 	}
+-
+-	cast5_fpu_end(fpu_enabled);
+ 	return err;
+ }
+ 
+@@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ 		       struct scatterlist *src, unsigned int nbytes)
+ {
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct blkcipher_walk walk;
+ 	int err;
+ 
+@@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+ 	while ((nbytes = walk.nbytes)) {
+-		fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
++		fpu_enabled = cast5_fpu_begin(false, nbytes);
+ 		nbytes = __cbc_decrypt(desc, &walk);
++		cast5_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+-
+-	cast5_fpu_end(fpu_enabled);
+ 	return err;
+ }
+ 
+@@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ 		     struct scatterlist *src, unsigned int nbytes)
+ {
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct blkcipher_walk walk;
+ 	int err;
+ 
+@@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ 
+ 	while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
+-		fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
++		fpu_enabled = cast5_fpu_begin(false, nbytes);
+ 		nbytes = __ctr_crypt(desc, &walk);
++		cast5_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+ 
+-	cast5_fpu_end(fpu_enabled);
+-
+ 	if (walk.nbytes) {
+ 		ctr_crypt_final(desc, &walk);
+ 		err = blkcipher_walk_done(desc, &walk, 0);
+diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
+index 6a85598931b5..3a506ce7ed93 100644
+--- a/arch/x86/crypto/glue_helper.c
++++ b/arch/x86/crypto/glue_helper.c
+@@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+ 	void *ctx = crypto_blkcipher_ctx(desc->tfm);
+ 	const unsigned int bsize = 128 / 8;
+ 	unsigned int nbytes, i, func_bytes;
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	int err;
+ 
+ 	err = blkcipher_walk_virt(desc, walk);
+@@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+ 		u8 *wdst = walk->dst.virt.addr;
+ 
+ 		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+-					     desc, fpu_enabled, nbytes);
++					     desc, false, nbytes);
+ 
+ 		for (i = 0; i < gctx->num_funcs; i++) {
+ 			func_bytes = bsize * gctx->funcs[i].num_blocks;
+@@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
+ 		}
+ 
+ done:
++		glue_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, walk, nbytes);
+ 	}
+ 
+-	glue_fpu_end(fpu_enabled);
+ 	return err;
+ }
+ 
+@@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+ 			    struct scatterlist *src, unsigned int nbytes)
+ {
+ 	const unsigned int bsize = 128 / 8;
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct blkcipher_walk walk;
+ 	int err;
+ 
+@@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
+ 
+ 	while ((nbytes = walk.nbytes)) {
+ 		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+-					     desc, fpu_enabled, nbytes);
++					     desc, false, nbytes);
+ 		nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
++		glue_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+ 
+-	glue_fpu_end(fpu_enabled);
+ 	return err;
+ }
+ EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
+@@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+ 			  struct scatterlist *src, unsigned int nbytes)
+ {
+ 	const unsigned int bsize = 128 / 8;
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct blkcipher_walk walk;
+ 	int err;
+ 
+@@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
+ 
+ 	while ((nbytes = walk.nbytes) >= bsize) {
+ 		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+-					     desc, fpu_enabled, nbytes);
++					     desc, false, nbytes);
+ 		nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
++		glue_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 	}
+ 
+-	glue_fpu_end(fpu_enabled);
+-
+ 	if (walk.nbytes) {
+ 		glue_ctr_crypt_final_128bit(
+ 			gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
+@@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ 			  void *tweak_ctx, void *crypt_ctx)
+ {
+ 	const unsigned int bsize = 128 / 8;
+-	bool fpu_enabled = false;
++	bool fpu_enabled;
+ 	struct blkcipher_walk walk;
+ 	int err;
+ 
+@@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ 
+ 	/* set minimum length to bsize, for tweak_fn */
+ 	fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+-				     desc, fpu_enabled,
++				     desc, false,
+ 				     nbytes < bsize ? bsize : nbytes);
+-
+ 	/* calculate first value of T */
+ 	tweak_fn(tweak_ctx, walk.iv, walk.iv);
++	glue_fpu_end(fpu_enabled);
+ 
+ 	while (nbytes) {
++		fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
++				desc, false, nbytes);
+ 		nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+ 
++		glue_fpu_end(fpu_enabled);
+ 		err = blkcipher_walk_done(desc, &walk, nbytes);
+ 		nbytes = walk.nbytes;
+ 	}
+-
+-	glue_fpu_end(fpu_enabled);
+-
+ 	return err;
+ }
+ EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
+diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
+index 1433f6b4607d..f963fde8e4fa 100644
+--- a/arch/x86/entry/common.c
++++ b/arch/x86/entry/common.c
+@@ -136,7 +136,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
+ 
+ #define EXIT_TO_USERMODE_LOOP_FLAGS				\
+ 	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
+-	 _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
++	 _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
+ 
+ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
+ {
+@@ -152,9 +152,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
+ 		/* We have work to do. */
+ 		local_irq_enable();
+ 
+-		if (cached_flags & _TIF_NEED_RESCHED)
++		if (cached_flags & _TIF_NEED_RESCHED_MASK)
+ 			schedule();
+ 
++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
++		if (unlikely(current->forced_info.si_signo)) {
++			struct task_struct *t = current;
++			force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
++			t->forced_info.si_signo = 0;
++		}
++#endif
+ 		if (cached_flags & _TIF_UPROBE)
+ 			uprobe_notify_resume(regs);
+ 
+diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
+index 0b56666e6039..1d8ee026c9c5 100644
+--- a/arch/x86/entry/entry_32.S
++++ b/arch/x86/entry/entry_32.S
+@@ -271,8 +271,25 @@ END(ret_from_exception)
+ ENTRY(resume_kernel)
+ 	DISABLE_INTERRUPTS(CLBR_ANY)
+ need_resched:
++	# preempt count == 0 + NEED_RS set?
+ 	cmpl	$0, PER_CPU_VAR(__preempt_count)
++#ifndef CONFIG_PREEMPT_LAZY
+ 	jnz	restore_all
++#else
++	jz test_int_off
++
++	# at least preempt count == 0 ?
++	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
++	jne restore_all
++
++	GET_THREAD_INFO(%ebp)
++	cmpl $0,TI_preempt_lazy_count(%ebp)	# non-zero preempt_lazy_count ?
++	jnz restore_all
++
++	testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
++	jz restore_all
++test_int_off:
++#endif
+ 	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
+ 	jz	restore_all
+ 	call	preempt_schedule_irq
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index 02fff3ebfb87..81ec3d016df0 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -512,7 +512,23 @@ GLOBAL(retint_user)
+ 	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
+ 	jnc	1f
+ 0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
++#ifndef CONFIG_PREEMPT_LAZY
+ 	jnz	1f
++#else
++	jz	do_preempt_schedule_irq
++
++	# at least preempt count == 0 ?
++	cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
++	jnz	1f
++
++	GET_THREAD_INFO(%rcx)
++	cmpl	$0, TI_preempt_lazy_count(%rcx)
++	jnz	1f
++
++	bt	$TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
++	jnc	1f
++do_preempt_schedule_irq:
++#endif
+ 	call	preempt_schedule_irq
+ 	jmp	0b
+ 1:
+@@ -817,6 +833,7 @@ END(native_load_gs_index)
+ 	jmp	2b
+ 	.previous
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ /* Call softirq on interrupt stack. Interrupts are off. */
+ ENTRY(do_softirq_own_stack)
+ 	pushq	%rbp
+@@ -829,6 +846,7 @@ ENTRY(do_softirq_own_stack)
+ 	decl	PER_CPU_VAR(irq_count)
+ 	ret
+ END(do_softirq_own_stack)
++#endif
+ 
+ #ifdef CONFIG_XEN
+ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
+index 17f218645701..11bd1b7ee6eb 100644
+--- a/arch/x86/include/asm/preempt.h
++++ b/arch/x86/include/asm/preempt.h
+@@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
+  * a decrement which hits zero means we have no preempt_count and should
+  * reschedule.
+  */
+-static __always_inline bool __preempt_count_dec_and_test(void)
++static __always_inline bool ____preempt_count_dec_and_test(void)
+ {
+ 	GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+ }
+ 
++static __always_inline bool __preempt_count_dec_and_test(void)
++{
++	if (____preempt_count_dec_and_test())
++		return true;
++#ifdef CONFIG_PREEMPT_LAZY
++	if (current_thread_info()->preempt_lazy_count)
++		return false;
++	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
++#else
++	return false;
++#endif
++}
++
+ /*
+  * Returns true when we need to resched and can (barring IRQ state).
+  */
+ static __always_inline bool should_resched(int preempt_offset)
+ {
++#ifdef CONFIG_PREEMPT_LAZY
++	u32 tmp;
++
++	tmp = raw_cpu_read_4(__preempt_count);
++	if (tmp == preempt_offset)
++		return true;
++
++	/* preempt count == 0 ? */
++	tmp &= ~PREEMPT_NEED_RESCHED;
++	if (tmp)
++		return false;
++	if (current_thread_info()->preempt_lazy_count)
++		return false;
++	return test_thread_flag(TIF_NEED_RESCHED_LAZY);
++#else
+ 	return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
++#endif
+ }
+ 
+ #ifdef CONFIG_PREEMPT
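+
+ The __preempt_count_dec_and_test()/should_resched() changes above are the heart of PREEMPT_LAZY on x86: an ordinary TIF_NEED_RESCHED is honoured whenever the preempt count drops to zero, while the new TIF_NEED_RESCHED_LAZY is additionally gated on preempt_lazy_count being zero. A small stand-alone model of that decision order (plain user-space C, all names invented for illustration, not the kernel code itself):
+
+     #include <stdbool.h>
+     #include <stdio.h>
+
+     struct cpu_state {
+         int  preempt_count;       /* models the per-CPU preempt count         */
+         int  preempt_lazy_count;  /* models thread_info->preempt_lazy_count   */
+         bool need_resched;        /* models TIF_NEED_RESCHED                  */
+         bool need_resched_lazy;   /* models TIF_NEED_RESCHED_LAZY             */
+     };
+
+     static bool should_preempt(const struct cpu_state *s)
+     {
+         if (s->preempt_count)
+             return false;             /* hard-disabled: never preempt          */
+         if (s->need_resched)
+             return true;              /* real request: act once count is zero  */
+         if (s->preempt_lazy_count)
+             return false;             /* lazy-disabled: defer lazy requests    */
+         return s->need_resched_lazy;  /* lazy request: act only now            */
+     }
+
+     int main(void)
+     {
+         struct cpu_state s = { 0, 1, false, true };
+         printf("preempt now? %d\n", should_preempt(&s)); /* 0: deferred */
+         s.preempt_lazy_count = 0;
+         printf("preempt now? %d\n", should_preempt(&s)); /* 1 */
+         return 0;
+     }
+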
+diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
+index dd1e7d6387ab..d59bedb28bab 100644
+--- a/arch/x86/include/asm/signal.h
++++ b/arch/x86/include/asm/signal.h
+@@ -23,6 +23,19 @@ typedef struct {
+ 	unsigned long sig[_NSIG_WORDS];
+ } sigset_t;
+ 
++/*
++ * Because some traps use the IST stack, we must keep preemption
++ * disabled while calling do_trap(), but do_trap() may call
++ * force_sig_info() which will grab the signal spin_locks for the
++ * task, which in PREEMPT_RT_FULL are mutexes.  By defining
++ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
++ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
++ * trap.
++ */
++#if defined(CONFIG_PREEMPT_RT_FULL)
++#define ARCH_RT_DELAYS_SIGNAL_SEND
++#endif
++
+ #ifndef CONFIG_COMPAT
+ typedef sigset_t compat_sigset_t;
+ #endif
+diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
+index 58505f01962f..02fa39652cd6 100644
+--- a/arch/x86/include/asm/stackprotector.h
++++ b/arch/x86/include/asm/stackprotector.h
+@@ -59,7 +59,7 @@
+  */
+ static __always_inline void boot_init_stack_canary(void)
+ {
+-	u64 canary;
++	u64 uninitialized_var(canary);
+ 	u64 tsc;
+ 
+ #ifdef CONFIG_X86_64
+@@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
+ 	 * of randomness. The TSC only matters for very early init,
+ 	 * there it already has some randomness on most systems. Later
+ 	 * on during the bootup the random pool has true entropy too.
++	 *
++	 * For preempt-rt we need to weaken the randomness a bit, as
++	 * we can't call into the random generator from atomic context
++	 * due to locking constraints. We just leave canary
++	 * uninitialized and use the TSC based randomness on top of it.
+ 	 */
++#ifndef CONFIG_PREEMPT_RT_FULL
+ 	get_random_bytes(&canary, sizeof(canary));
++#endif
+ 	tsc = rdtsc();
+ 	canary += tsc + (tsc << 32UL);
+ 
+diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
+index 8b7c8d8e0852..631059ef61da 100644
+--- a/arch/x86/include/asm/thread_info.h
++++ b/arch/x86/include/asm/thread_info.h
+@@ -57,6 +57,8 @@ struct thread_info {
+ 	__u32			flags;		/* low level flags */
+ 	__u32			status;		/* thread synchronous flags */
+ 	__u32			cpu;		/* current CPU */
++	int			preempt_lazy_count;	/* 0 => lazy preemptable
++							   <0 => BUG */
+ };
+ 
+ #define INIT_THREAD_INFO(tsk)			\
+@@ -73,6 +75,10 @@ struct thread_info {
+ 
+ #include <asm/asm-offsets.h>
+ 
++#define GET_THREAD_INFO(reg) \
++	_ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
++	_ASM_SUB $(THREAD_SIZE),reg ;
++
+ #endif
+ 
+ /*
+@@ -91,6 +97,7 @@ struct thread_info {
+ #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
+ #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
+ #define TIF_SECCOMP		8	/* secure computing */
++#define TIF_NEED_RESCHED_LAZY	9	/* lazy rescheduling necessary */
+ #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
+ #define TIF_UPROBE		12	/* breakpointed or singlestepping */
+ #define TIF_NOTSC		16	/* TSC is not accessible in userland */
+@@ -115,6 +122,7 @@ struct thread_info {
+ #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
+ #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
+ #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
++#define _TIF_NEED_RESCHED_LAZY	(1 << TIF_NEED_RESCHED_LAZY)
+ #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
+ #define _TIF_UPROBE		(1 << TIF_UPROBE)
+ #define _TIF_NOTSC		(1 << TIF_NOTSC)
+@@ -151,6 +159,8 @@ struct thread_info {
+ #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
+ #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
+ 
++#define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
++
+ #define STACK_WARN		(THREAD_SIZE/8)
+ 
+ /*
+diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
+index cc44d926c17e..df278aa0f638 100644
+--- a/arch/x86/include/asm/uv/uv_bau.h
++++ b/arch/x86/include/asm/uv/uv_bau.h
+@@ -615,9 +615,9 @@ struct bau_control {
+ 	cycles_t		send_message;
+ 	cycles_t		period_end;
+ 	cycles_t		period_time;
+-	spinlock_t		uvhub_lock;
+-	spinlock_t		queue_lock;
+-	spinlock_t		disable_lock;
++	raw_spinlock_t		uvhub_lock;
++	raw_spinlock_t		queue_lock;
++	raw_spinlock_t		disable_lock;
+ 	/* tunables */
+ 	int			max_concurr;
+ 	int			max_concurr_const;
+@@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
+  * to be lowered below the current 'v'.  atomic_add_unless can only stop
+  * on equal.
+  */
+-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
++static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
+ {
+-	spin_lock(lock);
++	raw_spin_lock(lock);
+ 	if (atomic_read(v) >= u) {
+-		spin_unlock(lock);
++		raw_spin_unlock(lock);
+ 		return 0;
+ 	}
+ 	atomic_inc(v);
+-	spin_unlock(lock);
++	raw_spin_unlock(lock);
+ 	return 1;
+ }
+ 
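+
+ The uv_bau.h and tlb_uv.c hunks convert these three locks from spinlock_t to raw_spinlock_t. On PREEMPT_RT_FULL a plain spinlock_t becomes a sleeping rtmutex, which is not allowed in the contexts the BAU code takes these locks from, so they must remain true spinning locks. A minimal sketch of the raw variant's API, with an invented lock name:
+
+     #include <linux/spinlock.h>
+
+     static DEFINE_RAW_SPINLOCK(example_lock);   /* illustrative name only */
+
+     static void example_critical_section(void)
+     {
+         unsigned long flags;
+
+         raw_spin_lock_irqsave(&example_lock, flags);
+         /* short, non-sleeping work only */
+         raw_spin_unlock_irqrestore(&example_lock, flags);
+     }
+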
+diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
+index fbd19444403f..e78f477a4ae3 100644
+--- a/arch/x86/kernel/acpi/boot.c
++++ b/arch/x86/kernel/acpi/boot.c
+@@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+  *		->ioapic_mutex
+  *			->ioapic_lock
+  */
++#ifdef CONFIG_X86_IO_APIC
+ static DEFINE_MUTEX(acpi_ioapic_lock);
++#endif
+ 
+ /* --------------------------------------------------------------------------
+                               Boot-time Configuration
+diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
+index 48e6d84f173e..0b5a8b994f65 100644
+--- a/arch/x86/kernel/apic/io_apic.c
++++ b/arch/x86/kernel/apic/io_apic.c
+@@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
+ static inline bool ioapic_irqd_mask(struct irq_data *data)
+ {
+ 	/* If we are moving the irq we need to mask it */
+-	if (unlikely(irqd_is_setaffinity_pending(data))) {
++	if (unlikely(irqd_is_setaffinity_pending(data) &&
++		     !irqd_irq_inprogress(data))) {
+ 		mask_ioapic_irq(data);
+ 		return true;
+ 	}
+diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
+index 2bd5c6ff7ee7..a2c317f5839b 100644
+--- a/arch/x86/kernel/asm-offsets.c
++++ b/arch/x86/kernel/asm-offsets.c
+@@ -31,6 +31,7 @@ void common(void) {
+ 	BLANK();
+ 	OFFSET(TI_flags, thread_info, flags);
+ 	OFFSET(TI_status, thread_info, status);
++	OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
+ 
+ 	BLANK();
+ 	OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
+@@ -88,4 +89,5 @@ void common(void) {
+ 
+ 	BLANK();
+ 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
++	DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
+ }
+diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
+index 79d8ec849468..accbf0e806d0 100644
+--- a/arch/x86/kernel/cpu/mcheck/mce.c
++++ b/arch/x86/kernel/cpu/mcheck/mce.c
+@@ -41,6 +41,8 @@
+ #include <linux/debugfs.h>
+ #include <linux/irq_work.h>
+ #include <linux/export.h>
++#include <linux/jiffies.h>
++#include <linux/swork.h>
+ 
+ #include <asm/processor.h>
+ #include <asm/traps.h>
+@@ -1291,7 +1293,7 @@ void mce_log_therm_throt_event(__u64 status)
+ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+ 
+ static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+-static DEFINE_PER_CPU(struct timer_list, mce_timer);
++static DEFINE_PER_CPU(struct hrtimer, mce_timer);
+ 
+ static unsigned long mce_adjust_timer_default(unsigned long interval)
+ {
+@@ -1300,32 +1302,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
+ 
+ static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
+ 
+-static void __restart_timer(struct timer_list *t, unsigned long interval)
++static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
+ {
+-	unsigned long when = jiffies + interval;
+-	unsigned long flags;
+-
+-	local_irq_save(flags);
+-
+-	if (timer_pending(t)) {
+-		if (time_before(when, t->expires))
+-			mod_timer(t, when);
+-	} else {
+-		t->expires = round_jiffies(when);
+-		add_timer_on(t, smp_processor_id());
+-	}
+-
+-	local_irq_restore(flags);
++	if (!interval)
++		return HRTIMER_NORESTART;
++	hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
++	return HRTIMER_RESTART;
+ }
+ 
+-static void mce_timer_fn(unsigned long data)
++static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
+ {
+-	struct timer_list *t = this_cpu_ptr(&mce_timer);
+-	int cpu = smp_processor_id();
+ 	unsigned long iv;
+ 
+-	WARN_ON(cpu != data);
+-
+ 	iv = __this_cpu_read(mce_next_interval);
+ 
+ 	if (mce_available(this_cpu_ptr(&cpu_info))) {
+@@ -1348,7 +1336,7 @@ static void mce_timer_fn(unsigned long data)
+ 
+ done:
+ 	__this_cpu_write(mce_next_interval, iv);
+-	__restart_timer(t, iv);
++	return __restart_timer(timer, iv);
+ }
+ 
+ /*
+@@ -1356,7 +1344,7 @@ static void mce_timer_fn(unsigned long data)
+  */
+ void mce_timer_kick(unsigned long interval)
+ {
+-	struct timer_list *t = this_cpu_ptr(&mce_timer);
++	struct hrtimer *t = this_cpu_ptr(&mce_timer);
+ 	unsigned long iv = __this_cpu_read(mce_next_interval);
+ 
+ 	__restart_timer(t, interval);
+@@ -1371,7 +1359,7 @@ static void mce_timer_delete_all(void)
+ 	int cpu;
+ 
+ 	for_each_online_cpu(cpu)
+-		del_timer_sync(&per_cpu(mce_timer, cpu));
++		hrtimer_cancel(&per_cpu(mce_timer, cpu));
+ }
+ 
+ static void mce_do_trigger(struct work_struct *work)
+@@ -1381,6 +1369,56 @@ static void mce_do_trigger(struct work_struct *work)
+ 
+ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+ 
++static void __mce_notify_work(struct swork_event *event)
++{
++	/* Not more than two messages every minute */
++	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
++
++	/* wake processes polling /dev/mcelog */
++	wake_up_interruptible(&mce_chrdev_wait);
++
++	/*
++	 * There is no risk of missing notifications because
++	 * work_pending is always cleared before the function is
++	 * executed.
++	 */
++	if (mce_helper[0] && !work_pending(&mce_trigger_work))
++		schedule_work(&mce_trigger_work);
++
++	if (__ratelimit(&ratelimit))
++		pr_info(HW_ERR "Machine check events logged\n");
++}
++
++#ifdef CONFIG_PREEMPT_RT_FULL
++static bool notify_work_ready __read_mostly;
++static struct swork_event notify_work;
++
++static int mce_notify_work_init(void)
++{
++	int err;
++
++	err = swork_get();
++	if (err)
++		return err;
++
++	INIT_SWORK(&notify_work, __mce_notify_work);
++	notify_work_ready = true;
++	return 0;
++}
++
++static void mce_notify_work(void)
++{
++	if (notify_work_ready)
++		swork_queue(&notify_work);
++}
++#else
++static void mce_notify_work(void)
++{
++	__mce_notify_work(NULL);
++}
++static inline int mce_notify_work_init(void) { return 0; }
++#endif
++
+ /*
+  * Notify the user(s) about new machine check events.
+  * Can be called from interrupt context, but not from machine check/NMI
+@@ -1388,19 +1426,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+  */
+ int mce_notify_irq(void)
+ {
+-	/* Not more than two messages every minute */
+-	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+-
+ 	if (test_and_clear_bit(0, &mce_need_notify)) {
+-		/* wake processes polling /dev/mcelog */
+-		wake_up_interruptible(&mce_chrdev_wait);
+-
+-		if (mce_helper[0])
+-			schedule_work(&mce_trigger_work);
+-
+-		if (__ratelimit(&ratelimit))
+-			pr_info(HW_ERR "Machine check events logged\n");
+-
++		mce_notify_work();
+ 		return 1;
+ 	}
+ 	return 0;
+@@ -1717,7 +1744,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+ 	}
+ }
+ 
+-static void mce_start_timer(unsigned int cpu, struct timer_list *t)
++static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
+ {
+ 	unsigned long iv = check_interval * HZ;
+ 
+@@ -1726,16 +1753,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
+ 
+ 	per_cpu(mce_next_interval, cpu) = iv;
+ 
+-	t->expires = round_jiffies(jiffies + iv);
+-	add_timer_on(t, cpu);
++	hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
++			0, HRTIMER_MODE_REL_PINNED);
+ }
+ 
+ static void __mcheck_cpu_init_timer(void)
+ {
+-	struct timer_list *t = this_cpu_ptr(&mce_timer);
++	struct hrtimer *t = this_cpu_ptr(&mce_timer);
+ 	unsigned int cpu = smp_processor_id();
+ 
+-	setup_pinned_timer(t, mce_timer_fn, cpu);
++	hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
++	t->function = mce_timer_fn;
+ 	mce_start_timer(cpu, t);
+ }
+ 
+@@ -2459,6 +2487,8 @@ static void mce_disable_cpu(void *h)
+ 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ 		return;
+ 
++	hrtimer_cancel(this_cpu_ptr(&mce_timer));
++
+ 	if (!(action & CPU_TASKS_FROZEN))
+ 		cmci_clear();
+ 
+@@ -2481,6 +2511,7 @@ static void mce_reenable_cpu(void *h)
+ 		if (b->init)
+ 			wrmsrl(msr_ops.ctl(i), b->ctl);
+ 	}
++	__mcheck_cpu_init_timer();
+ }
+ 
+ /* Get notified when a cpu comes on/off. Be hotplug friendly. */
+@@ -2488,7 +2519,6 @@ static int
+ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+ {
+ 	unsigned int cpu = (unsigned long)hcpu;
+-	struct timer_list *t = &per_cpu(mce_timer, cpu);
+ 
+ 	switch (action & ~CPU_TASKS_FROZEN) {
+ 	case CPU_ONLINE:
+@@ -2508,11 +2538,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+ 		break;
+ 	case CPU_DOWN_PREPARE:
+ 		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+-		del_timer_sync(t);
+ 		break;
+ 	case CPU_DOWN_FAILED:
+ 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+-		mce_start_timer(cpu, t);
+ 		break;
+ 	}
+ 
+@@ -2551,6 +2579,10 @@ static __init int mcheck_init_device(void)
+ 		goto err_out;
+ 	}
+ 
++	err = mce_notify_work_init();
++	if (err)
++		goto err_out;
++
+ 	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ 		err = -ENOMEM;
+ 		goto err_out;
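+
+ The mce.c rework above swaps the per-CPU timer_list (whose callback runs in softirq context) for a per-CPU hrtimer that re-arms itself from its own callback, and moves the possibly-sleeping notification work into an swork item on RT. A condensed sketch of the timer_list-to-hrtimer conversion pattern, with poll_interval_ns and the do_poll() comment standing in for the real interval and work:
+
+     #include <linux/hrtimer.h>
+     #include <linux/ktime.h>
+
+     static struct hrtimer poll_timer;                 /* was: struct timer_list */
+     static u64 poll_interval_ns = 1000 * 1000 * 1000; /* placeholder: 1s        */
+
+     static enum hrtimer_restart poll_timer_fn(struct hrtimer *t)
+     {
+         /* do_poll(); placeholder for the periodic work */
+         hrtimer_forward_now(t, ns_to_ktime(poll_interval_ns));
+         return HRTIMER_RESTART;                       /* re-arm, replaces mod_timer() */
+     }
+
+     static void poll_timer_start(void)
+     {
+         hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+         poll_timer.function = poll_timer_fn;
+         hrtimer_start(&poll_timer, ns_to_ktime(poll_interval_ns), HRTIMER_MODE_REL);
+     }
+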
+diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
+index 09675712eba8..eea7557b355d 100644
+--- a/arch/x86/kernel/dumpstack_32.c
++++ b/arch/x86/kernel/dumpstack_32.c
+@@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ 		unsigned long *stack, unsigned long bp,
+ 		const struct stacktrace_ops *ops, void *data)
+ {
+-	const unsigned cpu = get_cpu();
++	const unsigned cpu = get_cpu_light();
+ 	int graph = 0;
+ 	u32 *prev_esp;
+ 
+@@ -84,7 +84,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ 			break;
+ 		touch_nmi_watchdog();
+ 	}
+-	put_cpu();
++	put_cpu_light();
+ }
+ EXPORT_SYMBOL(dump_trace);
+ 
+diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
+index 9ee4520ce83c..2cd610b68868 100644
+--- a/arch/x86/kernel/dumpstack_64.c
++++ b/arch/x86/kernel/dumpstack_64.c
+@@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ 		unsigned long *stack, unsigned long bp,
+ 		const struct stacktrace_ops *ops, void *data)
+ {
+-	const unsigned cpu = get_cpu();
++	const unsigned cpu = get_cpu_light();
+ 	unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
+ 	unsigned long dummy;
+ 	unsigned used = 0;
+@@ -239,7 +239,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ 	 * This handles the process stack:
+ 	 */
+ 	bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph);
+-	put_cpu();
++	put_cpu_light();
+ }
+ EXPORT_SYMBOL(dump_trace);
+ 
+@@ -253,7 +253,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ 	int cpu;
+ 	int i;
+ 
+-	preempt_disable();
++	migrate_disable();
+ 	cpu = smp_processor_id();
+ 
+ 	irq_stack_end	= (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+@@ -299,7 +299,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ 		stack++;
+ 		touch_nmi_watchdog();
+ 	}
+-	preempt_enable();
++	migrate_enable();
+ 
+ 	pr_cont("\n");
+ 	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
+index 1f38d9a4d9de..053bf3b2ef39 100644
+--- a/arch/x86/kernel/irq_32.c
++++ b/arch/x86/kernel/irq_32.c
+@@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
+ 	       cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
+ }
+ 
++#ifndef CONFIG_PREEMPT_RT_FULL
+ void do_softirq_own_stack(void)
+ {
+ 	struct irq_stack *irqstk;
+@@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
+ 
+ 	call_on_stack(__do_softirq, isp);
+ }
++#endif
+ 
+ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+ {
+diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
+index d86be29c38c7..b0e29d1a0571 100644
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -35,6 +35,7 @@
+ #include <linux/uaccess.h>
+ #include <linux/io.h>
+ #include <linux/kdebug.h>
++#include <linux/highmem.h>
+ 
+ #include <asm/pgtable.h>
+ #include <asm/ldt.h>
+@@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+ }
+ EXPORT_SYMBOL_GPL(start_thread);
+ 
++#ifdef CONFIG_PREEMPT_RT_FULL
++static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
++{
++	int i;
++
++	/*
++	 * Clear @prev's kmap_atomic mappings
++	 */
++	for (i = 0; i < prev_p->kmap_idx; i++) {
++		int idx = i + KM_TYPE_NR * smp_processor_id();
++		pte_t *ptep = kmap_pte - idx;
++
++		kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
++	}
++	/*
++	 * Restore @next_p's kmap_atomic mappings
++	 */
++	for (i = 0; i < next_p->kmap_idx; i++) {
++		int idx = i + KM_TYPE_NR * smp_processor_id();
++
++		if (!pte_none(next_p->kmap_pte[i]))
++			set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
++	}
++}
++#else
++static inline void
++switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
++#endif
++
+ 
+ /*
+  *	switch_to(x,y) should switch tasks from x to y.
+@@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ 		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+ 		__switch_to_xtra(prev_p, next_p, tss);
+ 
++	switch_kmaps(prev_p, next_p);
++
+ 	/*
+ 	 * Leave lazy mode, flushing any hypercalls made here.
+ 	 * This must be done before restoring TLS segments so
+diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
+index b62c85229711..d907b281a9d6 100644
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -1938,6 +1938,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
+ 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ 		     HRTIMER_MODE_ABS_PINNED);
+ 	apic->lapic_timer.timer.function = apic_timer_fn;
++	apic->lapic_timer.timer.irqsafe = 1;
+ 
+ 	/*
+ 	 * APIC is created enabled. This will prevent kvm_lapic_set_base from
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 699f8726539a..24f30c86510c 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -5865,6 +5865,13 @@ int kvm_arch_init(void *opaque)
+ 		goto out;
+ 	}
+ 
++#ifdef CONFIG_PREEMPT_RT_FULL
++	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
++		printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
++		return -EOPNOTSUPP;
++	}
++#endif
++
+ 	r = kvm_mmu_module_init();
+ 	if (r)
+ 		goto out_free_percpu;
+diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
+index 6d18b70ed5a9..f752724c22e8 100644
+--- a/arch/x86/mm/highmem_32.c
++++ b/arch/x86/mm/highmem_32.c
+@@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
+  */
+ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+ {
++	pte_t pte = mk_pte(page, prot);
+ 	unsigned long vaddr;
+ 	int idx, type;
+ 
+-	preempt_disable();
++	preempt_disable_nort();
+ 	pagefault_disable();
+ 
+ 	if (!PageHighMem(page))
+@@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+ 	idx = type + KM_TYPE_NR*smp_processor_id();
+ 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ 	BUG_ON(!pte_none(*(kmap_pte-idx)));
+-	set_pte(kmap_pte-idx, mk_pte(page, prot));
++#ifdef CONFIG_PREEMPT_RT_FULL
++	current->kmap_pte[type] = pte;
++#endif
++	set_pte(kmap_pte-idx, pte);
+ 	arch_flush_lazy_mmu_mode();
+ 
+ 	return (void *)vaddr;
+@@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
+ 		 * is a bad idea also, in case the page changes cacheability
+ 		 * attributes or becomes a protected page in a hypervisor.
+ 		 */
++#ifdef CONFIG_PREEMPT_RT_FULL
++		current->kmap_pte[type] = __pte(0);
++#endif
+ 		kpte_clear_flush(kmap_pte-idx, vaddr);
+ 		kmap_atomic_idx_pop();
+ 		arch_flush_lazy_mmu_mode();
+@@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
+ #endif
+ 
+ 	pagefault_enable();
+-	preempt_enable();
++	preempt_enable_nort();
+ }
+ EXPORT_SYMBOL(__kunmap_atomic);
+ 
+diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
+index ada98b39b8ad..585f6829653b 100644
+--- a/arch/x86/mm/iomap_32.c
++++ b/arch/x86/mm/iomap_32.c
+@@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
+ 
+ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+ {
++	pte_t pte = pfn_pte(pfn, prot);
+ 	unsigned long vaddr;
+ 	int idx, type;
+ 
+@@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+ 	type = kmap_atomic_idx_push();
+ 	idx = type + KM_TYPE_NR * smp_processor_id();
+ 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+-	set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
++	WARN_ON(!pte_none(*(kmap_pte - idx)));
++
++#ifdef CONFIG_PREEMPT_RT_FULL
++	current->kmap_pte[type] = pte;
++#endif
++	set_pte(kmap_pte - idx, pte);
+ 	arch_flush_lazy_mmu_mode();
+ 
+ 	return (void *)vaddr;
+@@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
+ 		 * is a bad idea also, in case the page changes cacheability
+ 		 * attributes or becomes a protected page in a hypervisor.
+ 		 */
++#ifdef CONFIG_PREEMPT_RT_FULL
++		current->kmap_pte[type] = __pte(0);
++#endif
+ 		kpte_clear_flush(kmap_pte-idx, vaddr);
+ 		kmap_atomic_idx_pop();
+ 	}
+diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
+index fdb4d42b4ce5..8ab90fbecff0 100644
+--- a/arch/x86/platform/uv/tlb_uv.c
++++ b/arch/x86/platform/uv/tlb_uv.c
+@@ -729,9 +729,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
+ 
+ 		quiesce_local_uvhub(hmaster);
+ 
+-		spin_lock(&hmaster->queue_lock);
++		raw_spin_lock(&hmaster->queue_lock);
+ 		reset_with_ipi(&bau_desc->distribution, bcp);
+-		spin_unlock(&hmaster->queue_lock);
++		raw_spin_unlock(&hmaster->queue_lock);
+ 
+ 		end_uvhub_quiesce(hmaster);
+ 
+@@ -751,9 +751,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
+ 
+ 		quiesce_local_uvhub(hmaster);
+ 
+-		spin_lock(&hmaster->queue_lock);
++		raw_spin_lock(&hmaster->queue_lock);
+ 		reset_with_ipi(&bau_desc->distribution, bcp);
+-		spin_unlock(&hmaster->queue_lock);
++		raw_spin_unlock(&hmaster->queue_lock);
+ 
+ 		end_uvhub_quiesce(hmaster);
+ 
+@@ -774,7 +774,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
+ 	cycles_t tm1;
+ 
+ 	hmaster = bcp->uvhub_master;
+-	spin_lock(&hmaster->disable_lock);
++	raw_spin_lock(&hmaster->disable_lock);
+ 	if (!bcp->baudisabled) {
+ 		stat->s_bau_disabled++;
+ 		tm1 = get_cycles();
+@@ -787,7 +787,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
+ 			}
+ 		}
+ 	}
+-	spin_unlock(&hmaster->disable_lock);
++	raw_spin_unlock(&hmaster->disable_lock);
+ }
+ 
+ static void count_max_concurr(int stat, struct bau_control *bcp,
+@@ -850,7 +850,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
+  */
+ static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
+ {
+-	spinlock_t *lock = &hmaster->uvhub_lock;
++	raw_spinlock_t *lock = &hmaster->uvhub_lock;
+ 	atomic_t *v;
+ 
+ 	v = &hmaster->active_descriptor_count;
+@@ -983,7 +983,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+ 	struct bau_control *hmaster;
+ 
+ 	hmaster = bcp->uvhub_master;
+-	spin_lock(&hmaster->disable_lock);
++	raw_spin_lock(&hmaster->disable_lock);
+ 	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
+ 		stat->s_bau_reenabled++;
+ 		for_each_present_cpu(tcpu) {
+@@ -995,10 +995,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+ 				tbcp->period_giveups = 0;
+ 			}
+ 		}
+-		spin_unlock(&hmaster->disable_lock);
++		raw_spin_unlock(&hmaster->disable_lock);
+ 		return 0;
+ 	}
+-	spin_unlock(&hmaster->disable_lock);
++	raw_spin_unlock(&hmaster->disable_lock);
+ 	return -1;
+ }
+ 
+@@ -1916,9 +1916,9 @@ static void __init init_per_cpu_tunables(void)
+ 		bcp->cong_reps			= congested_reps;
+ 		bcp->disabled_period =		sec_2_cycles(disabled_period);
+ 		bcp->giveup_limit =		giveup_limit;
+-		spin_lock_init(&bcp->queue_lock);
+-		spin_lock_init(&bcp->uvhub_lock);
+-		spin_lock_init(&bcp->disable_lock);
++		raw_spin_lock_init(&bcp->queue_lock);
++		raw_spin_lock_init(&bcp->uvhub_lock);
++		raw_spin_lock_init(&bcp->disable_lock);
+ 	}
+ }
+ 
+diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
+index b333fc45f9ec..8b85916e6986 100644
+--- a/arch/x86/platform/uv/uv_time.c
++++ b/arch/x86/platform/uv/uv_time.c
+@@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+ 
+ /* There is one of these allocated per node */
+ struct uv_rtc_timer_head {
+-	spinlock_t	lock;
++	raw_spinlock_t	lock;
+ 	/* next cpu waiting for timer, local node relative: */
+ 	int		next_cpu;
+ 	/* number of cpus on this node: */
+@@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
+ 				uv_rtc_deallocate_timers();
+ 				return -ENOMEM;
+ 			}
+-			spin_lock_init(&head->lock);
++			raw_spin_lock_init(&head->lock);
+ 			head->ncpus = uv_blade_nr_possible_cpus(bid);
+ 			head->next_cpu = -1;
+ 			blade_info[bid] = head;
+@@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
+ 	unsigned long flags;
+ 	int next_cpu;
+ 
+-	spin_lock_irqsave(&head->lock, flags);
++	raw_spin_lock_irqsave(&head->lock, flags);
+ 
+ 	next_cpu = head->next_cpu;
+ 	*t = expires;
+@@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
+ 		if (uv_setup_intr(cpu, expires)) {
+ 			*t = ULLONG_MAX;
+ 			uv_rtc_find_next_timer(head, pnode);
+-			spin_unlock_irqrestore(&head->lock, flags);
++			raw_spin_unlock_irqrestore(&head->lock, flags);
+ 			return -ETIME;
+ 		}
+ 	}
+ 
+-	spin_unlock_irqrestore(&head->lock, flags);
++	raw_spin_unlock_irqrestore(&head->lock, flags);
+ 	return 0;
+ }
+ 
+@@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
+ 	unsigned long flags;
+ 	int rc = 0;
+ 
+-	spin_lock_irqsave(&head->lock, flags);
++	raw_spin_lock_irqsave(&head->lock, flags);
+ 
+ 	if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
+ 		rc = 1;
+@@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
+ 			uv_rtc_find_next_timer(head, pnode);
+ 	}
+ 
+-	spin_unlock_irqrestore(&head->lock, flags);
++	raw_spin_unlock_irqrestore(&head->lock, flags);
+ 
+ 	return rc;
+ }
+@@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
+ static cycle_t uv_read_rtc(struct clocksource *cs)
+ {
+ 	unsigned long offset;
++	cycle_t cycles;
+ 
++	preempt_disable();
+ 	if (uv_get_min_hub_revision_id() == 1)
+ 		offset = 0;
+ 	else
+ 		offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
+ 
+-	return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
++	cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
++	preempt_enable();
++
++	return cycles;
+ }
+ 
+ /*
+diff --git a/block/blk-core.c b/block/blk-core.c
+index 36c7ac328d8c..caa5fc1be2a2 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
+ 
+ 	INIT_LIST_HEAD(&rq->queuelist);
+ 	INIT_LIST_HEAD(&rq->timeout_list);
++#ifdef CONFIG_PREEMPT_RT_FULL
++	INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
++#endif
+ 	rq->cpu = -1;
+ 	rq->q = q;
+ 	rq->__sector = (sector_t) -1;
+@@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
+  **/
+ void blk_start_queue(struct request_queue *q)
+ {
+-	WARN_ON(!irqs_disabled());
++	WARN_ON_NONRT(!irqs_disabled());
+ 
+ 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+ 	__blk_run_queue(q);
+@@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
+ 		if (nowait)
+ 			return -EBUSY;
+ 
+-		ret = wait_event_interruptible(q->mq_freeze_wq,
++		ret = swait_event_interruptible(q->mq_freeze_wq,
+ 				!atomic_read(&q->mq_freeze_depth) ||
+ 				blk_queue_dying(q));
+ 		if (blk_queue_dying(q))
+@@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
+ 	struct request_queue *q =
+ 		container_of(ref, struct request_queue, q_usage_counter);
+ 
+-	wake_up_all(&q->mq_freeze_wq);
++	swake_up_all(&q->mq_freeze_wq);
+ }
+ 
+ static void blk_rq_timed_out_timer(unsigned long data)
+@@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+ 	q->bypass_depth = 1;
+ 	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+ 
+-	init_waitqueue_head(&q->mq_freeze_wq);
++	init_swait_queue_head(&q->mq_freeze_wq);
+ 
+ 	/*
+ 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
+@@ -3171,7 +3174,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
+ 		blk_run_queue_async(q);
+ 	else
+ 		__blk_run_queue(q);
+-	spin_unlock(q->queue_lock);
++	spin_unlock_irq(q->queue_lock);
+ }
+ 
+ static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
+@@ -3219,7 +3222,6 @@ EXPORT_SYMBOL(blk_check_plugged);
+ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+ {
+ 	struct request_queue *q;
+-	unsigned long flags;
+ 	struct request *rq;
+ 	LIST_HEAD(list);
+ 	unsigned int depth;
+@@ -3239,11 +3241,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+ 	q = NULL;
+ 	depth = 0;
+ 
+-	/*
+-	 * Save and disable interrupts here, to avoid doing it for every
+-	 * queue lock we have to take.
+-	 */
+-	local_irq_save(flags);
+ 	while (!list_empty(&list)) {
+ 		rq = list_entry_rq(list.next);
+ 		list_del_init(&rq->queuelist);
+@@ -3256,7 +3253,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+ 				queue_unplugged(q, depth, from_schedule);
+ 			q = rq->q;
+ 			depth = 0;
+-			spin_lock(q->queue_lock);
++			spin_lock_irq(q->queue_lock);
+ 		}
+ 
+ 		/*
+@@ -3283,8 +3280,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+ 	 */
+ 	if (q)
+ 		queue_unplugged(q, depth, from_schedule);
+-
+-	local_irq_restore(flags);
+ }
+ 
+ void blk_finish_plug(struct blk_plug *plug)
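+
+ The blk-core.c and blk-mq.c hunks convert mq_freeze_wq from a regular waitqueue to a simple waitqueue (swait): swait wakeups take a raw lock and never run caller-supplied wake functions, which is what makes them acceptable in the contexts PREEMPT_RT cares about. The conversion itself is mechanical, as a minimal sketch with invented names shows:
+
+     #include <linux/atomic.h>
+     #include <linux/swait.h>
+
+     static DECLARE_SWAIT_QUEUE_HEAD(freeze_wq);   /* was: DECLARE_WAIT_QUEUE_HEAD */
+     static atomic_t freeze_depth = ATOMIC_INIT(0);
+
+     static int wait_until_unfrozen(void)
+     {
+         /* was: wait_event_interruptible(freeze_wq, ...) */
+         return swait_event_interruptible(freeze_wq,
+                                          atomic_read(&freeze_depth) == 0);
+     }
+
+     static void unfreeze(void)
+     {
+         if (atomic_dec_and_test(&freeze_depth))
+             swake_up_all(&freeze_wq);             /* was: wake_up_all() */
+     }
+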
+diff --git a/block/blk-ioc.c b/block/blk-ioc.c
+index 381cb50a673c..dc8785233d94 100644
+--- a/block/blk-ioc.c
++++ b/block/blk-ioc.c
+@@ -7,6 +7,7 @@
+ #include <linux/bio.h>
+ #include <linux/blkdev.h>
+ #include <linux/slab.h>
++#include <linux/delay.h>
+ 
+ #include "blk.h"
+ 
+@@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
+ 			spin_unlock(q->queue_lock);
+ 		} else {
+ 			spin_unlock_irqrestore(&ioc->lock, flags);
+-			cpu_relax();
++			cpu_chill();
+ 			spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ 		}
+ 	}
+@@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
+ 			spin_unlock(icq->q->queue_lock);
+ 		} else {
+ 			spin_unlock_irqrestore(&ioc->lock, flags);
+-			cpu_relax();
++			cpu_chill();
+ 			goto retry;
+ 		}
+ 	}
+diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
+index bb3ed488f7b5..628c6c13c482 100644
+--- a/block/blk-mq-cpu.c
++++ b/block/blk-mq-cpu.c
+@@ -16,7 +16,7 @@
+ #include "blk-mq.h"
+ 
+ static LIST_HEAD(blk_mq_cpu_notify_list);
+-static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
++static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
+ 
+ static int blk_mq_main_cpu_notify(struct notifier_block *self,
+ 				  unsigned long action, void *hcpu)
+@@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
+ 	struct blk_mq_cpu_notifier *notify;
+ 	int ret = NOTIFY_OK;
+ 
+-	raw_spin_lock(&blk_mq_cpu_notify_lock);
++	if (action != CPU_POST_DEAD)
++		return NOTIFY_OK;
++
++	spin_lock(&blk_mq_cpu_notify_lock);
+ 
+ 	list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
+ 		ret = notify->notify(notify->data, action, cpu);
+@@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
+ 			break;
+ 	}
+ 
+-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
++	spin_unlock(&blk_mq_cpu_notify_lock);
+ 	return ret;
+ }
+ 
+@@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+ {
+ 	BUG_ON(!notifier->notify);
+ 
+-	raw_spin_lock(&blk_mq_cpu_notify_lock);
++	spin_lock(&blk_mq_cpu_notify_lock);
+ 	list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
+-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
++	spin_unlock(&blk_mq_cpu_notify_lock);
+ }
+ 
+ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
+ {
+-	raw_spin_lock(&blk_mq_cpu_notify_lock);
++	spin_lock(&blk_mq_cpu_notify_lock);
+ 	list_del(&notifier->list);
+-	raw_spin_unlock(&blk_mq_cpu_notify_lock);
++	spin_unlock(&blk_mq_cpu_notify_lock);
+ }
+ 
+ void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
+diff --git a/block/blk-mq.c b/block/blk-mq.c
+index c207fa9870eb..ac71b0455e9f 100644
+--- a/block/blk-mq.c
++++ b/block/blk-mq.c
+@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+ 
+ static void blk_mq_freeze_queue_wait(struct request_queue *q)
+ {
+-	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
++	swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
+ }
+ 
+ /*
+@@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
+ 	WARN_ON_ONCE(freeze_depth < 0);
+ 	if (!freeze_depth) {
+ 		percpu_ref_reinit(&q->q_usage_counter);
+-		wake_up_all(&q->mq_freeze_wq);
++		swake_up_all(&q->mq_freeze_wq);
+ 	}
+ }
+ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+@@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
+ 	 * dying, we need to ensure that processes currently waiting on
+ 	 * the queue are notified as well.
+ 	 */
+-	wake_up_all(&q->mq_freeze_wq);
++	swake_up_all(&q->mq_freeze_wq);
+ }
+ 
+ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
+@@ -197,6 +197,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ 	rq->resid_len = 0;
+ 	rq->sense = NULL;
+ 
++#ifdef CONFIG_PREEMPT_RT_FULL
++	INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
++#endif
+ 	INIT_LIST_HEAD(&rq->timeout_list);
+ 	rq->timeout = 0;
+ 
+@@ -379,6 +382,17 @@ void blk_mq_end_request(struct request *rq, int error)
+ }
+ EXPORT_SYMBOL(blk_mq_end_request);
+ 
++#ifdef CONFIG_PREEMPT_RT_FULL
++
++void __blk_mq_complete_request_remote_work(struct work_struct *work)
++{
++	struct request *rq = container_of(work, struct request, work);
++
++	rq->q->softirq_done_fn(rq);
++}
++
++#else
++
+ static void __blk_mq_complete_request_remote(void *data)
+ {
+ 	struct request *rq = data;
+@@ -386,6 +400,8 @@ static void __blk_mq_complete_request_remote(void *data)
+ 	rq->q->softirq_done_fn(rq);
+ }
+ 
++#endif
++
+ static void blk_mq_ipi_complete_request(struct request *rq)
+ {
+ 	struct blk_mq_ctx *ctx = rq->mq_ctx;
+@@ -397,19 +413,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
+ 		return;
+ 	}
+ 
+-	cpu = get_cpu();
++	cpu = get_cpu_light();
+ 	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
+ 		shared = cpus_share_cache(cpu, ctx->cpu);
+ 
+ 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
++#ifdef CONFIG_PREEMPT_RT_FULL
++		schedule_work_on(ctx->cpu, &rq->work);
++#else
+ 		rq->csd.func = __blk_mq_complete_request_remote;
+ 		rq->csd.info = rq;
+ 		rq->csd.flags = 0;
+ 		smp_call_function_single_async(ctx->cpu, &rq->csd);
++#endif
+ 	} else {
+ 		rq->q->softirq_done_fn(rq);
+ 	}
+-	put_cpu();
++	put_cpu_light();
+ }
+ 
+ static void __blk_mq_complete_request(struct request *rq)
+@@ -938,14 +958,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+ 		return;
+ 
+ 	if (!async) {
+-		int cpu = get_cpu();
++		int cpu = get_cpu_light();
+ 		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+ 			__blk_mq_run_hw_queue(hctx);
+-			put_cpu();
++			put_cpu_light();
+ 			return;
+ 		}
+ 
+-		put_cpu();
++		put_cpu_light();
+ 	}
+ 
+ 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+@@ -1667,7 +1687,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
+ {
+ 	struct blk_mq_hw_ctx *hctx = data;
+ 
+-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
++	if (action == CPU_POST_DEAD)
+ 		return blk_mq_hctx_cpu_offline(hctx, cpu);
+ 
+ 	/*
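
The blk-mq hunks above replace wait_event()/wake_up_all() on mq_freeze_wq with swait_event()/swake_up_all() (the kernel's simple wait queues), route remote completions through schedule_work_on() instead of an IPI when CONFIG_PREEMPT_RT_FULL is set, and swap get_cpu()/put_cpu() for get_cpu_light()/put_cpu_light(), which in the RT tree are intended to keep the section preemptible (reportedly by disabling only migration rather than preemption). As a rough userspace sketch of the freeze/unfreeze wait pattern, with a condition variable standing in for the simple wait queue (function and variable names here are illustrative, not the kernel's):

/*
 * Userspace analogue of the freeze/unfreeze wait above: waiters sleep
 * until an in-flight counter reaches zero, and the side that drops the
 * counter to zero wakes them all.  This shows only the wait/wake-all
 * structure; the kernel's swait_event()/swake_up_all() are simple wait
 * queues, not pthread condition variables.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  freeze_wq = PTHREAD_COND_INITIALIZER;
static int usage_counter = 3;           /* pretend three requests are in flight */

static void freeze_queue_wait(void)
{
	pthread_mutex_lock(&lock);
	while (usage_counter != 0)      /* "swait_event(wq, counter is zero)" */
		pthread_cond_wait(&freeze_wq, &lock);
	pthread_mutex_unlock(&lock);
}

static void request_done(void)
{
	pthread_mutex_lock(&lock);
	if (--usage_counter == 0)       /* last user gone: "swake_up_all(wq)" */
		pthread_cond_broadcast(&freeze_wq);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	request_done();
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	freeze_queue_wait();            /* returns once all three are done */
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	puts("queue frozen");
	return 0;
}

Build with cc -pthread; freeze_queue_wait() returns only after every in-flight request has called request_done(), which is the property blk_mq_freeze_queue_wait() needs from q_usage_counter.
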
+diff --git a/block/blk-mq.h b/block/blk-mq.h
+index 9087b11037b7..0401d76e827c 100644
+--- a/block/blk-mq.h
++++ b/block/blk-mq.h
+@@ -86,12 +86,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
+  */
+ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
+ {
+-	return __blk_mq_get_ctx(q, get_cpu());
++	return __blk_mq_get_ctx(q, get_cpu_light());
+ }
+ 
+ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
+ {
+-	put_cpu();
++	put_cpu_light();
+ }
+ 
+ struct blk_mq_alloc_data {
+diff --git a/block/blk-softirq.c b/block/blk-softirq.c
+index 53b1737e978d..81c3c0a62edf 100644
+--- a/block/blk-softirq.c
++++ b/block/blk-softirq.c
+@@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
+ 		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ 
+ 	local_irq_restore(flags);
++	preempt_check_resched_rt();
+ }
+ 
+ /*
+@@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
+ 				 this_cpu_ptr(&blk_cpu_done));
+ 		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ 		local_irq_enable();
++		preempt_check_resched_rt();
+ 	}
+ 
+ 	return NOTIFY_OK;
+@@ -150,6 +152,7 @@ void __blk_complete_request(struct request *req)
+ 		goto do_local;
+ 
+ 	local_irq_restore(flags);
++	preempt_check_resched_rt();
+ }
+ 
+ /**
+diff --git a/block/bounce.c b/block/bounce.c
+index 1cb5dd3a5da1..2f1ec8a67cbe 100644
+--- a/block/bounce.c
++++ b/block/bounce.c
+@@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+ 	unsigned long flags;
+ 	unsigned char *vto;
+ 
+-	local_irq_save(flags);
++	local_irq_save_nort(flags);
+ 	vto = kmap_atomic(to->bv_page);
+ 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+ 	kunmap_atomic(vto);
+-	local_irq_restore(flags);
++	local_irq_restore_nort(flags);
+ }
+ 
+ #else /* CONFIG_HIGHMEM */
+diff --git a/crypto/algapi.c b/crypto/algapi.c
+index df939b54b09f..efe5e06adcf7 100644
+--- a/crypto/algapi.c
++++ b/crypto/algapi.c
+@@ -718,13 +718,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
+ 
+ int crypto_register_notifier(struct notifier_block *nb)
+ {
+-	return blocking_notifier_chain_register(&crypto_chain, nb);
++	return srcu_notifier_chain_register(&crypto_chain, nb);
+ }
+ EXPORT_SYMBOL_GPL(crypto_register_notifier);
+ 
+ int crypto_unregister_notifier(struct notifier_block *nb)
+ {
+-	return blocking_notifier_chain_unregister(&crypto_chain, nb);
++	return srcu_notifier_chain_unregister(&crypto_chain, nb);
+ }
+ EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
+ 
+diff --git a/crypto/api.c b/crypto/api.c
+index bbc147cb5dec..bc1a848f02ec 100644
+--- a/crypto/api.c
++++ b/crypto/api.c
+@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
+ DECLARE_RWSEM(crypto_alg_sem);
+ EXPORT_SYMBOL_GPL(crypto_alg_sem);
+ 
+-BLOCKING_NOTIFIER_HEAD(crypto_chain);
++SRCU_NOTIFIER_HEAD(crypto_chain);
+ EXPORT_SYMBOL_GPL(crypto_chain);
+ 
+ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
+@@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
+ {
+ 	int ok;
+ 
+-	ok = blocking_notifier_call_chain(&crypto_chain, val, v);
++	ok = srcu_notifier_call_chain(&crypto_chain, val, v);
+ 	if (ok == NOTIFY_DONE) {
+ 		request_module("cryptomgr");
+-		ok = blocking_notifier_call_chain(&crypto_chain, val, v);
++		ok = srcu_notifier_call_chain(&crypto_chain, val, v);
+ 	}
+ 
+ 	return ok;
+diff --git a/crypto/internal.h b/crypto/internal.h
+index 7eefcdb00227..0ecc7f5a2f40 100644
+--- a/crypto/internal.h
++++ b/crypto/internal.h
+@@ -47,7 +47,7 @@ struct crypto_larval {
+ 
+ extern struct list_head crypto_alg_list;
+ extern struct rw_semaphore crypto_alg_sem;
+-extern struct blocking_notifier_head crypto_chain;
++extern struct srcu_notifier_head crypto_chain;
+ 
+ #ifdef CONFIG_PROC_FS
+ void __init crypto_init_proc(void);
+@@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
+ 
+ static inline void crypto_notify(unsigned long val, void *v)
+ {
+-	blocking_notifier_call_chain(&crypto_chain, val, v);
++	srcu_notifier_call_chain(&crypto_chain, val, v);
+ }
+ 
+ #endif	/* _CRYPTO_INTERNAL_H */
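
The crypto hunks convert crypto_chain from a blocking notifier (rwsem-protected) to an SRCU notifier, so walking the chain in crypto_notify()/crypto_probing_notify() no longer contends with registration on the rwsem. The sketch below shows only the generic register-and-walk pattern those call sites rely on, in userspace C with a pthread rwlock standing in for the kernel's chain protection; it does not model SRCU semantics, and every name in it is local to the example.

/*
 * Minimal userspace notifier chain: interested parties register a
 * callback, and callers walk the chain until someone reports that the
 * event was handled.  A rwlock protects the list; the kernel's SRCU
 * variant lets the walk proceed without blocking registration at all.
 */
#include <pthread.h>
#include <stdio.h>

#define NOTIFY_DONE 0
#define NOTIFY_OK   1

struct notifier_block {
	int (*notifier_call)(struct notifier_block *nb,
			     unsigned long val, void *data);
	struct notifier_block *next;
};

static struct notifier_block *chain_head;
static pthread_rwlock_t chain_lock = PTHREAD_RWLOCK_INITIALIZER;

static void chain_register(struct notifier_block *nb)
{
	pthread_rwlock_wrlock(&chain_lock);
	nb->next = chain_head;
	chain_head = nb;
	pthread_rwlock_unlock(&chain_lock);
}

static int chain_call(unsigned long val, void *data)
{
	int ret = NOTIFY_DONE;

	pthread_rwlock_rdlock(&chain_lock);     /* readers do not block each other */
	for (struct notifier_block *nb = chain_head; nb; nb = nb->next) {
		ret = nb->notifier_call(nb, val, data);
		if (ret != NOTIFY_DONE)
			break;
	}
	pthread_rwlock_unlock(&chain_lock);
	return ret;
}

static int print_event(struct notifier_block *nb, unsigned long val, void *data)
{
	(void)nb; (void)data;
	printf("event %lu handled\n", val);
	return NOTIFY_OK;
}

int main(void)
{
	struct notifier_block nb = { .notifier_call = print_event };

	chain_register(&nb);
	return chain_call(42, NULL) == NOTIFY_OK ? 0 : 1;
}

crypto_probing_notify() in the hunk above adds one refinement on top of this pattern: if nobody handled the event (NOTIFY_DONE), it loads cryptomgr and walks the chain a second time.
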
+diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
+index fded776236e2..bda523219d50 100644
+--- a/drivers/acpi/acpica/acglobal.h
++++ b/drivers/acpi/acpica/acglobal.h
+@@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
+  * interrupt level
+  */
+ ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock);	/* For GPE data structs and registers */
+-ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);	/* For ACPI H/W except GPE registers */
++ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);	/* For ACPI H/W except GPE registers */
+ ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
+ 
+ /* Mutex for _OSI support */
+diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
+index 3b7fb99362b6..696bf8e62afb 100644
+--- a/drivers/acpi/acpica/hwregs.c
++++ b/drivers/acpi/acpica/hwregs.c
+@@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
+ 			  ACPI_BITMASK_ALL_FIXED_STATUS,
+ 			  ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
+ 
+-	lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
++	raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
+ 
+ 	/* Clear the fixed events in PM1 A/B */
+ 
+ 	status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
+ 					ACPI_BITMASK_ALL_FIXED_STATUS);
+ 
+-	acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
++	raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
+ 
+ 	if (ACPI_FAILURE(status)) {
+ 		goto exit;
+diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
+index 98c26ff39409..6e236f2ea791 100644
+--- a/drivers/acpi/acpica/hwxface.c
++++ b/drivers/acpi/acpica/hwxface.c
+@@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
+ 		return_ACPI_STATUS(AE_BAD_PARAMETER);
+ 	}
+ 
+-	lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
++	raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
+ 
+ 	/*
+ 	 * At this point, we know that the parent register is one of the
+@@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
+ 
+ unlock_and_exit:
+ 
+-	acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
++	raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
+ 	return_ACPI_STATUS(status);
+ }
+ 
+diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
+index 15073375bd00..357e7ca5a587 100644
+--- a/drivers/acpi/acpica/utmutex.c
++++ b/drivers/acpi/acpica/utmutex.c
+@@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
+ 		return_ACPI_STATUS (status);
+ 	}
+ 
+-	status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
++	status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
+ 	if (ACPI_FAILURE (status)) {
+ 		return_ACPI_STATUS (status);
+ 	}
+@@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
+ 	/* Delete the spinlocks */
+ 
+ 	acpi_os_delete_lock(acpi_gbl_gpe_lock);
+-	acpi_os_delete_lock(acpi_gbl_hardware_lock);
++	acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
+ 	acpi_os_delete_lock(acpi_gbl_reference_count_lock);
+ 
+ 	/* Delete the reader/writer lock */
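
The ACPI hunks change acpi_gbl_hardware_lock from acpi_spinlock to acpi_raw_spinlock, created with acpi_os_create_raw_lock() and taken with raw_spin_lock_irqsave(): under PREEMPT_RT an ordinary spinlock_t becomes a sleeping lock, so a lock needed in sections that must never sleep has to stay a raw spinlock. (The blk-mq-cpu.c hunk near the top of this excerpt goes the other way, relaxing a raw lock to a normal spin_lock where sleeping is acceptable.) A userspace analogue of that split, assuming Linux/glibc since pthread spinlocks are optional in POSIX, and with made-up names throughout:

/*
 * A busy-waiting spinlock guards only a tiny register-poke-style
 * section, while the mutex-protected path is allowed to block --
 * roughly the division PREEMPT_RT enforces between raw_spinlock_t
 * and spinlock_t.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_spinlock_t hw_lock;            /* "raw": never sleep under it */
static pthread_mutex_t    table_lock = PTHREAD_MUTEX_INITIALIZER;

static uint32_t fake_pm1_status;
static int      table_entries;

static void clear_status_bits(uint32_t mask)
{
	pthread_spin_lock(&hw_lock);
	fake_pm1_status &= ~mask;             /* short, bounded work only */
	pthread_spin_unlock(&hw_lock);
}

static void update_table(void)
{
	pthread_mutex_lock(&table_lock);
	/* arbitrary amounts of work, even blocking, are fine here */
	table_entries++;
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	pthread_spin_init(&hw_lock, PTHREAD_PROCESS_PRIVATE);
	fake_pm1_status = 0xffffffffu;
	clear_status_bits(0x0000ffffu);
	update_table();
	printf("status=%#x entries=%d\n", (unsigned)fake_pm1_status, table_entries);
	pthread_spin_destroy(&hw_lock);
	return 0;
}
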
+diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
+index 051b6158d1b7..7ad293bef6ed 100644
+--- a/drivers/ata/libata-sff.c
++++ b/drivers/ata/libata-sff.c
+@@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
+ 	unsigned long flags;
+ 	unsigned int consumed;
+ 
+-	local_irq_save(flags);
++	local_irq_save_nort(flags);
+ 	consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
+-	local_irq_restore(flags);
++	local_irq_restore_nort(flags);
+ 
+ 	return consumed;
+ }
+@@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
+ 		unsigned long flags;
+ 
+ 		/* FIXME: use a bounce buffer */
+-		local_irq_save(flags);
++		local_irq_save_nort(flags);
+ 		buf = kmap_atomic(page);
+ 
+ 		/* do the actual data transfer */
+@@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
+ 				       do_write);
+ 
+ 		kunmap_atomic(buf);
+-		local_irq_restore(flags);
++		local_irq_restore_nort(flags);
+ 	} else {
+ 		buf = page_address(page);
+ 		ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
+@@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
+ 		unsigned long flags;
+ 
+ 		/* FIXME: use bounce buffer */
+-		local_irq_save(flags);
++		local_irq_save_nort(flags);
+ 		buf = kmap_atomic(page);
+ 
+ 		/* do the actual data transfer */
+@@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
+ 								count, rw);
+ 
+ 		kunmap_atomic(buf);
+-		local_irq_restore(flags);
++		local_irq_restore_nort(flags);
+ 	} else {
+ 		buf = page_address(page);
+ 		consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
+diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
+index 4b5cd3a7b2b6..fa8329ad79fd 100644
+--- a/drivers/block/zram/zcomp.c
++++ b/drivers/block/zram/zcomp.c
+@@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
+ 
+ struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
+ {
+-	return *get_cpu_ptr(comp->stream);
++	struct zcomp_strm *zstrm;
++
++	zstrm = *this_cpu_ptr(comp->stream);
++	spin_lock(&zstrm->zcomp_lock);
++	return zstrm;
+ }
+ 
+ void zcomp_stream_put(struct zcomp *comp)
+ {
+-	put_cpu_ptr(comp->stream);
++	struct zcomp_strm *zstrm;
++
++	zstrm = *this_cpu_ptr(comp->stream);
++	spin_unlock(&zstrm->zcomp_lock);
+ }
+ 
+ int zcomp_compress(struct zcomp_strm *zstrm,
+@@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
+ 			pr_err("Can't allocate a compression stream\n");
+ 			return NOTIFY_BAD;
+ 		}
++		spin_lock_init(&zstrm->zcomp_lock);
+ 		*per_cpu_ptr(comp->stream, cpu) = zstrm;
+ 		break;
+ 	case CPU_DEAD:
+diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
+index 478cac2ed465..f7a6efdc3285 100644
+--- a/drivers/block/zram/zcomp.h
++++ b/drivers/block/zram/zcomp.h
+@@ -14,6 +14,7 @@ struct zcomp_strm {
+ 	/* compression/decompression buffer */
+ 	void *buffer;
+ 	struct crypto_comp *tfm;
++	spinlock_t zcomp_lock;
+ };
+ 
+ /* dynamic per-device compression frontend */
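
The zcomp hunks stop pinning the compression stream with get_cpu_ptr()/put_cpu_ptr(), which disable preemption for the whole compress/decompress, and instead take the new per-stream zcomp_lock spinlock around this_cpu_ptr(), initialising the lock in __zcomp_cpu_notifier(). A userspace sketch of the same idea follows: a small pool of scratch buffers, each guarded by a mutex, so a worker can be preempted or migrated mid-operation without corrupting the buffer. Pool size, names, and the slot-selection rule are all assumptions of the sketch, not zram's.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_STREAMS 4
#define BUF_SIZE   4096

struct stream {
	pthread_mutex_t lock;
	char buffer[BUF_SIZE];
};

static struct stream streams[NR_STREAMS];

static struct stream *stream_get(size_t hint)
{
	struct stream *s = &streams[hint % NR_STREAMS];

	pthread_mutex_lock(&s->lock);   /* serialize use of this scratch buffer */
	return s;
}

static void stream_put(struct stream *s)
{
	pthread_mutex_unlock(&s->lock);
}

static void *compress_page(void *arg)
{
	size_t id = (size_t)(uintptr_t)arg;
	struct stream *s = stream_get(id);

	memset(s->buffer, (int)(id & 0xff), BUF_SIZE);  /* stand-in for real work */
	stream_put(s);
	return NULL;
}

int main(void)
{
	pthread_t t[8];

	for (size_t i = 0; i < NR_STREAMS; i++)
		pthread_mutex_init(&streams[i].lock, NULL);
	for (size_t i = 0; i < 8; i++)
		pthread_create(&t[i], NULL, compress_page, (void *)(uintptr_t)i);
	for (size_t i = 0; i < 8; i++)
		pthread_join(t[i], NULL);
	puts("done");
	return 0;
}

The trade-off is the same as in the patch: the critical section stays preemptible, at the cost of real lock contention when several users hash to the same stream.
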
+diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
+index 04365b17ee67..b4a0577a4dbc 100644
+--- a/drivers/block/zram/zram_drv.c
++++ b/drivers/block/zram/zram_drv.c
+@@ -519,6 +519,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+ 		goto out_error;
+ 	}
+ 
++	zram_meta_init_table_locks(meta, disksize);
++
+ 	return meta;
+ 
+ out_error:
+@@ -566,28 +568,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+ 	struct zram_meta *meta = zram->meta;
+ 	unsigned long handle;
+ 	unsigned int size;
++	struct zcomp_strm *zstrm;
+ 
+-	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_lock_table(&meta->table[index]);
+ 	handle = meta->table[index].handle;
+ 	size = zram_get_obj_size(meta, index);
+ 
+ 	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+-		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_unlock_table(&meta->table[index]);
+ 		clear_page(mem);
+ 		return 0;
+ 	}
+ 
++	zstrm = zcomp_stream_get(zram->comp);
+ 	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ 	if (size == PAGE_SIZE) {
+ 		copy_page(mem, cmem);
+ 	} else {
+-		struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
+-
+ 		ret = zcomp_decompress(zstrm, cmem, size, mem);
+-		zcomp_stream_put(zram->comp);
+ 	}
+ 	zs_unmap_object(meta->mem_pool, handle);
+-	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++	zcomp_stream_put(zram->comp);
++	zram_unlock_table(&meta->table[index]);
+ 
+ 	/* Should NEVER happen. Return bio error if it does. */
+ 	if (unlikely(ret)) {
+@@ -607,14 +609,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+ 	struct zram_meta *meta = zram->meta;
+ 	page = bvec->bv_page;
+ 
+-	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_lock_table(&meta->table[index]);
+ 	if (unlikely(!meta->table[index].handle) ||
+ 			zram_test_flag(meta, index, ZRAM_ZERO)) {
+-		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_unlock_table(&meta->table[index]);
+ 		handle_zero_page(bvec);
+ 		return 0;
+ 	}
+-	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_unlock_table(&meta->table[index]);
+ 
+ 	if (is_partial_io(bvec))
+ 		/* Use  a temporary buffer to decompress the page */
+@@ -691,10 +693,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
+ 		if (user_mem)
+ 			kunmap_atomic(user_mem);
+ 		/* Free memory associated with this sector now. */
+-		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_lock_table(&meta->table[index]);
+ 		zram_free_page(zram, index);
+ 		zram_set_flag(meta, index, ZRAM_ZERO);
+-		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_unlock_table(&meta->table[index]);
+ 
+ 		atomic64_inc(&zram->stats.zero_pages);
+ 		ret = 0;
+@@ -785,12 +787,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
+ 	 * Free memory associated with this sector
+ 	 * before overwriting unused sectors.
+ 	 */
+-	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_lock_table(&meta->table[index]);
+ 	zram_free_page(zram, index);
+ 
+ 	meta->table[index].handle = handle;
+ 	zram_set_obj_size(meta, index, clen);
+-	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_unlock_table(&meta->table[index]);
+ 
+ 	/* Update stats */
+ 	atomic64_add(clen, &zram->stats.compr_data_size);
+@@ -833,9 +835,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
+ 	}
+ 
+ 	while (n >= PAGE_SIZE) {
+-		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_lock_table(&meta->table[index]);
+ 		zram_free_page(zram, index);
+-		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++		zram_unlock_table(&meta->table[index]);
+ 		atomic64_inc(&zram->stats.notify_free);
+ 		index++;
+ 		n -= PAGE_SIZE;
+@@ -964,9 +966,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
+ 	zram = bdev->bd_disk->private_data;
+ 	meta = zram->meta;
+ 
+-	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_lock_table(&meta->table[index]);
+ 	zram_free_page(zram, index);
+-	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
++	zram_unlock_table(&meta->table[index]);
+ 	atomic64_inc(&zram->stats.notify_free);
+ }
+ 
+diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
+index 74fcf10da374..fd4020c99b9e 100644
+--- a/drivers/block/zram/zram_drv.h
++++ b/drivers/block/zram/zram_drv.h
+@@ -73,6 +73,9 @@ enum zram_pageflags {
+ struct zram_table_entry {
+ 	unsigned long handle;
+ 	unsigned long value;
++#ifdef CONFIG_PREEMPT_RT_BASE
++	spinlock_t lock;
++#endif
+ };
+ 
+ struct zram_stats {
+@@ -120,4 +123,42 @@ struct zram {
+ 	 */
+ 	bool claim; /* Protected by bdev->bd_mutex */
+ };
++
++#ifndef CONFIG_PREEMPT_RT_BASE
++static inline void zram_lock_table(struct zram_table_entry *table)
++{
++	bit_spin_lock(ZRAM_ACCESS, &table->value);
++}
++
++static inline void zram_unlock_table(struct zram_table_entry *table)
++{
++	bit_spin_unlock(ZRAM_ACCESS, &table->value);
++}
++
++static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
++#else /* CONFIG_PREEMPT_RT_BASE */
++static inline void zram_lock_table(struct zram_table_entry *table)
++{
++	spin_lock(&table->lock);
++	__set_bit(ZRAM_ACCESS, &table->value);
++}
++
++static inline void zram_unlock_table(struct zram_table_entry *table)
++{
++	__clear_bit(ZRAM_ACCESS, &table->value);
++	spin_unlock(&table->lock);
++}
++
++static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
++{
++        size_t num_pages = disksize >> PAGE_SHIFT;
++        size_t index;
++
++        for (index = 0; index < num_pages; index++) {
++		spinlock_t *lock = &meta->table[index].lock;
++		spin_lock_init(lock);
++        }
++}
++#endif /* CONFIG_PREEMPT_RT_BASE */
++
+ #endif
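
Under CONFIG_PREEMPT_RT_BASE the zram table entries above gain a real spinlock_t, and zram_lock_table()/zram_unlock_table() replace the bit_spin_lock() on ZRAM_ACCESS used throughout zram_drv.c; the non-RT build keeps the bit spinlock so each table entry stays a single word. A userspace mirror of the helper shape, with a mutex standing in for spinlock_t and names of my own choosing:

/*
 * Per-entry lock helpers in the shape of zram_lock_table() /
 * zram_unlock_table() above.  The ACCESS bit is still set and cleared
 * under the lock, mirroring what the RT helpers do with ZRAM_ACCESS;
 * the mutex is what actually serializes access to the entry.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRY_ACCESS_BIT (1UL << 0)

struct table_entry {
	unsigned long   handle;
	unsigned long   value;          /* flags word, as in zram_table_entry */
	pthread_mutex_t lock;
};

static void entry_lock(struct table_entry *e)
{
	pthread_mutex_lock(&e->lock);
	e->value |= ENTRY_ACCESS_BIT;
}

static void entry_unlock(struct table_entry *e)
{
	e->value &= ~ENTRY_ACCESS_BIT;
	pthread_mutex_unlock(&e->lock);
}

static struct table_entry *table_alloc(size_t num_pages)
{
	struct table_entry *t = calloc(num_pages, sizeof(*t));

	if (!t)
		return NULL;
	for (size_t i = 0; i < num_pages; i++)  /* cf. zram_meta_init_table_locks() */
		pthread_mutex_init(&t[i].lock, NULL);
	return t;
}

int main(void)
{
	struct table_entry *table = table_alloc(16);

	if (!table)
		return 1;
	entry_lock(&table[3]);
	table[3].handle = 0xdeadbeef;           /* update the slot under its lock */
	entry_unlock(&table[3]);
	printf("entry 3 handle=%#lx\n", table[3].handle);
	free(table);
	return 0;
}

The design choice matches the patch: a sleeping per-entry lock keeps the holder preemptible on RT, at the price of extra memory per entry, which is why the bit spinlock is retained for non-RT builds.
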
+diff --git a/drivers/char/random.c b/drivers/char/random.c
+index 3efb3bf0ab83..c894d2e266f3 100644
+--- a/drivers/char/random.c
++++ b/drivers/char/random.c
+@@ -1028,8 +1028,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
+ 	} sample;
+ 	long delta, delta2, delta3;
+ 
+-	preempt_disable();
+-
+ 	sample.jiffies = jiffies;
+ 	sample.cycles = random_get_entropy();
+ 	sample.num = num;
+@@ -1070,7 +1068,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
+ 		 */
+ 		credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
+ 	}
+-	preempt_enable();
+ }
+ 
+ void add_input_randomness(unsigned int type, unsigned int code,
+@@ -1123,28 +1120,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
+ 	return *(ptr + f->reg_idx++);
+ }
+ 
+-void add_interrupt_randomness(int irq, int irq_flags)
++void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
+ {
+ 	struct entropy_store	*r;
+ 	struct fast_pool	*fast_pool = this_cpu_ptr(&irq_randomness);
<Skipped 22090 lines>
================================================================

---- gitweb:

http://git.pld-linux.org/gitweb.cgi/packages/kernel.git/commitdiff/1a6e0f06202fd47f1c81421d3927476047c3477a



More information about the pld-cvs-commit mailing list