Merge branch 'linux-mainline' into android-mainline-tmp

Change-Id: I4380c68c3474026a42ffa9f95c525f9a563ba7a3
2019-05-03 12:22:22 -07:00 · 2019-05-03 12:22:22 -07:00 · 0f2cb7cf80
commit 0f2cb7cf80
parent 03bd957ad6 37624b5854
24258 changed files with 1697131 additions and 826686 deletions
--- a/.clang-format
+++ b/.clang-format
@ -72,8 +72,14 @@ ForEachMacros:
  - 'apei_estatus_for_each_section'
  - 'ata_for_each_dev'
  - 'ata_for_each_link'
+  - '__ata_qc_for_each'
+  - 'ata_qc_for_each'
+  - 'ata_qc_for_each_raw'
+  - 'ata_qc_for_each_with_internal'
  - 'ax25_for_each'
  - 'ax25_uid_for_each'
+  - '__bio_for_each_bvec'
+  - 'bio_for_each_bvec'
  - 'bio_for_each_integrity_vec'
  - '__bio_for_each_segment'
  - 'bio_for_each_segment'
@ -85,6 +91,7 @@ ForEachMacros:
  - 'blk_queue_for_each_rl'
  - 'bond_for_each_slave'
  - 'bond_for_each_slave_rcu'
+  - 'bpf_for_each_spilled_reg'
  - 'btree_for_each_safe128'
  - 'btree_for_each_safe32'
  - 'btree_for_each_safe64'
@ -103,6 +110,8 @@ ForEachMacros:
  - 'drm_atomic_crtc_for_each_plane'
  - 'drm_atomic_crtc_state_for_each_plane'
  - 'drm_atomic_crtc_state_for_each_plane_state'
+  - 'drm_atomic_for_each_plane_damage'
+  - 'drm_connector_for_each_possible_encoder'
  - 'drm_for_each_connector_iter'
  - 'drm_for_each_crtc'
  - 'drm_for_each_encoder'
@ -111,21 +120,33 @@ ForEachMacros:
  - 'drm_for_each_legacy_plane'
  - 'drm_for_each_plane'
  - 'drm_for_each_plane_mask'
+  - 'drm_for_each_privobj'
  - 'drm_mm_for_each_hole'
  - 'drm_mm_for_each_node'
  - 'drm_mm_for_each_node_in_range'
  - 'drm_mm_for_each_node_safe'
+  - 'flow_action_for_each'
  - 'for_each_active_drhd_unit'
  - 'for_each_active_iommu'
  - 'for_each_available_child_of_node'
  - 'for_each_bio'
  - 'for_each_board_func_rsrc'
  - 'for_each_bvec'
+  - 'for_each_card_components'
+  - 'for_each_card_links'
+  - 'for_each_card_links_safe'
+  - 'for_each_card_prelinks'
+  - 'for_each_card_rtds'
+  - 'for_each_card_rtds_safe'
+  - 'for_each_cgroup_storage_type'
  - 'for_each_child_of_node'
  - 'for_each_clear_bit'
  - 'for_each_clear_bit_from'
  - 'for_each_cmsghdr'
  - 'for_each_compatible_node'
+  - 'for_each_component_dais'
+  - 'for_each_component_dais_safe'
+  - 'for_each_comp_order'
  - 'for_each_console'
  - 'for_each_cpu'
  - 'for_each_cpu_and'
@ -133,10 +154,17 @@ ForEachMacros:
  - 'for_each_cpu_wrap'
  - 'for_each_dev_addr'
  - 'for_each_dma_cap_mask'
+  - 'for_each_dpcm_be'
+  - 'for_each_dpcm_be_rollback'
+  - 'for_each_dpcm_be_safe'
+  - 'for_each_dpcm_fe'
  - 'for_each_drhd_unit'
  - 'for_each_dss_dev'
  - 'for_each_efi_memory_desc'
  - 'for_each_efi_memory_desc_in_map'
+  - 'for_each_element'
+  - 'for_each_element_extid'
+  - 'for_each_element_id'
  - 'for_each_endpoint_of_node'
  - 'for_each_evictable_lru'
  - 'for_each_fib6_node_rt_rcu'
@ -149,6 +177,7 @@ ForEachMacros:
  - 'for_each_iommu'
  - 'for_each_ip_tunnel_rcu'
  - 'for_each_irq_nr'
+  - 'for_each_link_codecs'
  - 'for_each_lru'
  - 'for_each_matching_node'
  - 'for_each_matching_node_and_match'
@ -160,6 +189,7 @@ ForEachMacros:
  - 'for_each_mem_range_rev'
  - 'for_each_migratetype_order'
  - 'for_each_msi_entry'
+  - 'for_each_msi_entry_safe'
  - 'for_each_net'
  - 'for_each_netdev'
  - 'for_each_netdev_continue'
@ -172,6 +202,7 @@ ForEachMacros:
  - 'for_each_net_rcu'
  - 'for_each_new_connector_in_state'
  - 'for_each_new_crtc_in_state'
+  - 'for_each_new_mst_mgr_in_state'
  - 'for_each_new_plane_in_state'
  - 'for_each_new_private_obj_in_state'
  - 'for_each_node'
@ -183,12 +214,16 @@ ForEachMacros:
  - 'for_each_node_with_property'
  - 'for_each_of_allnodes'
  - 'for_each_of_allnodes_from'
+  - 'for_each_of_cpu_node'
  - 'for_each_of_pci_range'
  - 'for_each_old_connector_in_state'
  - 'for_each_old_crtc_in_state'
+  - 'for_each_old_mst_mgr_in_state'
  - 'for_each_oldnew_connector_in_state'
  - 'for_each_oldnew_crtc_in_state'
+  - 'for_each_oldnew_mst_mgr_in_state'
  - 'for_each_oldnew_plane_in_state'
+  - 'for_each_oldnew_plane_in_state_reverse'
  - 'for_each_oldnew_private_obj_in_state'
  - 'for_each_old_plane_in_state'
  - 'for_each_old_private_obj_in_state'
@ -206,14 +241,21 @@ ForEachMacros:
  - 'for_each_process'
  - 'for_each_process_thread'
  - 'for_each_property_of_node'
+  - 'for_each_registered_fb'
  - 'for_each_reserved_mem_region'
-  - 'for_each_resv_unavail_range'
+  - 'for_each_rtd_codec_dai'
+  - 'for_each_rtd_codec_dai_rollback'
  - 'for_each_rtdcom'
  - 'for_each_rtdcom_safe'
  - 'for_each_set_bit'
  - 'for_each_set_bit_from'
  - 'for_each_sg'
+  - 'for_each_sg_dma_page'
  - 'for_each_sg_page'
+  - 'for_each_sibling_event'
+  - 'for_each_subelement'
+  - 'for_each_subelement_extid'
+  - 'for_each_subelement_id'
  - '__for_each_thread'
  - 'for_each_thread'
  - 'for_each_zone'
@ -223,6 +265,8 @@ ForEachMacros:
  - 'fwnode_for_each_child_node'
  - 'fwnode_graph_for_each_endpoint'
  - 'gadget_for_each_ep'
+  - 'genradix_for_each'
+  - 'genradix_for_each_from'
  - 'hash_for_each'
  - 'hash_for_each_possible'
  - 'hash_for_each_possible_rcu'
@ -251,6 +295,8 @@ ForEachMacros:
  - 'hlist_nulls_for_each_entry_from'
  - 'hlist_nulls_for_each_entry_rcu'
  - 'hlist_nulls_for_each_entry_safe'
+  - 'i3c_bus_for_each_i2cdev'
+  - 'i3c_bus_for_each_i3cdev'
  - 'ide_host_for_each_port'
  - 'ide_port_for_each_dev'
  - 'ide_port_for_each_present_dev'
@ -259,19 +305,25 @@ ForEachMacros:
  - 'idr_for_each_entry_ul'
  - 'inet_bind_bucket_for_each'
  - 'inet_lhash2_for_each_icsk_rcu'
-  - 'iov_for_each'
  - 'key_for_each'
  - 'key_for_each_safe'
  - 'klp_for_each_func'
+  - 'klp_for_each_func_safe'
+  - 'klp_for_each_func_static'
  - 'klp_for_each_object'
+  - 'klp_for_each_object_safe'
+  - 'klp_for_each_object_static'
  - 'kvm_for_each_memslot'
  - 'kvm_for_each_vcpu'
  - 'list_for_each'
+  - 'list_for_each_codec'
+  - 'list_for_each_codec_safe'
  - 'list_for_each_entry'
  - 'list_for_each_entry_continue'
  - 'list_for_each_entry_continue_rcu'
  - 'list_for_each_entry_continue_reverse'
  - 'list_for_each_entry_from'
+  - 'list_for_each_entry_from_rcu'
  - 'list_for_each_entry_from_reverse'
  - 'list_for_each_entry_lockless'
  - 'list_for_each_entry_rcu'
@ -291,6 +343,9 @@ ForEachMacros:
  - 'media_device_for_each_intf'
  - 'media_device_for_each_link'
  - 'media_device_for_each_pad'
+  - 'mp_bvec_for_each_page'
+  - 'mp_bvec_for_each_segment'
+  - 'nanddev_io_for_each_page'
  - 'netdev_for_each_lower_dev'
  - 'netdev_for_each_lower_private'
  - 'netdev_for_each_lower_private_rcu'
@ -323,10 +378,10 @@ ForEachMacros:
  - 'protocol_for_each_card'
  - 'protocol_for_each_dev'
  - 'queue_for_each_hw_ctx'
-  - 'radix_tree_for_each_contig'
  - 'radix_tree_for_each_slot'
  - 'radix_tree_for_each_tagged'
  - 'rbtree_postorder_for_each_entry_safe'
+  - 'rdma_for_each_port'
  - 'resource_list_for_each_entry'
  - 'resource_list_for_each_entry_safe'
  - 'rhl_for_each_entry_rcu'
@ -341,6 +396,7 @@ ForEachMacros:
  - 'rht_for_each_rcu'
  - 'rht_for_each_rcu_continue'
  - '__rq_for_each_bio'
+  - 'rq_for_each_bvec'
  - 'rq_for_each_segment'
  - 'scsi_for_each_prot_sg'
  - 'scsi_for_each_sg'
@ -358,12 +414,14 @@ ForEachMacros:
  - 'sk_nulls_for_each'
  - 'sk_nulls_for_each_from'
  - 'sk_nulls_for_each_rcu'
+  - 'snd_array_for_each'
  - 'snd_pcm_group_for_each_entry'
  - 'snd_soc_dapm_widget_for_each_path'
  - 'snd_soc_dapm_widget_for_each_path_safe'
  - 'snd_soc_dapm_widget_for_each_sink_path'
  - 'snd_soc_dapm_widget_for_each_source_path'
  - 'tb_property_for_each'
+  - 'tcf_exts_for_each_action'
  - 'udp_portaddr_for_each_entry'
  - 'udp_portaddr_for_each_entry_rcu'
  - 'usb_hub_for_each_child'
@ -372,6 +430,13 @@ ForEachMacros:
  - 'v4l2_m2m_for_each_dst_buf_safe'
  - 'v4l2_m2m_for_each_src_buf'
  - 'v4l2_m2m_for_each_src_buf_safe'
+  - 'virtio_device_for_each_vq'
+  - 'xa_for_each'
+  - 'xa_for_each_marked'
+  - 'xa_for_each_start'
+  - 'xas_for_each'
+  - 'xas_for_each_conflict'
+  - 'xas_for_each_marked'
  - 'zorro_for_each_dev'

 #IncludeBlocks: Preserve # Unknown to clang-format-5.0
--- a/.gitignore
+++ b/.gitignore
@ -15,6 +15,7 @@
 *.bin
 *.bz2
 *.c.[012]*.*
+*.dt.yaml
 *.dtb
 *.dtb.S
 *.dwo
--- a/.mailmap
+++ b/.mailmap
@ -36,9 +36,10 @@ Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com>
 Ben Gardner <bgardner@wabtec.com>
 Ben M Cahill <ben.m.cahill@intel.com>
 Björn Steinbrink <B.Steinbrink@gmx.de>
-Boris Brezillon <boris.brezillon@bootlin.com> <boris.brezillon@free-electrons.com>
-Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon.dev@gmail.com>
-Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon@overkiz.com>
+Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@bootlin.com>
+Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@free-electrons.com>
+Boris Brezillon <bbrezillon@kernel.org> <b.brezillon.dev@gmail.com>
+Boris Brezillon <bbrezillon@kernel.org> <b.brezillon@overkiz.com>
 Brian Avery <b.avery@hp.com>
 Brian King <brking@us.ibm.com>
 Christoph Hellwig <hch@lst.de>
@ -47,7 +48,10 @@ Corey Minyard <minyard@acm.org>
 Damian Hobson-Garcia <dhobsong@igel.co.jp>
 David Brownell <david-b@pacbell.net>
 David Woodhouse <dwmw2@shinybook.infradead.org>
-Deng-Cheng Zhu <dengcheng.zhu@mips.com> <dengcheng.zhu@imgtec.com>
+Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
+Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
+Dengcheng Zhu <dzhu@wavecomp.com> <dczhu@mips.com>
+Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
 Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 Domen Puncer <domen@coderock.org>
 Douglas Gilbert <dougg@torque.net>
@ -119,6 +123,14 @@ Mark Brown <broonie@sirena.org.uk>
 Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
 Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
+Mathieu Othacehe <m.othacehe@gmail.com>
+Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
+Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
+Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
+Matthew Wilcox <willy@infradead.org> <mawilcox@microsoft.com>
+Matthew Wilcox <willy@infradead.org> <willy@debian.org>
+Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
+Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
 Matthieu CASTET <castet.matthieu@free.fr>
 Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
 Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
@ -144,6 +156,8 @@ Morten Welinder <welinder@darter.rentec.com>
 Morten Welinder <welinder@troll.com>
 Mythri P K <mythripk@ti.com>
 Nguyen Anh Quynh <aquynh@gmail.com>
+Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org>
+Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org>
 Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
 Patrick Mochel <mochel@digitalimplant.org>
 Paul Burton <paul.burton@mips.com> <paul.burton@imgtec.com>
@ -152,7 +166,13 @@ Peter Oruba <peter@oruba.de>
 Peter Oruba <peter.oruba@amd.com>
 Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com>
 Praveen BP <praveenbp@ti.com>
+Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
 Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
+Oleksij Rempel <linux@rempel-privat.de> <bug-track@fisher-privat.net>
+Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
+Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
+Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
+Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
 Rajesh Shah <rajesh.shah@intel.com>
 Ralf Baechle <ralf@linux-mips.org>
 Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
@ -206,3 +226,5 @@ Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
 Gustavo Padovan <padovan@profusion.mobi>
+Changbin Du <changbin.du@intel.com> <changbin.du@intel.com>
+Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>
--- a/44
+++ b/44
@ -842,10 +842,9 @@ D: ax25-utils maintainer.

 N: Helge Deller
 E: deller@gmx.de
-E: hdeller@redhat.de
-D: PA-RISC Linux hacker, LASI-, ASP-, WAX-, LCD/LED-driver
-S: Schimmelsrain 1
-S: D-69231 Rauenberg
+W: http://www.parisc-linux.org/
+D: PA-RISC Linux architecture maintainer
+D: LASI-, ASP-, WAX-, LCD/LED-driver
 S: Germany

 N: Jean Delvare
@ -1222,7 +1221,7 @@ S: Brazil

 N: Oded Gabbay
 E: oded.gabbay@gmail.com
-D: AMD KFD maintainer
+D: HabanaLabs and AMD KFD maintainer
 S: 12 Shraga Raphaeli
 S: Petah-Tikva, 4906418
 S: Israel
@ -1361,7 +1360,7 @@ S: Stellenbosch, Western Cape
 S: South Africa

 N: Grant Grundler
-E: grundler@parisc-linux.org
+E: grantgrundler@gmail.com
 W: http://obmouse.sourceforge.net/
 W: http://www.parisc-linux.org/
 D: obmouse - rewrote Olivier Florent's Omnibook 600 "pop-up" mouse driver
@ -2138,6 +2137,10 @@ E: paul@laufernet.com
 D: Soundblaster driver fixes, ISAPnP quirk
 S: California, USA

+N: Jarkko Lavinen
+E: jarkko.lavinen@nokia.com
+D: OMAP MMC support
+
 N: Jonathan Layes
 D: ARPD support

@ -2200,6 +2203,16 @@ S: Post Office Box 371
 S: North Little Rock, Arkansas 72115
 S: USA

+N: Christopher Li
+E: sparse@chrisli.org
+D: Sparse maintainer 2009 - 2018
+
+N: Shaohua Li
+D: Worked on many parts of the kernel, from core x86, ACPI, PCI, KVM, MM,
+D: and much more. He was the maintainer of MD from 2016 to 2018. Shaohua
+D: passed away late 2018, he will be greatly missed.
+W: https://www.spinics.net/lists/raid/msg61993.html
+
 N: Stephan Linz
 E: linz@mazet.de
 E: Stephan.Linz@gmx.de
@ -2478,7 +2491,7 @@ S: Syracuse, New York 13206
 S: USA

 N: Kyle McMartin
-E: kyle@parisc-linux.org
+E: kyle@mcmartin.ca
 D: Linux/PARISC hacker
 D: AD1889 sound driver
 S: Ottawa, Canada
@ -2533,6 +2546,10 @@ S: Ormond
 S: Victoria 3163
 S: Australia

+N: Eric Miao
+E: eric.y.miao@gmail.com
+D: MMP support
+
 N: Pauline Middelink
 E: middelin@polyware.nl
 D: General low-level bug fixes, /proc fixes, identd support
@ -3762,14 +3779,13 @@ S: 21513 Conradia Ct
 S: Cupertino, CA 95014
 S: USA

-N: Thibaut Varene
-E: T-Bone@parisc-linux.org
-W: http://www.parisc-linux.org/~varenet/
-P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C  FA2F 1E32 C3DA B7D2 F063
+N: Thibaut Varène
+E: hacks+kernel@slashdirt.org
+W: http://hacks.slashdirt.org/
 D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits
 D: Some ARM at91rm9200 bits, S1D13XXX FB driver, random patches here and there
 D: AD1889 sound driver
-S: Paris, France
+S: France

 N: Heikki Vatiainen
 E: hessu@cs.tut.fi
@ -4107,6 +4123,10 @@ S: 1507 145th Place SE #B5
 S: Bellevue, Washington 98007
 S: USA

+N: Haojian Zhuang
+E: haojian.zhuang@gmail.com
+D: MMP support
+
 N: Richard Zidlicky
 E: rz@linux-m68k.org, rdzidlic@geocities.com
 W: http://www.geocities.com/rdzidlic
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@ -1,428 +0,0 @@
-
-This is a brief list of all the files in ./linux/Documentation and what
-they contain. If you add a documentation file, please list it here in
-alphabetical order as well, or risk being hunted down like a rabid dog.
-Please keep the descriptions small enough to fit on one line.
-							 Thanks -- Paul G.
-
-Following translations are available on the WWW:
-
-   - Japanese, maintained by the JF Project (jf@listserv.linux.or.jp), at
-     http://linuxjf.sourceforge.jp/
-
-00-INDEX
-	- this file.
-ABI/
-	- info on kernel <-> userspace ABI and relative interface stability.
-CodingStyle
-	- nothing here, just a pointer to process/coding-style.rst.
-DMA-API.txt
-	- DMA API, pci_ API & extensions for non-consistent memory machines.
-DMA-API-HOWTO.txt
-	- Dynamic DMA mapping Guide
-DMA-ISA-LPC.txt
-	- How to do DMA with ISA (and LPC) devices.
-DMA-attributes.txt
-	- listing of the various possible attributes a DMA region can have
-EDID/
-	- directory with info on customizing EDID for broken gfx/displays.
-IPMI.txt
-	- info on Linux Intelligent Platform Management Interface (IPMI) Driver.
-IRQ-affinity.txt
-	- how to select which CPU(s) handle which interrupt events on SMP.
-IRQ-domain.txt
-	- info on interrupt numbering and setting up IRQ domains.
-IRQ.txt
-	- description of what an IRQ is.
-Intel-IOMMU.txt
-	- basic info on the Intel IOMMU virtualization support.
-Makefile
-	- It's not of interest for those who aren't touching the build system.
-PCI/
-	- info related to PCI drivers.
-RCU/
-	- directory with info on RCU (read-copy update).
-SAK.txt
-	- info on Secure Attention Keys.
-SM501.txt
-	- Silicon Motion SM501 multimedia companion chip
-SubmittingPatches
-	- nothing here, just a pointer to process/coding-style.rst.
-accounting/
-	- documentation on accounting and taskstats.
-acpi/
-	- info on ACPI-specific hooks in the kernel.
-admin-guide/
-	- info related to Linux users and system admins.
-aoe/
-	- description of AoE (ATA over Ethernet) along with config examples.
-arm/
-	- directory with info about Linux on the ARM architecture.
-arm64/
-	- directory with info about Linux on the 64 bit ARM architecture.
-auxdisplay/
-	- misc. LCD driver documentation (cfag12864b, ks0108).
-backlight/
-	- directory with info on controlling backlights in flat panel displays
-block/
-	- info on the Block I/O (BIO) layer.
-blockdev/
-	- info on block devices & drivers
-bt8xxgpio.txt
-	- info on how to modify a bt8xx video card for GPIO usage.
-btmrvl.txt
-	- info on Marvell Bluetooth driver usage.
-bus-devices/
-	- directory with info on TI GPMC (General Purpose Memory Controller)
-bus-virt-phys-mapping.txt
-	- how to access I/O mapped memory from within device drivers.
-cdrom/
-	- directory with information on the CD-ROM drivers that Linux has.
-cgroup-v1/
-	- cgroups v1 features, including cpusets and memory controller.
-cma/
-	- Continuous Memory Area (CMA) debugfs interface.
-conf.py
-	- It's not of interest for those who aren't touching the build system.
-connector/
-	- docs on the netlink based userspace<->kernel space communication mod.
-console/
-	- documentation on Linux console drivers.
-core-api/
-	- documentation on kernel core components.
-cpu-freq/
-	- info on CPU frequency and voltage scaling.
-cpu-hotplug.txt
-	- document describing CPU hotplug support in the Linux kernel.
-cpu-load.txt
-	- document describing how CPU load statistics are collected.
-cpuidle/
-	- info on CPU_IDLE, CPU idle state management subsystem.
-cputopology.txt
-	- documentation on how CPU topology info is exported via sysfs.
-crc32.txt
-	- brief tutorial on CRC computation
-crypto/
-	- directory with info on the Crypto API.
-dcdbas.txt
-	- information on the Dell Systems Management Base Driver.
-debugging-modules.txt
-	- some notes on debugging modules after Linux 2.6.3.
-debugging-via-ohci1394.txt
-	- how to use firewire like a hardware debugger memory reader.
-dell_rbu.txt
-	- document demonstrating the use of the Dell Remote BIOS Update driver.
-dev-tools/
-	- directory with info on development tools for the kernel.
-device-mapper/
-	- directory with info on Device Mapper.
-dmaengine/
-	- the DMA engine and controller API guides.
-devicetree/
-	- directory with info on device tree files used by OF/PowerPC/ARM
-digsig.txt
-	-info on the Digital Signature Verification API
-dma-buf-sharing.txt
-	- the DMA Buffer Sharing API Guide
-docutils.conf
-	- nothing here. Just a configuration file for docutils.
-dontdiff
-	- file containing a list of files that should never be diff'ed.
-driver-api/
-	- the Linux driver implementer's API guide.
-driver-model/
-	- directory with info about Linux driver model.
-early-userspace/
-	- info about initramfs, klibc, and userspace early during boot.
-efi-stub.txt
-	- How to use the EFI boot stub to bypass GRUB or elilo on EFI systems.
-eisa.txt
-	- info on EISA bus support.
-extcon/
-	- directory with porting guide for Android kernel switch driver.
-isa.txt
-	- info on EISA bus support.
-fault-injection/
-	- dir with docs about the fault injection capabilities infrastructure.
-fb/
-	- directory with info on the frame buffer graphics abstraction layer.
-features/
-	- status of feature implementation on different architectures.
-filesystems/
-	- info on the vfs and the various filesystems that Linux supports.
-firmware_class/
-	- request_firmware() hotplug interface info.
-flexible-arrays.txt
-	- how to make use of flexible sized arrays in linux
-fmc/
-	- information about the FMC bus abstraction
-fpga/
-	- FPGA Manager Core.
-futex-requeue-pi.txt
-	- info on requeueing of tasks from a non-PI futex to a PI futex
-gcc-plugins.txt
-	- GCC plugin infrastructure.
-gpio/
-	- gpio related documentation
-gpu/
-	- directory with information on GPU driver developer's guide.
-hid/
-	- directory with information on human interface devices
-highuid.txt
-	- notes on the change from 16 bit to 32 bit user/group IDs.
-hwspinlock.txt
-	- hardware spinlock provides hardware assistance for synchronization
-timers/
-	- info on the timer related topics
-hw_random.txt
-	- info on Linux support for random number generator in i8xx chipsets.
-hwmon/
-	- directory with docs on various hardware monitoring drivers.
-i2c/
-	- directory with info about the I2C bus/protocol (2 wire, kHz speed).
-x86/i386/
-	- directory with info about Linux on Intel 32 bit architecture.
-ia64/
-	- directory with info about Linux on Intel 64 bit architecture.
-ide/
-	- Information regarding the Enhanced IDE drive.
-iio/
-	- info on industrial IIO configfs support.
-index.rst
-	- main index for the documentation at ReST format.
-infiniband/
-	- directory with documents concerning Linux InfiniBand support.
-input/
-	- info on Linux input device support.
-intel_txt.txt
-	- info on intel Trusted Execution Technology (intel TXT).
-io-mapping.txt
-	- description of io_mapping functions in linux/io-mapping.h
-io_ordering.txt
-	- info on ordering I/O writes to memory-mapped addresses.
-ioctl/
-	- directory with documents describing various IOCTL calls.
-iostats.txt
-	- info on I/O statistics Linux kernel provides.
-irqflags-tracing.txt
-	- how to use the irq-flags tracing feature.
-isapnp.txt
-	- info on Linux ISA Plug & Play support.
-isdn/
-	- directory with info on the Linux ISDN support, and supported cards.
-kbuild/
-	- directory with info about the kernel build process.
-kdump/
-	- directory with mini HowTo on getting the crash dump code to work.
-doc-guide/
-	- how to write and format reStructuredText kernel documentation
-kernel-per-CPU-kthreads.txt
-	- List of all per-CPU kthreads and how they introduce jitter.
-kobject.txt
-	- info of the kobject infrastructure of the Linux kernel.
-kprobes.txt
-	- documents the kernel probes debugging feature.
-kref.txt
-	- docs on adding reference counters (krefs) to kernel objects.
-laptops/
-	- directory with laptop related info and laptop driver documentation.
-ldm.txt
-	- a brief description of LDM (Windows Dynamic Disks).
-leds/
-	- directory with info about LED handling under Linux.
-livepatch/
-	- info on kernel live patching.
-locking/
-	- directory with info about kernel locking primitives
-lockup-watchdogs.txt
-	- info on soft and hard lockup detectors (aka nmi_watchdog).
-logo.gif
-	- full colour GIF image of Linux logo (penguin - Tux).
-logo.txt
-	- info on creator of above logo & site to get additional images from.
-lsm.txt
-	- Linux Security Modules: General Security Hooks for Linux
-lzo.txt
-	- kernel LZO decompressor input formats
-m68k/
-	- directory with info about Linux on Motorola 68k architecture.
-mailbox.txt
-	- How to write drivers for the common mailbox framework (IPC).
-md/
-	- directory with info about Linux Software RAID
-media/
-	- info on media drivers: uAPI, kAPI and driver documentation.
-memory-barriers.txt
-	- info on Linux kernel memory barriers.
-memory-devices/
-	- directory with info on parts like the Texas Instruments EMIF driver
-memory-hotplug.txt
-	- Hotpluggable memory support, how to use and current status.
-men-chameleon-bus.txt
-	- info on MEN chameleon bus.
-mic/
-	- Intel Many Integrated Core (MIC) architecture device driver.
-mips/
-	- directory with info about Linux on MIPS architecture.
-misc-devices/
-	- directory with info about devices using the misc dev subsystem
-mmc/
-	- directory with info about the MMC subsystem
-mtd/
-	- directory with info about memory technology devices (flash)
-namespaces/
-	- directory with various information about namespaces
-netlabel/
-	- directory with information on the NetLabel subsystem.
-networking/
-	- directory with info on various aspects of networking with Linux.
-nfc/
-	- directory relating info about Near Field Communications support.
-nios2/
-	- Linux on the Nios II architecture.
-nommu-mmap.txt
-	- documentation about no-mmu memory mapping support.
-numastat.txt
-	- info on how to read Numa policy hit/miss statistics in sysfs.
-ntb.txt
-	- info on Non-Transparent Bridge (NTB) drivers.
-nvdimm/
-	- info on non-volatile devices.
-nvmem/
-	- info on non volatile memory framework.
-output/
-	- default directory where html/LaTeX/pdf files will be written.
-padata.txt
-	- An introduction to the "padata" parallel execution API
-parisc/
-	- directory with info on using Linux on PA-RISC architecture.
-parport-lowlevel.txt
-	- description and usage of the low level parallel port functions.
-pcmcia/
-	- info on the Linux PCMCIA driver.
-percpu-rw-semaphore.txt
-	- RCU based read-write semaphore optimized for locking for reading
-perf/
-	- info about the APM X-Gene SoC Performance Monitoring Unit (PMU).
-phy/
-	- ino on Samsung USB 2.0 PHY adaptation layer.
-phy.txt
-	- Description of the generic PHY framework.
-pi-futex.txt
-	- documentation on lightweight priority inheritance futexes.
-pinctrl.txt
-	- info on pinctrl subsystem and the PINMUX/PINCONF and drivers
-platform/
-	- List of supported hardware by compal and Dell laptop.
-pnp.txt
-	- Linux Plug and Play documentation.
-power/
-	- directory with info on Linux PCI power management.
-powerpc/
-	- directory with info on using Linux with the PowerPC.
-prctl/
-	- directory with info on the priveledge control subsystem
-preempt-locking.txt
-	- info on locking under a preemptive kernel.
-process/
-	- how to work with the mainline kernel development process.
-pps/
-	- directory with information on the pulse-per-second support
-pti/
-	- directory with info on Intel MID PTI.
-ptp/
-	- directory with info on support for IEEE 1588 PTP clocks in Linux.
-pwm.txt
-	- info on the pulse width modulation driver subsystem
-rapidio/
-	- directory with info on RapidIO packet-based fabric interconnect
-rbtree.txt
-	- info on what red-black trees are and what they are for.
-remoteproc.txt
-	- info on how to handle remote processor (e.g. AMP) offloads/usage.
-rfkill.txt
-	- info on the radio frequency kill switch subsystem/support.
-robust-futex-ABI.txt
-	- documentation of the robust futex ABI.
-robust-futexes.txt
-	- a description of what robust futexes are.
-rpmsg.txt
-	- info on the Remote Processor Messaging (rpmsg) Framework
-rtc.txt
-	- notes on how to use the Real Time Clock (aka CMOS clock) driver.
-s390/
-	- directory with info on using Linux on the IBM S390.
-scheduler/
-	- directory with info on the scheduler.
-scsi/
-	- directory with info on Linux scsi support.
-security/
-	- directory that contains security-related info
-serial/
-	- directory with info on the low level serial API.
-sgi-ioc4.txt
-	- description of the SGI IOC4 PCI (multi function) device.
-sh/
-	- directory with info on porting Linux to a new architecture.
-smsc_ece1099.txt
-	-info on the smsc Keyboard Scan Expansion/GPIO Expansion device.
-sound/
-	- directory with info on sound card support.
-spi/
-	- overview of Linux kernel Serial Peripheral Interface (SPI) support.
-sphinx/
-	- no documentation here, just files required by Sphinx toolchain.
-sphinx-static/
-	- no documentation here, just files required by Sphinx toolchain.
-static-keys.txt
-	- info on how static keys allow debug code in hotpaths via patching
-svga.txt
-	- short guide on selecting video modes at boot via VGA BIOS.
-sync_file.txt
-	- Sync file API guide.
-sysctl/
-	- directory with info on the /proc/sys/* files.
-target/
-	- directory with info on generating TCM v4 fabric .ko modules
-tee.txt
-	- info on the TEE subsystem and drivers
-this_cpu_ops.txt
-	- List rationale behind and the way to use this_cpu operations.
-thermal/
-	- directory with information on managing thermal issues (CPU/temp)
-trace/
-	- directory with info on tracing technologies within linux
-translations/
-	- translations of this document from English to another language
-unaligned-memory-access.txt
-	- info on how to avoid arch breaking unaligned memory access in code.
-unshare.txt
-	- description of the Linux unshare system call.
-usb/
-	- directory with info regarding the Universal Serial Bus.
-vfio.txt
-	- info on Virtual Function I/O used in guest/hypervisor instances.
-video-output.txt
-	- sysfs class driver interface to enable/disable a video output device.
-virtual/
-	- directory with information on the various linux virtualizations.
-vm/
-	- directory with info on the Linux vm code.
-w1/
-	- directory with documents regarding the 1-wire (w1) subsystem.
-watchdog/
-	- how to auto-reboot Linux if it has "fallen and can't get up". ;-)
-wimax/
-	- directory with info about Intel Wireless Wimax Connections
-core-api/workqueue.rst
-	- information on the Concurrency Managed Workqueue implementation
-x86/x86_64/
-	- directory with info on Linux support for AMD x86-64 (Hammer) machines.
-xillybus.txt
-	- Overview and basic ui of xillybus driver
-xtensa/
-	- directory with documents relating to arch/xtensa port/implementation
-xz.txt
-	- how to make use of the XZ data compression within linux kernel
-zorro.txt
-	- info on writing drivers for Zorro bus devices found on Amigas.
--- a/Documentation/ABI/obsolete/sysfs-class-dax
+++ b/Documentation/ABI/obsolete/sysfs-class-dax
@ -0,0 +1,22 @@
+What:           /sys/class/dax/
+Date:           May, 2016
+KernelVersion:  v4.7
+Contact:        linux-nvdimm@lists.01.org
+Description:	Device DAX is the device-centric analogue of Filesystem
+		DAX (CONFIG_FS_DAX).  It allows memory ranges to be
+		allocated and mapped without need of an intervening file
+		system.  Device DAX is strict, precise and predictable.
+		Specifically this interface:
+
+		1/ Guarantees fault granularity with respect to a given
+		page size (pte, pmd, or pud) set at configuration time.
+
+		2/ Enforces deterministic behavior by being strict about
+		what fault scenarios are supported.
+
+		The /sys/class/dax/ interface enumerates all the
+		device-dax instances in the system. The ABI is
+		deprecated and will be removed after 2020. It is
+		replaced with the DAX bus interface /sys/bus/dax/ where
+		device-dax instances can be found under
+		/sys/bus/dax/devices/
--- a/Documentation/ABI/stable/sysfs-bus-vmbus
+++ b/Documentation/ABI/stable/sysfs-bus-vmbus
@ -146,3 +146,36 @@ KernelVersion:	4.16
 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
 Description:	Binary file created by uio_hv_generic for ring buffer
 Users:		Userspace drivers
+
+What:           /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_in_full
+Date:           February 2019
+KernelVersion:  5.0
+Contact:        Michael Kelley <mikelley@microsoft.com>
+Description:    Number of guest to host interrupts caused by the inbound ring
+		buffer transitioning from full to not full while a packet is
+		waiting for buffer space to become available
+Users:          Debugging tools
+
+What:           /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_out_empty
+Date:           February 2019
+KernelVersion:  5.0
+Contact:        Michael Kelley <mikelley@microsoft.com>
+Description:    Number of guest to host interrupts caused by the outbound ring
+		buffer transitioning from empty to not empty
+Users:          Debugging tools
+
+What:           /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_first
+Date:           February 2019
+KernelVersion:  5.0
+Contact:        Michael Kelley <mikelley@microsoft.com>
+Description:    Number of write operations that were the first to encounter an
+		outbound ring buffer full condition
+Users:          Debugging tools
+
+What:           /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_total
+Date:           February 2019
+KernelVersion:  5.0
+Contact:        Michael Kelley <mikelley@microsoft.com>
+Description:    Total number of write operations that encountered an outbound
+		ring buffer full condition
+Users:          Debugging tools
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@ -12,7 +12,6 @@ Description:	This file shows ASIC health status. The possible values are:
 What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
 							cpld1_version
 							cpld2_version
-
 Date:		June 2018
 KernelVersion:	4.19
 Contact:	Vadim Pasternak <vadimpmellanox.com>
@ -21,6 +20,40 @@ Description:	These files show with which CPLD versions have been burned

 		The files are read only.

+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
+							fan_dir
+
+Date:		December 2018
+KernelVersion:	5.0
+Contact:	Vadim Pasternak <vadimpmellanox.com>
+Description:	This file shows the system fans direction:
+		forward direction - relevant bit is set 0;
+		reversed direction - relevant bit is set 1.
+
+		The files are read only.
+
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
+							jtag_enable
+
+Date:		November 2018
+KernelVersion:	5.0
+Contact:	Vadim Pasternak <vadimpmellanox.com>
+Description:	These files show with which CPLD versions have been burned
+		on LED board.
+
+		The files are read only.
+
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
+							jtag_enable
+
+Date:		November 2018
+KernelVersion:	5.0
+Contact:	Vadim Pasternak <vadimpmellanox.com>
+Description:	These files enable and disable the access to the JTAG domain.
+		By default access to the JTAG domain is disabled.
+
+		The file is read/write.
+
 What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/select_iio
 Date:		June 2018
 KernelVersion:	4.19
@ -76,3 +109,21 @@ Description:	These files show the system reset cause, as following: power
 		reset cause.

 		The files are read only.
+
+What:		/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
+						reset_comex_pwr_fail
+						reset_from_comex
+						reset_system
+						reset_voltmon_upgrade_fail
+
+Date:		November 2018
+KernelVersion:	5.0
+Contact:	Vadim Pasternak <vadimpmellanox.com>
+Description:	These files show the system reset cause, as following: ComEx
+		power fail, reset from ComEx, system platform reset, reset
+		due to voltage monitor devices upgrade failure,
+		Value 1 in file means this is reset cause, 0 - otherwise.
+		Only one bit could be 1 at the same time, representing only
+		the last reset cause.
+
+		The files are read only.
--- a/Documentation/ABI/stable/sysfs-driver-usb-usbtmc
+++ b/Documentation/ABI/stable/sysfs-driver-usb-usbtmc
@ -25,38 +25,3 @@ Description:
 		4.2.2.

 		The files are read only.
-
-
-What:		/sys/bus/usb/drivers/usbtmc/*/TermChar
-Date:		August 2008
-Contact:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Description:
-		This file is the TermChar value to be sent to the USB TMC
-		device as described by the document, "Universal Serial Bus Test
-		and Measurement Class Specification
-		(USBTMC) Revision 1.0" as published by the USB-IF.
-
-		Note that the TermCharEnabled file determines if this value is
-		sent to the device or not.
-
-
-What:		/sys/bus/usb/drivers/usbtmc/*/TermCharEnabled
-Date:		August 2008
-Contact:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Description:
-		This file determines if the TermChar is to be sent to the
-		device on every transaction or not.  For more details about
-		this, please see the document, "Universal Serial Bus Test and
-		Measurement Class Specification (USBTMC) Revision 1.0" as
-		published by the USB-IF.
-
-
-What:		/sys/bus/usb/drivers/usbtmc/*/auto_abort
-Date:		August 2008
-Contact:	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-Description:
-		This file determines if the transaction of the USB TMC
-		device is to be automatically aborted if there is any error.
-		For more details about this, please see the document,
-		"Universal Serial Bus Test and Measurement Class Specification
-		(USBTMC) Revision 1.0" as published by the USB-IF.
--- a/Documentation/ABI/testing/configfs-stp-policy-p_sys-t
+++ b/Documentation/ABI/testing/configfs-stp-policy-p_sys-t
@ -0,0 +1,41 @@
+What:		/config/stp-policy/<device>:p_sys-t.<policy>/<node>/uuid
+Date:		June 2018
+KernelVersion:	4.19
+Description:
+		UUID source identifier string, RW.
+		Default value is randomly generated at the mkdir <node> time.
+		Data coming from trace sources that use this <node> will be
+		tagged with this UUID in the MIPI SyS-T packet stream, to
+		allow the decoder to discern between different sources
+		within the same master/channel range, and identify the
+		higher level decoders that may be needed for each source.
+
+What:		/config/stp-policy/<device>:p_sys-t.<policy>/<node>/do_len
+Date:		June 2018
+KernelVersion:	4.19
+Description:
+		Include payload length in the MIPI SyS-T header, boolean.
+		If enabled, the SyS-T protocol encoder will include payload
+		length in each packet's metadata. This is normally redundant
+		if the underlying transport protocol supports marking message
+		boundaries (which STP does), so this is off by default.
+
+What:		/config/stp-policy/<device>:p_sys-t.<policy>/<node>/ts_interval
+Date:		June 2018
+KernelVersion:	4.19
+Description:
+		Time interval in milliseconds. Include a timestamp in the
+		MIPI SyS-T packet metadata, if this many milliseconds have
+		passed since the previous packet from this source. Zero is
+		the default and stands for "never send the timestamp".
+
+What:		/config/stp-policy/<device>:p_sys-t.<policy>/<node>/clocksync_interval
+Date:		June 2018
+KernelVersion:	4.19
+Description:
+		Time interval in milliseconds. Send a CLOCKSYNC packet if
+		this many milliseconds have passed since the previous
+		CLOCKSYNC packet from this source. Zero is the default and
+		stands for "never send the CLOCKSYNC". It makes sense to
+		use this option with sources that generate constant and/or
+		periodic data, like stm_heartbeat.
--- a/Documentation/ABI/testing/configfs-usb-gadget-uvc
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uvc
@ -12,6 +12,10 @@ Date:		Dec 2014
 KernelVersion:	4.0
 Description:	Control descriptors

+		All attributes read only:
+		bInterfaceNumber	- USB interface number for this
+					  streaming interface
+
 What:		/config/usb-gadget/gadget/functions/uvc.name/control/class
 Date:		Dec 2014
 KernelVersion:	4.0
@ -109,6 +113,10 @@ Date:		Dec 2014
 KernelVersion:	4.0
 Description:	Streaming descriptors

+		All attributes read only:
+		bInterfaceNumber	- USB interface number for this
+					  streaming interface
+
 What:		/config/usb-gadget/gadget/functions/uvc.name/streaming/class
 Date:		Dec 2014
 KernelVersion:	4.0
@ -160,6 +168,10 @@ Description:	Specific MJPEG format descriptors

 		All attributes read only,
 		except bmaControls and bDefaultFrameIndex:
+		bFormatIndex		- unique id for this format descriptor;
+					only defined after parent header is
+					linked into the streaming class;
+					read-only
 		bmaControls		- this format's data for bmaControls in
 					the streaming header
 		bmInterfaceFlags	- specifies interlace information,
@ -177,6 +189,10 @@ Date:		Dec 2014
 KernelVersion:	4.0
 Description:	Specific MJPEG frame descriptors

+		bFrameIndex		- unique id for this framedescriptor;
+					only defined after parent format is
+					linked into the streaming header;
+					read-only
 		dwFrameInterval		- indicates how frame interval can be
 					programmed; a number of values
 					separated by newline can be specified
@ -204,6 +220,10 @@ Date:		Dec 2014
 KernelVersion:	4.0
 Description:	Specific uncompressed format descriptors

+		bFormatIndex		- unique id for this format descriptor;
+					only defined after parent header is
+					linked into the streaming class;
+					read-only
 		bmaControls		- this format's data for bmaControls in
 					the streaming header
 		bmInterfaceFlags	- specifies interlace information,
@ -224,6 +244,10 @@ Date:		Dec 2014
 KernelVersion:	4.0
 Description:	Specific uncompressed frame descriptors

+		bFrameIndex		- unique id for this framedescriptor;
+					only defined after parent format is
+					linked into the streaming header;
+					read-only
 		dwFrameInterval		- indicates how frame interval can be
 					programmed; a number of values
 					separated by newline can be specified
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@ -0,0 +1,126 @@
+What:           /sys/kernel/debug/habanalabs/hl<n>/addr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the device address to be used for read or write through
+                PCI bar. The acceptable value is a string that starts with "0x"
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/command_buffers
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays a list with information about the currently allocated
+                command buffers
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/command_submission
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays a list with information about the currently active
+                command submissions
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/command_submission_jobs
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays a list with detailed information about each JOB (CB) of
+                each active command submission
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/data32
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the root user to read or write directly through the
+                device's PCI bar. Writing to this file generates a write
+                transaction while reading from the file generates a read
+                transcation. This custom interface is needed (instead of using
+                the generic Linux user-space PCI mapping) because the DDR bar
+                is very small compared to the DDR memory and only the driver can
+                move the bar before and after the transaction
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/device
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Enables the root user to set the device to specific state.
+                Valid values are "disable", "enable", "suspend", "resume".
+                User can read this property to see the valid values
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets I2C device address for I2C transaction that is generated
+                by the device's CPU
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_bus
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets I2C bus address for I2C transaction that is generated by
+                the device's CPU
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_data
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Triggers an I2C transaction that is generated by the device's
+                CPU. Writing to this file generates a write transaction while
+                reading from the file generates a read transcation
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_reg
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets I2C register id for I2C transaction that is generated by
+                the device's CPU
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/led0
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the state of the first S/W led on the device
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/led1
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the state of the second S/W led on the device
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/led2
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the state of the third S/W led on the device
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/mmu
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the hop values and physical address for a given ASID
+                and virtual address. The user should write the ASID and VA into
+                the file and then read the file to get the result.
+                e.g. to display info about VA 0x1000 for ASID 1 you need to do:
+                echo "1 0x1000" > /sys/kernel/debug/habanalabs/hl0/mmu
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/set_power_state
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
+                for D3Hot
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/userptr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays a list with information about the currently user
+                pointers (user virtual addresses) that are pinned and mapped
+                to DMA addresses
+
+What:           /sys/kernel/debug/habanalabs/hl<n>/vm
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays a list with information about all the active virtual
+                address mappings per ASID
--- a/Documentation/ABI/testing/debugfs-wilco-ec
+++ b/Documentation/ABI/testing/debugfs-wilco-ec
@ -0,0 +1,23 @@
+What:		/sys/kernel/debug/wilco_ec/raw
+Date:		January 2019
+KernelVersion:	5.1
+Description:
+		Write and read raw mailbox commands to the EC.
+
+		For writing:
+		Bytes 0-1 indicate the message type:
+			00 F0 = Execute Legacy Command
+			00 F2 = Read/Write NVRAM Property
+		Byte 2 provides the command code
+		Bytes 3+ consist of the data passed in the request
+
+		At least three bytes are required, for the msg type and command,
+		with additional bytes optional for additional data.
+
+		Example:
+		// Request EC info type 3 (EC firmware build date)
+		$ echo 00 f0 38 00 03 00 > raw
+		// View the result. The decoded ASCII result "12/21/18" is
+		// included after the raw hex.
+		$ cat raw
+		00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00  .12/21/18.8...
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@ -244,7 +244,7 @@ Description:

 What:		/sys/block/<disk>/queue/zoned
 Date:		September 2016
-Contact:	Damien Le Moal <damien.lemoal@hgst.com>
+Contact:	Damien Le Moal <damien.lemoal@wdc.com>
 Description:
 		zoned indicates if the device is a zoned block device
 		and the zone model of the device if it is indeed zoned.
@ -259,6 +259,14 @@ Description:
 		zone commands, they will be treated as regular block
 		devices and zoned will report "none".

+What:		/sys/block/<disk>/queue/nr_zones
+Date:		November 2018
+Contact:	Damien Le Moal <damien.lemoal@wdc.com>
+Description:
+		nr_zones indicates the total number of zones of a zoned block
+		device ("host-aware" or "host-managed" zone model). For regular
+		block devices, the value is always 0.
+
 What:		/sys/block/<disk>/queue/chunk_sectors
 Date:		September 2016
 Contact:	Hannes Reinecke <hare@suse.com>
@ -268,6 +276,15 @@ Description:
 		indicates the size in 512B sectors of the RAID volume
 		stripe segment. For a zoned block device, either
 		host-aware or host-managed, chunk_sectors indicates the
-		size of 512B sectors of the zones of the device, with
+		size in 512B sectors of the zones of the device, with
 		the eventual exception of the last zone of the device
 		which may be smaller.
+
+What:		/sys/block/<disk>/queue/io_timeout
+Date:		November 2018
+Contact:	Weiping Zhang <zhangweiping@didiglobal.com>
+Description:
+		io_timeout is the request timeout in milliseconds. If a request
+		does not complete in this time then the block driver timeout
+		handler is invoked. That timeout handler can decide to retry
+		the request, to fail it or to start a device recovery strategy.
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@ -98,3 +98,42 @@ Description:
 		The backing_dev file is read-write and set up backing
 		device for zram to write incompressible pages.
 		For using, user should enable CONFIG_ZRAM_WRITEBACK.
+
+What:		/sys/block/zram<id>/idle
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		idle file is write-only and mark zram slot as idle.
+		If system has mounted debugfs, user can see which slots
+		are idle via /sys/kernel/debug/zram/zram<id>/block_state
+
+What:		/sys/block/zram<id>/writeback
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The writeback file is write-only and trigger idle and/or
+		huge page writeback to backing device.
+
+What:		/sys/block/zram<id>/bd_stat
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The bd_stat file is read-only and represents backing device's
+		statistics (bd_count, bd_reads, bd_writes) in a format
+		similar to block layer statistics file format.
+
+What:		/sys/block/zram<id>/writeback_limit_enable
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The writeback_limit_enable file is read-write and specifies
+		eanbe of writeback_limit feature. "1" means eable the feature.
+		No limit "0" is the initial state.
+
+What:		/sys/block/zram<id>/writeback_limit
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The writeback_limit file is read-write and specifies the maximum
+		amount of writeback ZRAM can do. The limit could be changed
+		in run time.
--- a/Documentation/ABI/testing/sysfs-bus-i3c
+++ b/Documentation/ABI/testing/sysfs-bus-i3c
@ -0,0 +1,146 @@
+What:		/sys/bus/i3c/devices/i3c-<bus-id>
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		An I3C bus. This directory will contain one sub-directory per
+		I3C device present on the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/current_master
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		Expose the master that owns the bus (<bus-id>-<master-pid>) at
+		the time this file is read. Note that bus ownership can change
+		overtime, so there's no guarantee that when the read() call
+		returns, the value returned is still valid.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/mode
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		I3C bus mode. Can be "pure", "mixed-fast" or "mixed-slow". See
+		the I3C specification for a detailed description of what each
+		of these modes implies.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/i3c_scl_frequency
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		The frequency (expressed in Hz) of the SCL signal when
+		operating in I3C SDR mode.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/i2c_scl_frequency
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		The frequency (expressed in Hz) of the SCL signal when
+		operating in I2C mode.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/dynamic_address
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		Dynamic address assigned to the master controller. This
+		address may change if the bus is re-initialized.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/bcr
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		BCR stands for Bus Characteristics Register and express the
+		device capabilities in term of speed, maximum read/write
+		length, etc. See the I3C specification for more details.
+		This entry describes the BCR of the master controller driving
+		the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/dcr
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		DCR stands for Device Characteristics Register and express the
+		device capabilities in term of exposed features. See the I3C
+		specification for more details.
+		This entry describes the DCR of the master controller driving
+		the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/pid
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		PID stands for Provisional ID and is used to uniquely identify
+		a device on a bus. This PID contains information about the
+		vendor, the part and an instance ID so that several devices of
+		the same type can be connected on the same bus.
+		See the I3C specification for more details.
+		This entry describes the PID of the master controller driving
+		the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/hdrcap
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		Expose the HDR (High Data Rate) capabilities of a device.
+		Returns a list of supported HDR mode, each element is separated
+		by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
+		See the I3C specification for more details about these HDR
+		modes.
+		This entry describes the HDRCAP of the master controller
+		driving the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		An I3C device present on I3C bus identified by <bus-id>. Note
+		that all devices are represented including the master driving
+		the bus.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dynamic_address
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		Dynamic address assigned to device <bus-id>-<device-pid>. This
+		address may change if the bus is re-initialized.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/bcr
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		BCR stands for Bus Characteristics Register and express the
+		device capabilities in term of speed, maximum read/write
+		length, etc. See the I3C specification for more details.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dcr
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		DCR stands for Device Characteristics Register and express the
+		device capabilities in term of exposed features. See the I3C
+		specification for more details.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/pid
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		PID stands for Provisional ID and is used to uniquely identify
+		a device on a bus. This PID contains information about the
+		vendor, the part and an instance ID so that several devices of
+		the same type can be connected on the same bus.
+		See the I3C specification for more details.
+
+What:		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/hdrcap
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		Expose the HDR (High Data Rate) capabilities of a device.
+		Returns a list of supported HDR mode, each element is separated
+		by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
+		See the I3C specification for more details about these HDR
+		modes.
+
+What:		/sys/bus/i3c/devices/<bus-id>-<device-pid>
+KernelVersion:  5.0
+Contact:	linux-i3c@vger.kernel.org
+Description:
+		These directories are just symbolic links to
+		/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>.
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@ -199,7 +199,7 @@ Description:

 What:		/sys/bus/iio/devices/iio:deviceX/in_positionrelative_x_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_positionrelative_y_raw
-KernelVersion:	4.18
+KernelVersion:	4.19
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Relative position in direction x or y on a pad (may be
@ -1554,6 +1554,10 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw
+What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw
 What:		/sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw
 KernelVersion:	4.3
@ -1684,4 +1688,19 @@ KernelVersion:	4.18
 Contact:	linux-iio@vger.kernel.org
 Description:
 		Raw (unscaled) phase difference reading from channel Y
-		that can be processed to radians.
+		that can be processed to radians.
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input
+What:		/sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input
+KernelVersion:	4.22
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Mass concentration reading of particulate matter in ug / m3.
+		pmX consists of particles with aerodynamic diameter less or
+		equal to X micrometers.
--- a/Documentation/ABI/testing/sysfs-bus-iio-sps30
+++ b/Documentation/ABI/testing/sysfs-bus-iio-sps30
@ -0,0 +1,28 @@
+What:		/sys/bus/iio/devices/iio:deviceX/start_cleaning
+Date:		December 2018
+KernelVersion:	4.22
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Writing 1 starts sensor self cleaning. Internal fan accelerates
+		to its maximum speed and keeps spinning for about 10 seconds in
+		order to blow out accumulated dust.
+
+What:		/sys/bus/iio/devices/iio:deviceX/cleaning_period
+Date:		January 2019
+KernelVersion:	5.1
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Sensor is capable of triggering self cleaning periodically.
+		Period can be changed by writing a new value here. Upon reading
+		the current one is returned. Units are seconds.
+
+		Writing 0 disables periodical self cleaning entirely.
+
+What:		/sys/bus/iio/devices/iio:deviceX/cleaning_period_available
+Date:		January 2019
+KernelVersion:	5.1
+Contact:	linux-iio@vger.kernel.org
+Description:
+		The range of available values in seconds represented as the
+		minimum value, the step and the maximum value, all enclosed in
+		square brackets.
--- a/Documentation/ABI/testing/sysfs-bus-intel_th-output-devices
+++ b/Documentation/ABI/testing/sysfs-bus-intel_th-output-devices
@ -3,11 +3,13 @@ Date:		June 2015
 KernelVersion:	4.3
 Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
 Description:	(RW) Writes of 1 or 0 enable or disable trace output to this
-		output device. Reads return current status.
+		output device. Reads return current status. Requires that the
+		correstponding output port driver be loaded.

 What:		/sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/port
 Date:		June 2015
 KernelVersion:	4.3
 Contact:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
 Description:	(RO) Port number, corresponding to this output device on the
-		switch (GTH).
+		switch (GTH) or "unassigned" if the corresponding output
+		port driver is not loaded.
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@ -323,3 +323,27 @@ Description:

 		This is similar to /sys/bus/pci/drivers_autoprobe, but
 		affects only the VFs associated with a specific PF.
+
+What:		/sys/bus/pci/devices/.../p2pmem/size
+Date:		November 2017
+Contact:	Logan Gunthorpe <logang@deltatee.com>
+Description:
+		If the device has any Peer-to-Peer memory registered, this
+	        file contains the total amount of memory that the device
+		provides (in decimal).
+
+What:		/sys/bus/pci/devices/.../p2pmem/available
+Date:		November 2017
+Contact:	Logan Gunthorpe <logang@deltatee.com>
+Description:
+		If the device has any Peer-to-Peer memory registered, this
+	        file contains the amount of memory that has not been
+		allocated (in decimal).
+
+What:		/sys/bus/pci/devices/.../p2pmem/published
+Date:		November 2017
+Contact:	Logan Gunthorpe <logang@deltatee.com>
+Description:
+		If the device has any Peer-to-Peer memory registered, this
+	        file contains a '1' if the memory has been published for
+		use outside the driver that owns the device.
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@ -21,6 +21,15 @@ Description:	Holds a comma separated list of device unique_ids that
 		If a device is authorized automatically during boot its
 		boot attribute is set to 1.

+What: /sys/bus/thunderbolt/devices/.../domainX/iommu_dma_protection
+Date:		Mar 2019
+KernelVersion:	4.21
+Contact:	thunderbolt-software@lists.01.org
+Description:	This attribute tells whether the system uses IOMMU
+		for DMA protection. Value of 1 means IOMMU is used 0 means
+		it is not (DMA protection is solely based on Thunderbolt
+		security levels).
+
 What: /sys/bus/thunderbolt/devices/.../domainX/security
 Date:		Sep 2017
 KernelVersion:	4.13
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@ -186,9 +186,19 @@ Contact:	Lan Tianyu <tianyu.lan@intel.com>
 Description:
 		Some platforms provide usb port connect types through ACPI.
 		This attribute is to expose these information to user space.
-		The file will read "hotplug", "wired" and "not used" if the
+		The file will read "hotplug", "hardwired" and "not used" if the
 		information is available, and "unknown" otherwise.

+What:		/sys/bus/usb/devices/.../(hub interface)/portX/location
+Date:		October 2018
+Contact:	Bjørn Mork <bjorn@mork.no>
+Description:
+		Some platforms provide usb port physical location through
+		firmware. This is used by the kernel to pair up logical ports
+		mapping to the same physical connector. The attribute exposes the
+		raw location value as a hex integer.
+
+
 What:		/sys/bus/usb/devices/.../(hub interface)/portX/quirks
 Date:		May 2018
 Contact:	Nicolas Boichat <drinkcat@chromium.org>
@ -219,7 +229,14 @@ Description:
 		ports and report them to the kernel. This attribute is to expose
 		the number of over-current situation occurred on a specific port
 		to user space. This file will contain an unsigned 32 bit value
-		which wraps to 0 after its maximum is reached.
+		which wraps to 0 after its maximum is reached. This file supports
+		poll() for monitoring changes to this value in user space.
+
+		Any time this value changes the corresponding hub device will send a
+		udev event with the following attributes:
+
+		OVER_CURRENT_PORT=/sys/bus/usb/devices/.../(hub interface)/portX
+		OVER_CURRENT_COUNT=[current value of this sysfs attribute]

 What:		/sys/bus/usb/devices/.../(hub interface)/portX/usb3_lpm_permit
 Date:		November 2015
--- a/Documentation/ABI/testing/sysfs-bus-vmbus
+++ b/Documentation/ABI/testing/sysfs-bus-vmbus
@ -0,0 +1,21 @@
+What:		/sys/bus/vmbus/devices/.../driver_override
+Date:		August 2019
+Contact:	Stephen Hemminger <sthemmin@microsoft.com>
+Description:
+		This file allows the driver for a device to be specified which
+		will override standard static and dynamic ID matching.  When
+		specified, only a driver with a name matching the value written
+		to driver_override will have an opportunity to bind to the
+		device.  The override is specified by writing a string to the
+		driver_override file (echo uio_hv_generic > driver_override) and
+		may be cleared with an empty string (echo > driver_override).
+		This returns the device to standard matching rules binding.
+		Writing to driver_override does not automatically unbind the
+		device from its current driver or make any attempt to
+		automatically load the specified driver.  If no driver with a
+		matching name is currently loaded in the kernel, the device
+		will not bind to any driver.  This also allows devices to
+		opt-out of driver binding using a driver_override name such as
+		"none".  Only a single driver may be specified in the override,
+		there is no support for parsing delimiters.
+
--- a/Documentation/ABI/testing/sysfs-class-chromeos
+++ b/Documentation/ABI/testing/sysfs-class-chromeos
@ -0,0 +1,32 @@
+What:		/sys/class/chromeos/<ec-device-name>/flashinfo
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the EC flash information.
+
+What:		/sys/class/chromeos/<ec-device-name>/kb_wake_angle
+Date:		March 2018
+KernelVersion:	4.17
+Description:
+		Control the keyboard wake lid angle. Values are between
+		0 and 360. This file will also show the keyboard wake lid
+		angle by querying the hardware.
+
+What:		/sys/class/chromeos/<ec-device-name>/reboot
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Tell the EC to reboot in various ways. Options are:
+		"cancel": Cancel a pending reboot.
+		"ro": Jump to RO without rebooting.
+		"rw": Jump to RW without rebooting.
+		"cold": Cold reboot.
+		"disable-jump": Disable jump until next reboot.
+		"hibernate": Hibernate the EC.
+		"at-shutdown": Reboot after an AP shutdown.
+
+What:		/sys/class/chromeos/<ec-device-name>/version
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the information about the EC software and hardware.
--- a/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar
+++ b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-lightbar
@ -0,0 +1,74 @@
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/brightness
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Writing to this file adjusts the overall brightness of
+		the lightbar, separate from any color intensity. The
+		valid range is 0 (off) to 255 (maximum brightness).
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/interval_msec
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		The lightbar is controlled by an embedded controller (EC),
+		which also manages the keyboard, battery charging, fans,
+		and other system hardware. To prevent unprivileged users
+		from interfering with the other EC functions, the rate at
+		which the lightbar control files can be read or written is
+		limited.
+
+		Reading this file will return the number of milliseconds
+		that must elapse between accessing any of the lightbar
+		functions through this interface. Going faster will simply
+		block until the necessary interval has lapsed. The interval
+		applies uniformly to all accesses of any kind by any user.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/led_rgb
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to control each LED segment. If the
+		lightbar is already running one of the automatic
+		sequences, you probably won’t see anything change because
+		your color setting will be almost immediately replaced.
+		To get useful results, you should stop the lightbar
+		sequence first.
+
+		The values written to this file are sets of four integers,
+		indicating LED, RED, GREEN, BLUE. The LED number is 0 to 3
+		to select a single segment, or 4 to set all four segments
+		to the same value at once. The RED, GREEN, and BLUE
+		numbers should be in the range 0 (off) to 255 (maximum).
+		You can update more than one segment at a time by writing
+		more than one set of four integers.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/program
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to upload and run custom lightbar sequences.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/sequence
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		The Pixel lightbar has a number of built-in sequences
+		that it displays under various conditions, such as at
+		power on, shut down, or while running. Reading from this
+		file displays the current sequence that the lightbar is
+		displaying. Writing to this file allows you to change the
+		sequence.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/userspace_control
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		This allows you to take the control of the lightbar. This
+		prevents the kernel from going through its normal
+		sequences.
+
+What:		/sys/class/chromeos/<ec-device-name>/lightbar/version
+Date:		August 2015
+KernelVersion:	4.2
+Description:
+		Show the information about the lightbar version.
--- a/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc
+++ b/Documentation/ABI/testing/sysfs-class-chromeos-driver-cros-ec-vbc
@ -0,0 +1,6 @@
+What:		/sys/class/chromeos/<ec-device-name>/vbc/vboot_context
+Date:		October 2015
+KernelVersion:	4.4
+Description:
+		Read/write the verified boot context data included on a
+		small nvram space on some EC implementations.
--- a/Documentation/ABI/testing/sysfs-class-lcd-s6e63m0
+++ b/Documentation/ABI/testing/sysfs-class-lcd-s6e63m0
@ -1,27 +0,0 @@
-sysfs interface for the S6E63M0 AMOLED LCD panel driver
-------------------------------------------------------
-
-What:		/sys/class/lcd/<lcd>/gamma_mode
-Date:		May, 2010
-KernelVersion:	v2.6.35
-Contact:	dri-devel@lists.freedesktop.org
-Description:
-		(RW) Read or write the gamma mode. Following three modes are
-		supported:
-		0 - gamma value 2.2,
-		1 - gamma value 1.9 and
-		2 - gamma value 1.7.
-
-
-What:		/sys/class/lcd/<lcd>/gamma_table
-Date:		May, 2010
-KernelVersion:	v2.6.35
-Contact:	dri-devel@lists.freedesktop.org
-Description:
-		(RO) Displays the size of the gamma table i.e. the number of
-		gamma modes available.
-
-This is a backlight lcd driver. These interfaces are an extension to the API
-documented in Documentation/ABI/testing/sysfs-class-lcd and in
-Documentation/ABI/stable/sysfs-class-backlight (under
-/sys/class/backlight/<backlight>/).
--- a/Documentation/ABI/testing/sysfs-class-led-driver-sc27xx
+++ b/Documentation/ABI/testing/sysfs-class-led-driver-sc27xx
@ -0,0 +1,22 @@
+What:		/sys/class/leds/<led>/hw_pattern
+Date:		September 2018
+KernelVersion:	4.20
+Description:
+		Specify a hardware pattern for the SC27XX LED. For the SC27XX
+		LED controller, it only supports 4 stages to make a single
+		hardware pattern, which is used to configure the rise time,
+		high time, fall time and low time for the breathing mode.
+
+		For the breathing mode, the SC27XX LED only expects one brightness
+		for the high stage. To be compatible with the hardware pattern
+		format, we should set brightness as 0 for rise stage, fall
+		stage and low stage.
+
+		Min stage duration: 125 ms
+		Max stage duration: 31875 ms
+
+		Since the stage duration step is 125 ms, the duration should be
+		a multiplier of 125, like 125ms, 250ms, 375ms, 500ms ... 31875ms.
+
+		Thus the format of the hardware pattern values should be:
+		"0 rise_duration brightness high_duration 0 fall_duration 0 low_duration".
--- a/Documentation/ABI/testing/sysfs-class-led-trigger-pattern
+++ b/Documentation/ABI/testing/sysfs-class-led-trigger-pattern
@ -0,0 +1,37 @@
+What:		/sys/class/leds/<led>/pattern
+Date:		September 2018
+KernelVersion:	4.20
+Description:
+		Specify a software pattern for the LED, that supports altering
+		the brightness for the specified duration with one software
+		timer. It can do gradual dimming and step change of brightness.
+
+		The pattern is given by a series of tuples, of brightness and
+		duration (ms).
+
+		The exact format is described in:
+		Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt
+
+What:		/sys/class/leds/<led>/hw_pattern
+Date:		September 2018
+KernelVersion:	4.20
+Description:
+		Specify a hardware pattern for the LED, for LED hardware that
+		supports autonomously controlling brightness over time, according
+		to some preprogrammed hardware patterns. It deactivates any active
+		software pattern.
+
+		Since different LED hardware can have different semantics of
+		hardware patterns, each driver is expected to provide its own
+		description for the hardware patterns in their ABI documentation
+		file.
+
+What:		/sys/class/leds/<led>/repeat
+Date:		September 2018
+KernelVersion:	4.20
+Description:
+		Specify a pattern repeat number. -1 means repeat indefinitely,
+		other negative numbers and number 0 are invalid.
+
+		This file will always return the originally written repeat
+		number.
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@ -91,6 +91,24 @@ Description:
 		stacked (e.g: VLAN interfaces) but still have the same MAC
 		address as their parent device.

+What:		/sys/class/net/<iface>/dev_port
+Date:		February 2014
+KernelVersion:	3.15
+Contact:	netdev@vger.kernel.org
+Description:
+		Indicates the port number of this network device, formatted
+		as a decimal value. Some NICs have multiple independent ports
+		on the same PCI bus, device and function. This attribute allows
+		userspace to distinguish the respective interfaces.
+
+		Note: some device drivers started to use 'dev_id' for this
+		purpose since long before 3.15 and have not adopted the new
+		attribute ever since. To query the port number, some tools look
+		exclusively at 'dev_port', while others only consult 'dev_id'.
+		If a network device has multiple client adapter ports as
+		described in the previous paragraph and does not set this
+		attribute to its port number, it's a kernel bug.
+
 What:		/sys/class/net/<iface>/dormant
 Date:		March 2006
 KernelVersion:	2.6.17
@ -117,7 +135,7 @@ Description:
 		full: full duplex

 		Note: This attribute is only valid for interfaces that implement
-		the ethtool get_settings method (mostly Ethernet).
+		the ethtool get_link_ksettings method (mostly Ethernet).

 What:		/sys/class/net/<iface>/flags
 Date:		April 2005
@ -224,7 +242,7 @@ Description:
 		an integer representing the link speed in Mbits/sec.

 		Note: this attribute is only valid for interfaces that implement
-		the ethtool get_settings method (mostly Ethernet ).
+		the ethtool get_link_ksettings method (mostly Ethernet).

 What:		/sys/class/net/<iface>/tx_queue_len
 Date:		April 2005
--- a/Documentation/ABI/testing/sysfs-class-net-dsa
+++ b/Documentation/ABI/testing/sysfs-class-net-dsa
@ -0,0 +1,7 @@
+What:		/sys/class/net/<iface>/dsa/tagging
+Date:		August 2018
+KernelVersion:	4.20
+Contact:	netdev@vger.kernel.org
+Description:
+		String indicating the type of tagging protocol used by the
+		DSA slave network device.
--- a/Documentation/ABI/testing/sysfs-class-watchdog
+++ b/Documentation/ABI/testing/sysfs-class-watchdog
@ -49,3 +49,26 @@ Contact:	Wim Van Sebroeck <wim@iguana.be>
 Description:
 		It is a read only file. It is read to know about current
 		value of timeout programmed.
+
+What:		/sys/class/watchdog/watchdogn/pretimeout
+Date:		December 2016
+Contact:	Wim Van Sebroeck <wim@iguana.be>
+Description:
+		It is a read only file. It specifies the time in seconds before
+		timeout when the pretimeout interrupt is delivered.  Pretimeout
+		is an optional feature.
+
+What:		/sys/class/watchdog/watchdogn/pretimeout_avaialable_governors
+Date:		February 2017
+Contact:	Wim Van Sebroeck <wim@iguana.be>
+Description:
+		It is a read only file. It shows the pretimeout governors
+		available for this watchdog.
+
+What:		/sys/class/watchdog/watchdogn/pretimeout_governor
+Date:		February 2017
+Contact:	Wim Van Sebroeck <wim@iguana.be>
+Description:
+		It is a read/write file. When read, the currently assigned
+		pretimeout governor is returned.  When written, it sets
+		the pretimeout governor.
--- a/Documentation/ABI/testing/sysfs-devices-software_node
+++ b/Documentation/ABI/testing/sysfs-devices-software_node
@ -0,0 +1,10 @@
+What:		/sys/devices/.../software_node/
+Date:		January 2019
+Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+		This directory contains the details about the device that are
+		assigned in kernel (i.e. software), as opposed to the
+		firmware_node directory which contains the details that are
+		assigned for the device in firmware. The main attributes in the
+		directory will show the properties the device has, and the
+		relationship it has to some of the other devices.
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@ -145,6 +145,8 @@ What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/name
 		/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
 		/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
 		/sys/devices/system/cpu/cpuX/cpuidle/stateN/usage
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/above
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/below
 Date:		September 2007
 KernelVersion:	v2.6.24
 Contact:	Linux power management list <linux-pm@vger.kernel.org>
@ -166,6 +168,11 @@ Description:

 		usage: (RO) Number of times this state was entered (a count).

+		above: (RO) Number of times this state was entered, but the
+		       observed CPU idle duration was too short for it (a count).
+
+		below: (RO) Number of times this state was entered, but the
+		       observed CPU idle duration was too long for it (a count).

 What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
 Date:		February 2008
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@ -0,0 +1,190 @@
+What:           /sys/class/habanalabs/hl<n>/armcp_kernel_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the Linux kernel running on the device's CPU
+
+What:           /sys/class/habanalabs/hl<n>/armcp_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the application running on the device's CPU
+
+What:           /sys/class/habanalabs/hl<n>/cpld_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the Device's CPLD F/W
+
+What:           /sys/class/habanalabs/hl<n>/device_type
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the code name of the device according to its type.
+                The supported values are: "GOYA"
+
+What:           /sys/class/habanalabs/hl<n>/eeprom
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    A binary file attribute that contains the contents of the
+                on-board EEPROM
+
+What:           /sys/class/habanalabs/hl<n>/fuse_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the device's version from the eFuse
+
+What:           /sys/class/habanalabs/hl<n>/hard_reset
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Interface to trigger a hard-reset operation for the device.
+                Hard-reset will reset ALL internal components of the device
+                except for the PCI interface and the internal PLLs
+
+What:           /sys/class/habanalabs/hl<n>/hard_reset_cnt
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays how many times the device have undergone a hard-reset
+                operation since the driver was loaded
+
+What:           /sys/class/habanalabs/hl<n>/high_pll
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the user to set the maximum clock frequency for MME, TPC
+                and IC when the power management profile is set to "automatic".
+
+What:           /sys/class/habanalabs/hl<n>/ic_clk
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the user to set the maximum clock frequency of the
+                Interconnect fabric. Writes to this parameter affect the device
+                only when the power management profile is set to "manual" mode.
+                The device IC clock might be set to lower value then the
+                maximum. The user should read the ic_clk_curr to see the actual
+                frequency value of the IC
+
+What:           /sys/class/habanalabs/hl<n>/ic_clk_curr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the current clock frequency of the Interconnect fabric
+
+What:           /sys/class/habanalabs/hl<n>/infineon_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the Device's power supply F/W code
+
+What:           /sys/class/habanalabs/hl<n>/max_power
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the user to set the maximum power consumption of the
+                device in milliwatts.
+
+What:           /sys/class/habanalabs/hl<n>/mme_clk
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the user to set the maximum clock frequency of the
+                MME compute engine. Writes to this parameter affect the device
+                only when the power management profile is set to "manual" mode.
+                The device MME clock might be set to lower value then the
+                maximum. The user should read the mme_clk_curr to see the actual
+                frequency value of the MME
+
+What:           /sys/class/habanalabs/hl<n>/mme_clk_curr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the current clock frequency of the MME compute engine
+
+What:           /sys/class/habanalabs/hl<n>/pci_addr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the PCI address of the device. This is needed so the
+                user would be able to open a device based on its PCI address
+
+What:           /sys/class/habanalabs/hl<n>/pm_mng_profile
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Power management profile. Values are "auto", "manual". In "auto"
+                mode, the driver will set the maximum clock frequency to a high
+                value when a user-space process opens the device's file (unless
+                it was already opened by another process). The driver will set
+                the max clock frequency to a low value when there are no user
+                processes that are opened on the device's file. In "manual"
+                mode, the user sets the maximum clock frequency by writing to
+                ic_clk, mme_clk and tpc_clk
+
+
+What:           /sys/class/habanalabs/hl<n>/preboot_btl_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the device's preboot F/W code
+
+What:           /sys/class/habanalabs/hl<n>/soft_reset
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Interface to trigger a soft-reset operation for the device.
+                Soft-reset will reset only the compute and DMA engines of the
+                device
+
+What:           /sys/class/habanalabs/hl<n>/soft_reset_cnt
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays how many times the device have undergone a soft-reset
+                operation since the driver was loaded
+
+What:           /sys/class/habanalabs/hl<n>/status
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Status of the card: "Operational", "Malfunction", "In reset".
+
+What:           /sys/class/habanalabs/hl<n>/thermal_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the Device's thermal daemon
+
+What:           /sys/class/habanalabs/hl<n>/tpc_clk
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Allows the user to set the maximum clock frequency of the
+                TPC compute engines. Writes to this parameter affect the device
+                only when the power management profile is set to "manual" mode.
+                The device TPC clock might be set to lower value then the
+                maximum. The user should read the tpc_clk_curr to see the actual
+                frequency value of the TPC
+
+What:           /sys/class/habanalabs/hl<n>/tpc_clk_curr
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the current clock frequency of the TPC compute engines
+
+What:           /sys/class/habanalabs/hl<n>/uboot_ver
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Version of the u-boot running on the device's CPU
+
+What:           /sys/class/habanalabs/hl<n>/write_open_cnt
+Date:           Jan 2019
+KernelVersion:  5.1
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the total number of user processes that are currently
+                opened on the device's file
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@ -109,3 +109,10 @@ Description:
 		write operation (since a 4k random write might turn
 		into a much larger write due to the zeroout
 		operation).
+
+What:		/sys/fs/ext4/<disk>/journal_task
+Date:		February 2019
+Contact:	"Theodore Ts'o" <tytso@mit.edu>
+Description:
+		This file is read-only and shows the pid of journal thread in
+		current pid-namespace or 0 if task is unreachable.
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@ -86,12 +86,28 @@ Description:
 		The unit size is one block, now only support configuring in range
 		of [1, 512].

+What:          /sys/fs/f2fs/<disk>/umount_discard_timeout
+Date:          January 2019
+Contact:       "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+		Set timeout to issue discard commands during umount.
+		Default: 5 secs
+
 What:		/sys/fs/f2fs/<disk>/max_victim_search
 Date:		January 2014
 Contact:	"Jaegeuk Kim" <jaegeuk.kim@samsung.com>
 Description:
 		 Controls the number of trials to find a victim segment.

+What:		/sys/fs/f2fs/<disk>/migration_granularity
+Date:		October 2018
+Contact:	"Chao Yu" <yuchao0@huawei.com>
+Description:
+		 Controls migration granularity of garbage collection on large
+		 section, it can let GC move partial segment{s} of one section
+		 in one GC cycle, so that dispersing heavy overhead GC to
+		 multiple lightweight one.
+
 What:		/sys/fs/f2fs/<disk>/dir_level
 Date:		March 2014
 Contact:	"Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@ -121,7 +137,22 @@ What:		/sys/fs/f2fs/<disk>/idle_interval
 Date:		January 2016
 Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
 Description:
-		 Controls the idle timing.
+		 Controls the idle timing for all paths other than
+		 discard and gc path.
+
+What:		/sys/fs/f2fs/<disk>/discard_idle_interval
+Date:		September 2018
+Contact:	"Chao Yu" <yuchao0@huawei.com>
+Contact:	"Sahitya Tummala" <stummala@codeaurora.org>
+Description:
+		 Controls the idle timing for discard path.
+
+What:		/sys/fs/f2fs/<disk>/gc_idle_interval
+Date:		September 2018
+Contact:	"Chao Yu" <yuchao0@huawei.com>
+Contact:	"Sahitya Tummala" <stummala@codeaurora.org>
+Description:
+		 Controls the idle timing for gc path.

 What:		/sys/fs/f2fs/<disk>/iostat_enable
 Date:		August 2017
--- a/Documentation/ABI/testing/sysfs-kernel-livepatch
+++ b/Documentation/ABI/testing/sysfs-kernel-livepatch
@ -33,18 +33,6 @@ Description:
 		An attribute which indicates whether the patch is currently in
 		transition.

-What:		/sys/kernel/livepatch/<patch>/signal
-Date:		Nov 2017
-KernelVersion:	4.15.0
-Contact:	live-patching@vger.kernel.org
-Description:
-		A writable attribute that allows administrator to affect the
-		course of an existing transition. Writing 1 sends a fake
-		signal to all remaining blocking tasks. The fake signal
-		means that no proper signal is delivered (there is no data in
-		signal pending structures). Tasks are interrupted or woken up,
-		and forced to change their patched state.
-
 What:		/sys/kernel/livepatch/<patch>/force
 Date:		Nov 2017
 KernelVersion:	4.15.0
--- a/Documentation/ABI/testing/sysfs-platform-lg-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-lg-laptop
@ -0,0 +1,35 @@
+What:		/sys/devices/platform/lg-laptop/reader_mode
+Date:		October 2018
+KernelVersion:	4.20
+Contact:	"Matan Ziv-Av <matan@svgalib.org>
+Description:
+        Control reader mode. 1 means on, 0 means off.
+
+What:		/sys/devices/platform/lg-laptop/fn_lock
+Date:		October 2018
+KernelVersion:	4.20
+Contact:	"Matan Ziv-Av <matan@svgalib.org>
+Description:
+        Control FN lock mode. 1 means on, 0 means off.
+
+What:		/sys/devices/platform/lg-laptop/battery_care_limit
+Date:		October 2018
+KernelVersion:	4.20
+Contact:	"Matan Ziv-Av <matan@svgalib.org>
+Description:
+        Maximal battery charge level. Accepted values are 80 or 100.
+
+What:		/sys/devices/platform/lg-laptop/fan_mode
+Date:		October 2018
+KernelVersion:	4.20
+Contact:	"Matan Ziv-Av <matan@svgalib.org>
+Description:
+        Control fan mode. 1 for performance mode, 0 for silent mode.
+
+What:		/sys/devices/platform/lg-laptop/usb_charge
+Date:		October 2018
+KernelVersion:	4.20
+Contact:	"Matan Ziv-Av <matan@svgalib.org>
+Description:
+        Control USB port charging when device is turned off.
+        1 means on, 0 means off.
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@ -99,7 +99,7 @@ Description:
 		this file, the suspend image will be as small as possible.

 		Reading from this file will display the current image size
-		limit, which is set to 500 MB by default.
+		limit, which is set to around 2/5 of available RAM by default.

 What:		/sys/power/pm_trace
 Date:		August 2006
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@ -146,114 +146,75 @@ What about block I/O and networking buffers?  The block I/O and
 networking subsystems make sure that the buffers they use are valid
 for you to DMA from/to.

-DMA addressing limitations
+DMA addressing capabilities
 ==========================

-Does your device have any DMA addressing limitations?  For example, is
-your device only capable of driving the low order 24-bits of address?
-If so, you need to inform the kernel of this fact.
+By default, the kernel assumes that your device can address 32-bits of DMA
+addressing.  For a 64-bit capable device, this needs to be increased, and for
+a device with limitations, it needs to be decreased.

-By default, the kernel assumes that your device can address the full
-32-bits.  For a 64-bit capable device, this needs to be increased.
-And for a device with limitations, as discussed in the previous
-paragraph, it needs to be decreased.
+Special note about PCI: PCI-X specification requires PCI-X devices to support
+64-bit addressing (DAC) for all transactions.  And at least one platform (SGI
+SN2) requires 64-bit consistent allocations to operate correctly when the IO
+bus is in PCI-X mode.

-Special note about PCI: PCI-X specification requires PCI-X devices to
-support 64-bit addressing (DAC) for all transactions.  And at least
-one platform (SGI SN2) requires 64-bit consistent allocations to
-operate correctly when the IO bus is in PCI-X mode.
+For correct operation, you must set the DMA mask to inform the kernel about
+your devices DMA addressing capabilities.

-For correct operation, you must interrogate the kernel in your device
-probe routine to see if the DMA controller on the machine can properly
-support the DMA addressing limitation your device has.  It is good
-style to do this even if your device holds the default setting,
-because this shows that you did think about these issues wrt. your
-device.
-
-The query is performed via a call to dma_set_mask_and_coherent()::
+This is performed via a call to dma_set_mask_and_coherent()::

 	int dma_set_mask_and_coherent(struct device *dev, u64 mask);

-which will query the mask for both streaming and coherent APIs together.
-If you have some special requirements, then the following two separate
-queries can be used instead:
+which will set the mask for both streaming and coherent APIs together.  If you
+have some special requirements, then the following two separate calls can be
+used instead:

-	The query for streaming mappings is performed via a call to
+	The setup for streaming mappings is performed via a call to
 	dma_set_mask()::

 		int dma_set_mask(struct device *dev, u64 mask);

-	The query for consistent allocations is performed via a call
+	The setup for consistent allocations is performed via a call
 	to dma_set_coherent_mask()::

 		int dma_set_coherent_mask(struct device *dev, u64 mask);

-Here, dev is a pointer to the device struct of your device, and mask
-is a bit mask describing which bits of an address your device
-supports.  It returns zero if your card can perform DMA properly on
-the machine given the address mask you provided.  In general, the
-device struct of your device is embedded in the bus-specific device
-struct of your device.  For example, &pdev->dev is a pointer to the
-device struct of a PCI device (pdev is a pointer to the PCI device
-struct of your device).
+Here, dev is a pointer to the device struct of your device, and mask is a bit
+mask describing which bits of an address your device supports.  Often the
+device struct of your device is embedded in the bus-specific device struct of
+your device.  For example, &pdev->dev is a pointer to the device struct of a
+PCI device (pdev is a pointer to the PCI device struct of your device).

-If it returns non-zero, your device cannot perform DMA properly on
-this platform, and attempting to do so will result in undefined
-behavior.  You must either use a different mask, or not use DMA.
+These calls usually return zero to indicated your device can perform DMA
+properly on the machine given the address mask you provided, but they might
+return an error if the mask is too small to be supportable on the given
+system.  If it returns non-zero, your device cannot perform DMA properly on
+this platform, and attempting to do so will result in undefined behavior.
+You must not use DMA on this device unless the dma_set_mask family of
+functions has returned success.

-This means that in the failure case, you have three options:
+This means that in the failure case, you have two options:

-1) Use another DMA mask, if possible (see below).
-2) Use some non-DMA mode for data transfer, if possible.
-3) Ignore this device and do not initialize it.
+1) Use some non-DMA mode for data transfer, if possible.
+2) Ignore this device and do not initialize it.

-It is recommended that your driver print a kernel KERN_WARNING message
-when you end up performing either #2 or #3.  In this manner, if a user
-of your driver reports that performance is bad or that the device is not
-even detected, you can ask them for the kernel messages to find out
-exactly why.
+It is recommended that your driver print a kernel KERN_WARNING message when
+setting the DMA mask fails.  In this manner, if a user of your driver reports
+that performance is bad or that the device is not even detected, you can ask
+them for the kernel messages to find out exactly why.

-The standard 32-bit addressing device would do something like this::
+The standard 64-bit addressing device would do something like this::

-	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
+	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
 		dev_warn(dev, "mydev: No suitable DMA available\n");
 		goto ignore_this_device;
 	}

-Another common scenario is a 64-bit capable device.  The approach here
-is to try for 64-bit addressing, but back down to a 32-bit mask that
-should not fail.  The kernel may fail the 64-bit mask not because the
-platform is not capable of 64-bit addressing.  Rather, it may fail in
-this case simply because 32-bit addressing is done more efficiently
-than 64-bit addressing.  For example, Sparc64 PCI SAC addressing is
-more efficient than DAC addressing.
+If the device only supports 32-bit addressing for descriptors in the
+coherent allocations, but supports full 64-bits for streaming mappings
+it would look like this:

-Here is how you would handle a 64-bit capable device which can drive
-all 64-bits when accessing streaming DMA::
-
-	int using_dac;
-
-	if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
-		using_dac = 1;
-	} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
-		using_dac = 0;
-	} else {
-		dev_warn(dev, "mydev: No suitable DMA available\n");
-		goto ignore_this_device;
-	}
-
-If a card is capable of using 64-bit consistent allocations as well,
-the case would look like this::
-
-	int using_dac, consistent_using_dac;
-
-	if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
-		using_dac = 1;
-		consistent_using_dac = 1;
-	} else if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
-		using_dac = 0;
-		consistent_using_dac = 0;
-	} else {
+	if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
 		dev_warn(dev, "mydev: No suitable DMA available\n");
 		goto ignore_this_device;
 	}
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@ -58,15 +58,6 @@ specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
 implementation may choose to ignore flags that affect the location of
 the returned memory, like GFP_DMA).

-::
-
-	void *
-	dma_zalloc_coherent(struct device *dev, size_t size,
-			    dma_addr_t *dma_handle, gfp_t flag)
-
-Wraps dma_alloc_coherent() and also zeroes the returned memory if the
-allocation attempt succeeded.
-
 ::

 	void
@ -204,6 +195,14 @@ Requesting the required mask does not alter the current mask.  If you
 wish to take advantage of it, you should issue a dma_set_mask()
 call to set the mask to the value returned.

+::
+
+	size_t
+	dma_direct_max_mapping_size(struct device *dev);
+
+Returns the maximum size of a mapping for the device. The size parameter
+of the mapping functions like dma_map_single(), dma_map_page() and
+others should not be larger than the returned value.

 Part Id - Streaming DMA mappings
 --------------------------------
@ -539,8 +538,8 @@ that simply cannot make consistent memory.
 	dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 		       dma_addr_t dma_handle, unsigned long attrs)

-Free memory allocated by the dma_alloc_attrs().  All parameters common
-parameters must identical to those otherwise passed to dma_fre_coherent,
+Free memory allocated by the dma_alloc_attrs().  All common
+parameters must be identical to those otherwise passed to dma_free_coherent,
 and the attrs argument must be identical to the attrs passed to
 dma_alloc_attrs().

@ -575,8 +574,7 @@ boundaries when doing this.

 	int
 	dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
-				    dma_addr_t device_addr, size_t size, int
-				    flags)
+				    dma_addr_t device_addr, size_t size);

 Declare region of memory to be handed out by dma_alloc_coherent() when
 it's asked for coherent memory for this device.
@ -590,12 +588,6 @@ dma_addr_t in dma_alloc_coherent()).

 size is the size of the area (must be multiples of PAGE_SIZE).

-flags can be ORed together and are:
-
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
-  Do not allow dma_alloc_coherent() to fall back to system memory when
-  it's out of memory in the declared region.
-
 As a simplification for the platforms, only *one* such region of
 memory may be declared per device.

@ -614,23 +606,6 @@ unconditionally having removed all the required structures.  It is the
 driver's job to ensure that no parts of this memory region are
 currently in use.

-::
-
-	void *
-	dma_mark_declared_memory_occupied(struct device *dev,
-					  dma_addr_t device_addr, size_t size)
-
-This is used to occupy specific regions of the declared space
-(dma_alloc_coherent() will hand out the first free region it finds).
-
-device_addr is the *device* address of the region requested.
-
-size is the size (and should be a page-sized multiple).
-
-The return value will be either a pointer to the processor virtual
-address of the memory, or an error (via PTR_ERR()) if any part of the
-region is occupied.
-
 Part III - Debug drivers use of the DMA-API
 -------------------------------------------

@ -705,6 +680,9 @@ dma-api/disabled		This read-only file contains the character 'Y'
 				happen when it runs out of memory or if it was
 				disabled at boot time

+dma-api/dump			This read-only file contains current DMA
+				mappings.
+
 dma-api/error_count		This file is read-only and shows the total
 				numbers of errors found.

@ -717,13 +695,16 @@ dma-api/num_errors		The number in this file shows how many
 dma-api/min_free_entries	This read-only file can be read to get the
 				minimum number of free dma_debug_entries the
 				allocator has ever seen. If this value goes
-				down to zero the code will disable itself
-				because it is not longer reliable.
+				down to zero the code will attempt to increase
+				nr_total_entries to compensate.

 dma-api/num_free_entries	The current number of free dma_debug_entries
 				in the allocator.

-dma-api/driver-filter		You can write a name of a driver into this file
+dma-api/nr_total_entries	The total number of dma_debug_entries in the
+				allocator, both free and used.
+
+dma-api/driver_filter		You can write a name of a driver into this file
 				to limit the debug output to requests from that
 				particular driver. Write an empty string to
 				that file to disable the filter and see
@ -742,10 +723,15 @@ driver filter at boot time. The debug code will only print errors for that
 driver afterwards. This filter can be disabled or changed later using debugfs.

 When the code disables itself at runtime this is most likely because it ran
-out of dma_debug_entries. These entries are preallocated at boot. The number
-of preallocated entries is defined per architecture. If it is too low for you
-boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
-architectural default.
+out of dma_debug_entries and was unable to allocate more on-demand. 65536
+entries are preallocated at boot - if this is too low for you boot with
+'dma_debug_entries=<your_desired_number>' to overwrite the default. Note
+that the code allocates entries in batches, so the exact number of
+preallocated entries may be greater than the actual number requested. The
+code will print to the kernel log each time it has dynamically allocated
+as many entries as were initially preallocated. This is to indicate that a
+larger preallocation size may be appropriate, or if it happens continually
+that a driver may be leaking mappings.

 ::

--- a/Documentation/DMA-ISA-LPC.txt
+++ b/Documentation/DMA-ISA-LPC.txt
@ -52,8 +52,8 @@ Address translation
 -------------------

 To translate the virtual address to a bus address, use the normal DMA
-API. Do _not_ use isa_virt_to_phys() even though it does the same
-thing. The reason for this is that the function isa_virt_to_phys()
+API. Do _not_ use isa_virt_to_bus() even though it does the same
+thing. The reason for this is that the function isa_virt_to_bus()
 will require a Kconfig dependency to ISA, not just ISA_DMA_API which
 is really all you need. Remember that even though the DMA controller
 has its origins in ISA it is used elsewhere.
--- a/Documentation/EDID/1024x768.S
+++ b/Documentation/EDID/1024x768.S
@ -31,14 +31,13 @@
 #define YBLANK 38
 #define XOFFSET 8
 #define XPULSE 144
-#define YOFFSET (63+3)
-#define YPULSE (63+6)
+#define YOFFSET 3
+#define YPULSE 6
 #define DPI 72
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux XGA"
 #define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */
 #define HSYNC_POL 0
 #define VSYNC_POL 0
-#define CRC 0x55

 #include "edid.S"
--- a/Documentation/EDID/1280x1024.S
+++ b/Documentation/EDID/1280x1024.S
@ -31,14 +31,13 @@
 #define YBLANK 42
 #define XOFFSET 48
 #define XPULSE 112
-#define YOFFSET (63+1)
-#define YPULSE (63+3)
+#define YOFFSET 1
+#define YPULSE 3
 #define DPI 72
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux SXGA"
 /* No ESTABLISHED_TIMINGx_BITS */
 #define HSYNC_POL 1
 #define VSYNC_POL 1
-#define CRC 0xa0

 #include "edid.S"
--- a/Documentation/EDID/1600x1200.S
+++ b/Documentation/EDID/1600x1200.S
@ -31,14 +31,13 @@
 #define YBLANK 50
 #define XOFFSET 64
 #define XPULSE 192
-#define YOFFSET (63+1)
-#define YPULSE (63+3)
+#define YOFFSET 1
+#define YPULSE 3
 #define DPI 72
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux UXGA"
 /* No ESTABLISHED_TIMINGx_BITS */
 #define HSYNC_POL 1
 #define VSYNC_POL 1
-#define CRC 0x9d

 #include "edid.S"
--- a/Documentation/EDID/1680x1050.S
+++ b/Documentation/EDID/1680x1050.S
@ -31,14 +31,13 @@
 #define YBLANK 39
 #define XOFFSET 104
 #define XPULSE 176
-#define YOFFSET (63+3)
-#define YPULSE (63+6)
+#define YOFFSET 3
+#define YPULSE 6
 #define DPI 96
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux WSXGA"
 /* No ESTABLISHED_TIMINGx_BITS */
 #define HSYNC_POL 1
 #define VSYNC_POL 1
-#define CRC 0x26

 #include "edid.S"
--- a/Documentation/EDID/1920x1080.S
+++ b/Documentation/EDID/1920x1080.S
@ -31,14 +31,13 @@
 #define YBLANK 45
 #define XOFFSET 88
 #define XPULSE 44
-#define YOFFSET (63+4)
-#define YPULSE (63+5)
+#define YOFFSET 4
+#define YPULSE 5
 #define DPI 96
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux FHD"
 /* No ESTABLISHED_TIMINGx_BITS */
 #define HSYNC_POL 1
 #define VSYNC_POL 1
-#define CRC 0x05

 #include "edid.S"
--- a/Documentation/EDID/800x600.S
+++ b/Documentation/EDID/800x600.S
@ -28,14 +28,13 @@
 #define YBLANK 28
 #define XOFFSET 40
 #define XPULSE 128
-#define YOFFSET (63+1)
-#define YPULSE (63+4)
+#define YOFFSET 1
+#define YPULSE 4
 #define DPI 72
 #define VFREQ 60 /* Hz */
 #define TIMING_NAME "Linux SVGA"
 #define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */
 #define HSYNC_POL 1
 #define VSYNC_POL 1
-#define CRC 0xc2

 #include "edid.S"
--- a/Documentation/EDID/HOWTO.txt
+++ b/Documentation/EDID/HOWTO.txt
@ -45,14 +45,5 @@ EDID:

 #define YPIX vdisp
 #define YBLANK vtotal-vdisp
-#define YOFFSET (63+(vsyncstart-vdisp))
-#define YPULSE (63+(vsyncend-vsyncstart))
-
-The CRC value in the last line
-  #define CRC 0x55
-also is a bit tricky. After a first version of the binary data set is
-created, it must be checked with the "edid-decode" utility which will
-most probably complain about a wrong CRC. Fortunately, the utility also
-displays the correct CRC which must then be inserted into the source
-file. After the make procedure is repeated, the EDID data set is ready
-to be used.
+#define YOFFSET vsyncstart-vdisp
+#define YPULSE vsyncend-vsyncstart
--- a/Documentation/EDID/Makefile
+++ b/Documentation/EDID/Makefile
@ -15,10 +15,21 @@ clean:
 %.o:	%.S
 	@cc -c $^

-%.bin:	%.o
+%.bin.nocrc:	%.o
 	@objcopy -Obinary $^ $@

-%.bin.ihex:	%.o
+%.crc:	%.bin.nocrc
+	@list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \
+		| hexdump -v -e '/1 "%02X+"'; done); \
+		echo "ibase=16;100-($${list%?})%100" | bc >$@
+
+%.p:	%.crc %.S
+	@cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S
+
+%.bin:	%.p
+	@objcopy -Obinary $^ $@
+
+%.bin.ihex:	%.p
 	@objcopy -Oihex $^ $@
 	@dos2unix $@ 2>/dev/null

--- a/Documentation/EDID/edid.S
+++ b/Documentation/EDID/edid.S
@ -47,9 +47,11 @@
 #define mfgname2id(v1,v2,v3) \
 	((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f))
 #define swap16(v1) ((v1>>8)+((v1&0xff)<<8))
+#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f))
 #define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f))
 #define msbs4(v1,v2,v3,v4) \
-	(((v1&0x03)>>2)+((v2&0x03)>>4)+((v3&0x03)>>6)+((v4&0x03)>>8))
+	((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\
+	(((v3>>4)&0x03)<<2)+((v4>>4)&0x03))
 #define pixdpi2mm(pix,dpi) ((pix*25)/dpi)
 #define xsize pixdpi2mm(XPIX,DPI)
 #define ysize pixdpi2mm(YPIX,DPI)
@ -200,9 +202,9 @@ y_msbs:		.byte	msbs2(YPIX,YBLANK)
 x_snc_off_lsb:	.byte	XOFFSET&0xff
 /* Horizontal sync pulse width pixels 8 lsbits (0-1023) */
 x_snc_pls_lsb:	.byte	XPULSE&0xff
-/* Bits 7-4 	Vertical sync offset lines 4 lsbits -63)
-   Bits 3-0 	Vertical sync pulse width lines 4 lsbits -63) */
-y_snc_lsb:	.byte	((YOFFSET-63)<<4)+(YPULSE-63)
+/* Bits 7-4 	Vertical sync offset lines 4 lsbits (0-63)
+   Bits 3-0 	Vertical sync pulse width lines 4 lsbits (0-63) */
+y_snc_lsb:	.byte	lsbs2(YOFFSET, YPULSE)
 /* Bits 7-6 	Horizontal sync offset pixels 2 msbits
   Bits 5-4 	Horizontal sync pulse width pixels 2 msbits
   Bits 3-2 	Vertical sync offset lines 2 msbits
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@ -2,7 +2,7 @@
 # Makefile for Sphinx documentation
 #

-subdir-y :=
+subdir-y := devicetree/bindings/

 # You can set these variables from the command line.
 SPHINXBUILD   = sphinx-build
--- a/Documentation/PCI/00-INDEX
+++ b/Documentation/PCI/00-INDEX
@ -1,26 +0,0 @@
-00-INDEX
-	- this file
-acpi-info.txt
-	- info on how PCI host bridges are represented in ACPI
-MSI-HOWTO.txt
-	- the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ.
-PCIEBUS-HOWTO.txt
-	- a guide describing the PCI Express Port Bus driver
-pci-error-recovery.txt
-	- info on PCI error recovery
-pci-iov-howto.txt
-	- the PCI Express I/O Virtualization HOWTO
-pci.txt
-	- info on the PCI subsystem for device driver authors
-pcieaer-howto.txt
-	- the PCI Express Advanced Error Reporting Driver Guide HOWTO
-endpoint/pci-endpoint.txt
-	- guide to add endpoint controller driver and endpoint function driver.
-endpoint/pci-endpoint-cfs.txt
-	- guide to use configfs to configure the PCI endpoint function.
-endpoint/pci-test-function.txt
-	- specification of *PCI test* function device.
-endpoint/pci-test-howto.txt
-	- userguide for PCI endpoint test function.
-endpoint/function/binding/
-	- binding documentation for PCI endpoint function
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ b/Documentation/PCI/endpoint/pci-test-howto.txt
@ -99,17 +99,20 @@ Note that the devices listed here correspond to the value populated in 1.4 above
 2.2 Using Endpoint Test function Device

 pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
-tests. Before pcitest.sh can be used pcitest.c should be compiled using the
-following commands.
+tests. To compile this tool the following commands should be used:

-	cd <kernel-dir>
-	make headers_install ARCH=arm
-	arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest
-	cp pcitest  <rootfs>/usr/sbin/
-	cp tools/pci/pcitest.sh <rootfs>
+	# cd <kernel-dir>
+	# make -C tools/pci
+
+or if you desire to compile and install in your system:
+
+	# cd <kernel-dir>
+	# make -C tools/pci install
+
+The tool and script will be located in <rootfs>/usr/bin/

 2.2.1 pcitest.sh Output
-	# ./pcitest.sh
+	# pcitest.sh
 	BAR tests

 	BAR0:           OKAY
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI error
 event will be platform-dependent, but will follow the general
 sequence described below.

-STEP 0: Error Event: ERR_NONFATAL
+STEP 0: Error Event
 -------------------
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@ -228,7 +228,13 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)

-STEP 3: Slot Reset
+STEP 3: Link Reset
+------------------
+The platform resets the link.  This is a PCI-Express specific step
+and is done whenever a fatal error has been detected that can be
+"solved" by resetting the link.
+
+STEP 4: Slot Reset
 ------------------

 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@ -314,7 +320,7 @@ Failure).
 >>> However, it probably should.


-STEP 4: Resume Operations
+STEP 5: Resume Operations
 -------------------------
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@ -326,7 +332,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.

-STEP 5: Permanent Failure
+STEP 6: Permanent Failure
 -------------------------
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@ -349,27 +355,6 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.

-STEP 0: Error Event: ERR_FATAL
-------------------
-PCI bus error is detected by the PCI hardware. On powerpc, the slot is
-isolated, in that all I/O is blocked: all reads return 0xffffffff, all
-writes are ignored.
-
-STEP 1: Remove devices
--------------------
-Platform removes the devices depending on the error agent, it could be
-this port for all subordinates or upstream component (likely downstream
-port)
-
-STEP 2: Reset link
--------------------
-The platform resets the link.  This is a PCI-Express specific step and is
-done whenever a fatal error has been detected that can be "solved" by
-resetting the link.
-
-STEP 3: Re-enumerate the devices
--------------------
-Initiates the re-enumeration.

 Conclusion; General Remarks
 ---------------------------
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@ -1,34 +0,0 @@
-00-INDEX
-	- This file
-arrayRCU.txt
-	- Using RCU to Protect Read-Mostly Arrays
-checklist.txt
-	- Review Checklist for RCU Patches
-listRCU.txt
-	- Using RCU to Protect Read-Mostly Linked Lists
-lockdep.txt
-	- RCU and lockdep checking
-lockdep-splat.txt
-	- RCU Lockdep splats explained.
-NMI-RCU.txt
-	- Using RCU to Protect Dynamic NMI Handlers
-rcu_dereference.txt
-	- Proper care and feeding of return values from rcu_dereference()
-rcubarrier.txt
-	- RCU and Unloadable Modules
-rculist_nulls.txt
-	- RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
-rcuref.txt
-	- Reference-count design for elements of lists/arrays protected by RCU
-rcu.txt
-	- RCU Concepts
-RTFP.txt
-	- List of RCU papers (bibliography) going back to 1980.
-stallwarn.txt
-	- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
-torture.txt
-	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
-UP.txt
-	- RCU on Uniprocessor Systems
-whatisRCU.txt
-	- What is RCU?
--- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
@ -1,499 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
-
-<!-- CreationDate: Wed Dec  9 17:26:09 2015 -->
-
-<!-- Magnification: 2.000 -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="5.7in"
-   height="6.6in"
-   viewBox="-44 -44 6838 7888"
-   id="svg2"
-   version="1.1"
-   inkscape:version="0.48.4 r9939"
-   sodipodi:docname="BigTreeClassicRCUBH.fig">
-  <metadata
-     id="metadata110">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs108">
-    <marker
-       inkscape:stockid="Arrow1Mend"
-       orient="auto"
-       refY="0.0"
-       refX="0.0"
-       id="Arrow1Mend"
-       style="overflow:visible;">
-      <path
-         id="path3868"
-         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
-         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
-         transform="scale(0.4) rotate(180) translate(10,0)" />
-    </marker>
-    <marker
-       inkscape:stockid="Arrow2Mend"
-       orient="auto"
-       refY="0.0"
-       refX="0.0"
-       id="Arrow2Mend"
-       style="overflow:visible;">
-      <path
-         id="path3886"
-         style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
-         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
-         transform="scale(0.6) rotate(180) translate(0,0)" />
-    </marker>
-  </defs>
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="878"
-     inkscape:window-height="1148"
-     id="namedview106"
-     showgrid="false"
-     inkscape:zoom="1.3547758"
-     inkscape:cx="256.5"
-     inkscape:cy="297"
-     inkscape:window-x="45"
-     inkscape:window-y="24"
-     inkscape:window-maximized="0"
-     inkscape:current-layer="g4" />
-  <g
-     style="stroke-width:.025in; fill:none"
-     id="g4">
-    <!-- Line: box -->
-    <rect
-       x="450"
-       y="0"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect6" />
-    <!-- Line: box -->
-    <rect
-       x="4950"
-       y="4950"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect8" />
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="600"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect10" />
-    <!-- Line: box -->
-    <rect
-       x="0"
-       y="450"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect12" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="1050"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect14" />
-    <!-- Circle -->
-    <circle
-       cx="2850"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle16" />
-    <!-- Circle -->
-    <circle
-       cx="3150"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle18" />
-    <!-- Circle -->
-    <circle
-       cx="3450"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle20" />
-    <!-- Circle -->
-    <circle
-       cx="1350"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle22" />
-    <!-- Circle -->
-    <circle
-       cx="1650"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle24" />
-    <!-- Circle -->
-    <circle
-       cx="1950"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle26" />
-    <!-- Circle -->
-    <circle
-       cx="4350"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle28" />
-    <!-- Circle -->
-    <circle
-       cx="4650"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle30" />
-    <!-- Circle -->
-    <circle
-       cx="4950"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle32" />
-    <!-- Line -->
-    <polyline
-       points="1350,3450 2350,2590 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline34" />
-    <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
-    <!-- Line -->
-    <polyline
-       points="4950,3450 3948,2590 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline38" />
-    <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="3450"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect42" />
-    <!-- Line -->
-    <polyline
-       points="2250,5400 2250,4414 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline44" />
-    <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
-    <!-- Line: box -->
-    <rect
-       x="1500"
-       y="5400"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect48" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="6600"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect50" />
-    <!-- Line: box -->
-    <rect
-       x="3750"
-       y="3450"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect52" />
-    <!-- Line: box -->
-    <rect
-       x="4500"
-       y="5400"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect54" />
-    <!-- Line: box -->
-    <rect
-       x="3300"
-       y="6600"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect56" />
-    <!-- Line: box -->
-    <rect
-       x="2250"
-       y="1650"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect58" />
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6450"
-       y="300"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text60">rcu_bh</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="1950"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text62">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="2250"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text64">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="3750"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text66">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="4050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text68">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="4050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text70">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="3750"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text72">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="5700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text74">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="6000"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text76">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="6900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text78">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="7200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text80">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="5700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text82">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="6000"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text84">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="6900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text86">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="7200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text88">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="450"
-       y="1350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="start"
-       id="text90">struct rcu_state</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6000"
-       y="750"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text92">rcu_sched</text>
-    <!-- Line -->
-    <polyline
-       points="5250,5400 5250,4414 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline94" />
-    <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
-    <!-- Line -->
-    <polyline
-       points="4050,6600 4050,4414 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline98" />
-    <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
-    <!-- Line -->
-    <polyline
-       points="1050,6600 1050,4414 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline102" />
-    <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
-  </g>
-</svg>
--- a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
@ -1,695 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
-
-<!-- CreationDate: Wed Dec  9 17:20:02 2015 -->
-
-<!-- Magnification: 2.000 -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="5.7in"
-   height="8.6in"
-   viewBox="-44 -44 6838 10288"
-   id="svg2"
-   version="1.1"
-   inkscape:version="0.48.4 r9939"
-   sodipodi:docname="BigTreeClassicRCUBHdyntick.fig">
-  <metadata
-     id="metadata166">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs164">
-    <marker
-       inkscape:stockid="Arrow1Mend"
-       orient="auto"
-       refY="0.0"
-       refX="0.0"
-       id="Arrow1Mend"
-       style="overflow:visible;">
-      <path
-         id="path3924"
-         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
-         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
-         transform="scale(0.4) rotate(180) translate(10,0)" />
-    </marker>
-    <marker
-       inkscape:stockid="Arrow2Lend"
-       orient="auto"
-       refY="0.0"
-       refX="0.0"
-       id="Arrow2Lend"
-       style="overflow:visible;">
-      <path
-         id="path3936"
-         style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
-         d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
-         transform="scale(1.1) rotate(180) translate(1,0)" />
-    </marker>
-  </defs>
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="845"
-     inkscape:window-height="988"
-     id="namedview162"
-     showgrid="false"
-     inkscape:zoom="1.0452196"
-     inkscape:cx="256.5"
-     inkscape:cy="387.00003"
-     inkscape:window-x="356"
-     inkscape:window-y="61"
-     inkscape:window-maximized="0"
-     inkscape:current-layer="g4" />
-  <g
-     style="stroke-width:.025in; fill:none"
-     id="g4">
-    <!-- Line: box -->
-    <rect
-       x="450"
-       y="0"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect6" />
-    <!-- Line: box -->
-    <rect
-       x="4950"
-       y="4950"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect8" />
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="600"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect10" />
-    <!-- Line -->
-    <polyline
-       points="5250,8100 5688,5912 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline12" />
-    <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790-->
-    <polyline
-       points="5714 6068 5704 5822 5598 6044 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline14" />
-    <!-- Line -->
-    <polyline
-       points="4050,9300 4486,7262 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline16" />
-    <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140-->
-    <polyline
-       points="4514 7418 4506 7172 4396 7394 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline18" />
-    <!-- Line -->
-    <polyline
-       points="1040,9300 1476,7262 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline20" />
-    <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140-->
-    <polyline
-       points="1504 7418 1496 7172 1386 7394 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline22" />
-    <!-- Line -->
-    <polyline
-       points="2240,8100 2676,6062 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline24" />
-    <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940-->
-    <polyline
-       points="2704 6218 2696 5972 2586 6194 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline26" />
-    <!-- Line: box -->
-    <rect
-       x="0"
-       y="450"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect28" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="1050"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect30" />
-    <!-- Line -->
-    <polyline
-       points="1350,3450 2350,2590 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline32" />
-    <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
-    <!-- Line -->
-    <polyline
-       points="4950,3450 3948,2590 "
-       style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline36" />
-    <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
-    <!-- Line -->
-    <polyline
-       points="4050,6600 4050,4414 "
-       style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline40" />
-    <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
-    <!-- Line -->
-    <polyline
-       points="1050,6600 1050,4414 "
-       style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline44" />
-    <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
-    <!-- Line -->
-    <polyline
-       points="2250,5400 2250,4414 "
-       style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline48" />
-    <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
-    <!-- Line -->
-    <polyline
-       points="2250,8100 2250,6364 "
-       style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
-       id="polyline52" />
-    <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240-->
-    <!-- Line -->
-    <polyline
-       points="1050,9300 1050,7564 "
-       style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
-       id="polyline56" />
-    <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440-->
-    <!-- Line -->
-    <polyline
-       points="4050,9300 4050,7564 "
-       style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
-       id="polyline60" />
-    <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440-->
-    <!-- Line -->
-    <polyline
-       points="5250,8100 5250,6364 "
-       style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
-       id="polyline64" />
-    <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240-->
-    <!-- Circle -->
-    <circle
-       cx="2850"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle68" />
-    <!-- Circle -->
-    <circle
-       cx="3150"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle70" />
-    <!-- Circle -->
-    <circle
-       cx="3450"
-       cy="3900"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle72" />
-    <!-- Circle -->
-    <circle
-       cx="1350"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle74" />
-    <!-- Circle -->
-    <circle
-       cx="1650"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle76" />
-    <!-- Circle -->
-    <circle
-       cx="1950"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle78" />
-    <!-- Circle -->
-    <circle
-       cx="4350"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle80" />
-    <!-- Circle -->
-    <circle
-       cx="4650"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle82" />
-    <!-- Circle -->
-    <circle
-       cx="4950"
-       cy="5100"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle84" />
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="3450"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect86" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="6600"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect88" />
-    <!-- Line: box -->
-    <rect
-       x="3750"
-       y="3450"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect90" />
-    <!-- Line: box -->
-    <rect
-       x="4500"
-       y="5400"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect92" />
-    <!-- Line: box -->
-    <rect
-       x="3300"
-       y="6600"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect94" />
-    <!-- Line: box -->
-    <rect
-       x="2250"
-       y="1650"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect96" />
-    <!-- Line: box -->
-    <rect
-       x="0"
-       y="9300"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect98" />
-    <!-- Line: box -->
-    <rect
-       x="1350"
-       y="8100"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect100" />
-    <!-- Line: box -->
-    <rect
-       x="3000"
-       y="9300"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect102" />
-    <!-- Line: box -->
-    <rect
-       x="4350"
-       y="8100"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect104" />
-    <!-- Line: box -->
-    <rect
-       x="1500"
-       y="5400"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect106" />
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6450"
-       y="300"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text108">rcu_bh</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="1950"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text110">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="2250"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text112">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="3750"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text114">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="4050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text116">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="4050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text118">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="3750"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text120">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="5700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text122">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="6000"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text124">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="6900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text126">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="7200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text128">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="5700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text130">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="6000"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text132">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="6900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text134">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="7200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text136">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="450"
-       y="1350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="start"
-       id="text138">struct rcu_state</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="9600"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text140">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="9900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text142">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="9600"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text144">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="9900"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text146">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2400"
-       y="8400"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text148">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2400"
-       y="8700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text150">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5400"
-       y="8400"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text152">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5400"
-       y="8700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text154">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6000"
-       y="750"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text156">rcu_sched</text>
-    <!-- Line -->
-    <polyline
-       points="5250,5400 5250,4414 "
-       style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline158" />
-    <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
-  </g>
-</svg>
--- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
@ -1,741 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
-
-<!-- CreationDate: Wed Dec  9 17:32:59 2015 -->
-
-<!-- Magnification: 2.000 -->
-
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="6.1in"
-   height="8.9in"
-   viewBox="-44 -44 7288 10738"
-   id="svg2"
-   version="1.1"
-   inkscape:version="0.48.4 r9939"
-   sodipodi:docname="BigTreePreemptRCUBHdyntick.fig">
-  <metadata
-     id="metadata182">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <defs
-     id="defs180">
-    <marker
-       inkscape:stockid="Arrow1Mend"
-       orient="auto"
-       refY="0.0"
-       refX="0.0"
-       id="Arrow1Mend"
-       style="overflow:visible;">
-      <path
-         id="path3940"
-         d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
-         style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
-         transform="scale(0.4) rotate(180) translate(10,0)" />
-    </marker>
-  </defs>
-  <sodipodi:namedview
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1"
-     objecttolerance="10"
-     gridtolerance="10"
-     guidetolerance="10"
-     inkscape:pageopacity="0"
-     inkscape:pageshadow="2"
-     inkscape:window-width="874"
-     inkscape:window-height="1148"
-     id="namedview178"
-     showgrid="false"
-     inkscape:zoom="1.2097379"
-     inkscape:cx="274.5"
-     inkscape:cy="400.49997"
-     inkscape:window-x="946"
-     inkscape:window-y="24"
-     inkscape:window-maximized="0"
-     inkscape:current-layer="g4" />
-  <g
-     style="stroke-width:.025in; fill:none"
-     id="g4">
-    <!-- Line: box -->
-    <rect
-       x="900"
-       y="0"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect6" />
-    <!-- Line: box -->
-    <rect
-       x="1200"
-       y="600"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect8" />
-    <!-- Line: box -->
-    <rect
-       x="5400"
-       y="4950"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect10" />
-    <!-- Line: box -->
-    <rect
-       x="450"
-       y="450"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect12" />
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="1050"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect14" />
-    <!-- Line: box -->
-    <rect
-       x="4950"
-       y="5400"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect16" />
-    <!-- Line -->
-    <polyline
-       points="5250,8550 5688,6362 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline18" />
-    <!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240-->
-    <polyline
-       points="5714 6518 5704 6272 5598 6494 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline20" />
-    <!-- Line -->
-    <polyline
-       points="4050,9750 4486,7712 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline22" />
-    <!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590-->
-    <polyline
-       points="4514 7868 4506 7622 4396 7844 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline24" />
-    <!-- Line -->
-    <polyline
-       points="1040,9750 1476,7712 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline26" />
-    <!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590-->
-    <polyline
-       points="1504 7868 1496 7622 1386 7844 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline28" />
-    <!-- Line -->
-    <polyline
-       points="2240,8550 2676,6512 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline30" />
-    <!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390-->
-    <polyline
-       points="2704 6668 2696 6422 2586 6644 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline32" />
-    <!-- Line -->
-    <polyline
-       points="4050,9750 5682,6360 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline34" />
-    <!-- Arrowhead on XXXpoint 4050 9750 - 5736 6246-->
-    <polyline
-       points="5672 6518 5722 6276 5562 6466 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline36" />
-    <!-- Line -->
-    <polyline
-       points="1010,9750 2642,6360 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline38" />
-    <!-- Arrowhead on XXXpoint 1010 9750 - 2696 6246-->
-    <polyline
-       points="2632 6518 2682 6276 2522 6466 "
-       style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
-       id="polyline40" />
-    <!-- Line: box -->
-    <rect
-       x="0"
-       y="900"
-       width="6300"
-       height="7350"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
-       id="rect42" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="1500"
-       width="5700"
-       height="3750"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
-       id="rect44" />
-    <!-- Line -->
-    <polyline
-       points="1350,3900 2350,3040 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline46" />
-    <!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960-->
-    <!-- Line -->
-    <polyline
-       points="4950,3900 3948,3040 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline50" />
-    <!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960-->
-    <!-- Line -->
-    <polyline
-       points="4050,7050 4050,4864 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline54" />
-    <!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740-->
-    <!-- Line -->
-    <polyline
-       points="1050,7050 1050,4864 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline58" />
-    <!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740-->
-    <!-- Line -->
-    <polyline
-       points="2250,5850 2250,4864 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline62" />
-    <!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740-->
-    <!-- Line -->
-    <polyline
-       points="2250,8550 2250,6814 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline66" />
-    <!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690-->
-    <!-- Line -->
-    <polyline
-       points="1050,9750 1050,8014 "
-       style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline70" />
-    <!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890-->
-    <!-- Line -->
-    <polyline
-       points="4050,9750 4050,8014 "
-       style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline74" />
-    <!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890-->
-    <!-- Line -->
-    <polyline
-       points="5250,8550 5250,6814 "
-       style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
-       id="polyline78" />
-    <!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690-->
-    <!-- Circle -->
-    <circle
-       cx="2850"
-       cy="4350"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle82" />
-    <!-- Circle -->
-    <circle
-       cx="3150"
-       cy="4350"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle84" />
-    <!-- Circle -->
-    <circle
-       cx="3450"
-       cy="4350"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle86" />
-    <!-- Circle -->
-    <circle
-       cx="1350"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle88" />
-    <!-- Circle -->
-    <circle
-       cx="1650"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle90" />
-    <!-- Circle -->
-    <circle
-       cx="1950"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle92" />
-    <!-- Circle -->
-    <circle
-       cx="4350"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle94" />
-    <!-- Circle -->
-    <circle
-       cx="4650"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle96" />
-    <!-- Circle -->
-    <circle
-       cx="4950"
-       cy="5550"
-       r="76"
-       style="fill:#000000;stroke:#000000;stroke-width:14;"
-       id="circle98" />
-    <!-- Line: box -->
-    <rect
-       x="750"
-       y="3900"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect100" />
-    <!-- Line: box -->
-    <rect
-       x="300"
-       y="7050"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect102" />
-    <!-- Line: box -->
-    <rect
-       x="3750"
-       y="3900"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect104" />
-    <!-- Line: box -->
-    <rect
-       x="4500"
-       y="5850"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect106" />
-    <!-- Line: box -->
-    <rect
-       x="3300"
-       y="7050"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect108" />
-    <!-- Line: box -->
-    <rect
-       x="2250"
-       y="2100"
-       width="1800"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
-       id="rect110" />
-    <!-- Line: box -->
-    <rect
-       x="0"
-       y="9750"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect112" />
-    <!-- Line: box -->
-    <rect
-       x="1350"
-       y="8550"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect114" />
-    <!-- Line: box -->
-    <rect
-       x="3000"
-       y="9750"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect116" />
-    <!-- Line: box -->
-    <rect
-       x="4350"
-       y="8550"
-       width="2100"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
-       id="rect118" />
-    <!-- Line: box -->
-    <rect
-       x="1500"
-       y="5850"
-       width="1500"
-       height="900"
-       rx="0"
-       style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
-       id="rect120" />
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6450"
-       y="750"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text122">rcu_bh</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="2400"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text124">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="3150"
-       y="2700"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text126">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="4200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text128">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1650"
-       y="4500"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text130">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="4500"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text132">rcu_node</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4650"
-       y="4200"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text134">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="6150"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text136">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2250"
-       y="6450"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text138">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="7350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text140">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="7650"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text142">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="6150"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text144">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5250"
-       y="6450"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text146">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="7350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text148">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="7650"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text150">rcu_data</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="450"
-       y="1800"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="start"
-       id="text152">struct rcu_state</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="10050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text154">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="1050"
-       y="10350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text156">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="10050"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text158">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="4050"
-       y="10350"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text160">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2400"
-       y="8850"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text162">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="2400"
-       y="9150"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text164">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5400"
-       y="8850"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text166">struct</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="5400"
-       y="9150"
-       fill="#000000"
-       font-family="Courier"
-       font-style="normal"
-       font-weight="bold"
-       font-size="192"
-       text-anchor="middle"
-       id="text168">rcu_dynticks</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6900"
-       y="300"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text170">rcu_preempt</text>
-    <!-- Text -->
-    <text
-       xml:space="preserve"
-       x="6000"
-       y="1200"
-       fill="#000000"
-       font-family="Helvetica"
-       font-style="normal"
-       font-weight="normal"
-       font-size="192"
-       text-anchor="end"
-       id="text172">rcu_sched</text>
-    <!-- Line -->
-    <polyline
-       points="5250,5850 5250,4864 "
-       style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
-       id="polyline174" />
-    <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740-->
-  </g>
-</svg>
--- a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@ -23,8 +23,6 @@ to each other.
 	The <tt>rcu_segcblist</tt> Structure</a>
 <li>	<a href="#The rcu_data Structure">
 	The <tt>rcu_data</tt> Structure</a>
-<li>	<a href="#The rcu_dynticks Structure">
-	The <tt>rcu_dynticks</tt> Structure</a>
 <li>	<a href="#The rcu_head Structure">
 	The <tt>rcu_head</tt> Structure</a>
 <li>	<a href="#RCU-Specific Fields in the task_struct Structure">
@ -127,9 +125,11 @@ CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows:
 </p><p>RCU currently permits up to a four-level tree, which on a 64-bit system
 accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for
 32-bit systems.
-On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be
-as small as 2 if you wish, which would permit only 16 CPUs, which
-is useful for testing.
+On the other hand, you can set both <tt>CONFIG_RCU_FANOUT</tt> and
+<tt>CONFIG_RCU_FANOUT_LEAF</tt> to be as small as 2, which would result
+in a 16-CPU test using a 4-level tree.
+This can be useful for testing large-system capabilities on small test
+machines.

 </p><p>This multi-level combining tree allows us to get most of the
 performance and scalability
@ -154,44 +154,9 @@ on that root <tt>rcu_node</tt> structure remains acceptably low.
 keeping lock contention under control at all tree levels regardless
 of the level of loading on the system.

-</p><p>The Linux kernel actually supports multiple flavors of RCU
-running concurrently, so RCU builds separate data structures for each
-flavor.
-For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides
-rcu_sched and rcu_bh, as shown below:
-
-</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%">
-
-</p><p>Energy efficiency is increasingly important, and for that
-reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which
-turns off the scheduling-clock interrupts on idle CPUs, which in
-turn allows those CPUs to attain deeper sleep states and to consume
-less energy.
-CPUs whose scheduling-clock interrupts have been turned off are
-said to be in <i>dyntick-idle mode</i>.
-RCU must handle dyntick-idle CPUs specially
-because RCU would otherwise wake up each CPU on every grace period,
-which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>.
-RCU uses the <tt>rcu_dynticks</tt> structure to track
-which CPUs are in dyntick idle mode, as shown below:
-
-</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%">
-
-</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode
-for all flavors of RCU.
-Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per
-CPU, and all of a given CPU's <tt>rcu_data</tt> structures share
-that <tt>rcu_dynticks</tt>, as shown in the figure.
-
-</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
-rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
-
-</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
-
 </p><p>RCU updaters wait for normal grace periods by registering
 RCU callbacks, either directly via <tt>call_rcu()</tt> and
 friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>),
-there being a separate interface per flavor of RCU)
 or indirectly via <tt>synchronize_rcu()</tt> and friends.
 RCU callbacks are represented by <tt>rcu_head</tt> structures,
 which are queued on <tt>rcu_data</tt> structures while they are
@ -214,9 +179,6 @@ its own synchronization:
 <li>	Each <tt>rcu_node</tt> structure has a spinlock.
 <li>	The fields in <tt>rcu_data</tt> are private to the corresponding
 	CPU, although a few can be read and written by other CPUs.
-<li>	Similarly, the fields in <tt>rcu_dynticks</tt> are private
-	to the corresponding CPU, although a few can be read by
-	other CPUs.
 </ol>

 <p>It is important to note that different data structures can have
@ -272,11 +234,6 @@ follows:
 	access to this information from the corresponding CPU.
 	Finally, this structure records past dyntick-idle state
 	for the corresponding CPU and also tracks statistics.
-<li>	<tt>rcu_dynticks</tt>:
-	This per-CPU structure tracks the current dyntick-idle
-	state for the corresponding CPU.
-	Unlike the other three structures, the <tt>rcu_dynticks</tt>
-	structure is not replicated per RCU flavor.
 <li>	<tt>rcu_head</tt>:
 	This structure represents RCU callbacks, and is the
 	only structure allocated and managed by RCU users.
@ -287,14 +244,14 @@ follows:
 <p>If all you wanted from this article was a general notion of how
 RCU's data structures are related, you are done.
 Otherwise, each of the following sections give more details on
-the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>,
-and <tt>rcu_dynticks</tt> data structures.
+the <tt>rcu_state</tt>, <tt>rcu_node</tt> and <tt>rcu_data</tt> data
+structures.

 <h3><a name="The rcu_state Structure">
 The <tt>rcu_state</tt> Structure</a></h3>

 <p>The <tt>rcu_state</tt> structure is the base structure that
-represents a flavor of RCU.
+represents the state of RCU in the system.
 This structure forms the interconnection between the
 <tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
 tracks grace periods, contains the lock used to
@ -389,7 +346,7 @@ sequence number.
 The bottom two bits are the state of the current grace period,
 which can be zero for not yet started or one for in progress.
 In other words, if the bottom two bits of <tt>-&gt;gp_seq</tt> are
-zero, the corresponding flavor of RCU is idle.
+zero, then RCU is idle.
 Any other value in the bottom two bits indicates that something is broken.
 This field is protected by the root <tt>rcu_node</tt> structure's
 <tt>-&gt;lock</tt> field.
@ -419,10 +376,10 @@ as follows:
 grace period in jiffies.
 It is protected by the root <tt>rcu_node</tt>'s <tt>-&gt;lock</tt>.

-<p>The <tt>-&gt;name</tt> field points to the name of the RCU flavor
-(for example, &ldquo;rcu_sched&rdquo;), and is constant.
-The <tt>-&gt;abbr</tt> field contains a one-character abbreviation,
-for example, &ldquo;s&rdquo; for RCU-sched.
+<p>The <tt>-&gt;name</tt> and <tt>-&gt;abbr</tt> fields distinguish
+between preemptible RCU (&ldquo;rcu_preempt&rdquo; and &ldquo;p&rdquo;)
+and non-preemptible RCU (&ldquo;rcu_sched&rdquo; and &ldquo;s&rdquo;).
+These fields are used for diagnostic and tracing purposes.

 <h3><a name="The rcu_node Structure">
 The <tt>rcu_node</tt> Structure</a></h3>
@ -971,25 +928,31 @@ this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
 pointer.
 The reason for this is that all the ready-to-invoke callbacks
 (that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
-all at once at callback-invocation time.
+all at once at callback-invocation time (<tt>rcu_do_batch</tt>), due
+to which <tt>-&gt;head</tt> may be set to NULL if there are no not-done
+callbacks remaining in the <tt>rcu_segcblist</tt>.
 If callback invocation must be postponed, for example, because a
 high-priority process just woke up on this CPU, then the remaining
-callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
-Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
-are adjusted after the corresponding callbacks have been invoked, and so
-again it is the <tt>-&gt;len</tt> count that accurately reflects whether
-or not there are callbacks associated with this <tt>rcu_segcblist</tt>
-structure.
+callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment and
+<tt>-&gt;head</tt> once again points to the start of the segment.
+In short, the head field can briefly be <tt>NULL</tt> even though the
+CPU has callbacks present the entire time.
+Therefore, it is not appropriate to test the <tt>-&gt;head</tt> pointer
+for <tt>NULL</tt>.
+
+<p>In contrast, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
+are adjusted only after the corresponding callbacks have been invoked.
+This means that the <tt>-&gt;len</tt> count is zero only if
+the <tt>rcu_segcblist</tt> structure really is devoid of callbacks.
 Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
-the use of appropriate synchronization, for example, memory barriers.
+careful use of appropriate synchronization, for example, memory barriers.
 This synchronization can be a bit subtle, particularly in the case
 of <tt>rcu_barrier()</tt>.

 <h3><a name="The rcu_data Structure">
 The <tt>rcu_data</tt> Structure</a></h3>

-<p>The <tt>rcu_data</tt> maintains the per-CPU state for the
-corresponding flavor of RCU.
+<p>The <tt>rcu_data</tt> maintains the per-CPU state for the RCU subsystem.
 The fields in this structure may be accessed only from the corresponding
 CPU (and from tracing) unless otherwise stated.
 This structure is the
@ -1015,30 +978,19 @@ as follows:

 <pre>
  1   int cpu;
-  2   struct rcu_state *rsp;
-  3   struct rcu_node *mynode;
-  4   struct rcu_dynticks *dynticks;
-  5   unsigned long grpmask;
-  6   bool beenonline;
+  2   struct rcu_node *mynode;
+  3   unsigned long grpmask;
+  4   bool beenonline;
 </pre>

 <p>The <tt>-&gt;cpu</tt> field contains the number of the
-corresponding CPU, the <tt>-&gt;rsp</tt> pointer references
-the corresponding <tt>rcu_state</tt> structure (and is most frequently
-used to locate the name of the corresponding flavor of RCU for tracing),
-and the <tt>-&gt;mynode</tt> field references the corresponding
-<tt>rcu_node</tt> structure.
+corresponding CPU and the <tt>-&gt;mynode</tt> field references the
+corresponding <tt>rcu_node</tt> structure.
 The <tt>-&gt;mynode</tt> is used to propagate quiescent states
 up the combining tree.
-<p>The <tt>-&gt;dynticks</tt> pointer references the
-<tt>rcu_dynticks</tt> structure corresponding to this
-CPU.
-Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
-structure is shared among all flavors of RCU.
-These first four fields are constant and therefore require not
-synchronization.
+These two fields are constant and therefore do not require synchronization.

-</p><p>The <tt>-&gt;grpmask</tt> field indicates the bit in
+<p>The <tt>-&gt;grpmask</tt> field indicates the bit in
 the <tt>-&gt;mynode-&gt;qsmask</tt> corresponding to this
 <tt>rcu_data</tt> structure, and is also used when propagating
 quiescent states.
@ -1057,12 +1009,12 @@ as follows:
  3   bool cpu_no_qs;
  4   bool core_needs_qs;
  5   bool gpwrap;
-  6   unsigned long rcu_qs_ctr_snap;
 </pre>

-<p>The <tt>-&gt;gp_seq</tt> and <tt>-&gt;gp_seq_needed</tt>
-fields are the counterparts of the fields of the same name
-in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
+<p>The <tt>-&gt;gp_seq</tt> field is the counterpart of the field of the same
+name in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.  The
+<tt>-&gt;gp_seq_needed</tt> field is the counterpart of the field of the same
+name in the rcu_node</tt> structure.
 They may each lag up to one behind their <tt>rcu_node</tt>
 counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
 <tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
@ -1103,10 +1055,6 @@ CPU has remained idle for so long that the
 <tt>gp_seq</tt> counter is in danger of overflow, which
 will cause the CPU to disregard the values of its counters on
 its next exit from idle.
-Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
-cases where a given operation has resulted in a quiescent state
-for all flavors of RCU, for example, <tt>cond_resched()</tt>
-when RCU has indicated a need for quiescent states.

 <h5>RCU Callback Handling</h5>

@ -1179,26 +1127,22 @@ Finally, the <tt>-&gt;dynticks_fqs</tt> field is used to
 count the number of times this CPU is determined to be in
 dyntick-idle state, and is used for tracing and debugging purposes.

-<h3><a name="The rcu_dynticks Structure">
-The <tt>rcu_dynticks</tt> Structure</a></h3>
-
-<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state
-for the corresponding CPU.
-Unlike the other structures, <tt>rcu_dynticks</tt> is not
-replicated over the different flavors of RCU.
-The fields in this structure may be accessed only from the corresponding
-CPU (and from tracing) unless otherwise stated.
-Its fields are as follows:
+<p>
+This portion of the rcu_data structure is declared as follows:

 <pre>
  1   long dynticks_nesting;
  2   long dynticks_nmi_nesting;
  3   atomic_t dynticks;
  4   bool rcu_need_heavy_qs;
-  5   unsigned long rcu_qs_ctr;
-  6   bool rcu_urgent_qs;
+  5   bool rcu_urgent_qs;
 </pre>

+<p>These fields in the rcu_data structure maintain the per-CPU dyntick-idle
+state for the corresponding CPU.
+The fields may be accessed only from the corresponding CPU (and from tracing)
+unless otherwise stated.
+
 <p>The <tt>-&gt;dynticks_nesting</tt> field counts the
 nesting depth of process execution, so that in normal circumstances
 this counter has value zero or one.
@ -1227,9 +1171,11 @@ to overflow the counter, this approach corrects the
 CPU enters the idle loop from process context.

 </p><p>The <tt>-&gt;dynticks</tt> field counts the corresponding
-CPU's transitions to and from dyntick-idle mode, so that this counter
-has an even value when the CPU is in dyntick-idle mode and an odd
-value otherwise.
+CPU's transitions to and from either dyntick-idle or user mode, so
+that this counter has an even value when the CPU is in dyntick-idle
+mode or user mode and an odd value otherwise. The transitions to/from
+user mode need to be counted for user mode adaptive-ticks support
+(see timers/NO_HZ.txt).

 </p><p>The <tt>-&gt;rcu_need_heavy_qs</tt> field is used
 to record the fact that the RCU core code would really like to
@ -1238,19 +1184,12 @@ it is willing to call for heavy-weight dyntick-counter operations.
 This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
 code, which provide a momentary idle sojourn in response.

-</p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
-quiescent states from <tt>cond_resched()</tt>.
-Because <tt>cond_resched()</tt> can execute quite frequently, this
-must be quite lightweight, as in a non-atomic increment of this
-per-CPU field.
-
 </p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
-the fact that the RCU core code would really like to see a quiescent
-state from the corresponding CPU, with the various other fields indicating
-just how badly RCU wants this quiescent state.
-This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
-code, which, if nothing else, non-atomically increment <tt>-&gt;rcu_qs_ctr</tt>
-in response.
+the fact that the RCU core code would really like to see a quiescent state from
+the corresponding CPU, with the various other fields indicating just how badly
+RCU wants this quiescent state.
+This flag is checked by RCU's context-switch path
+(<tt>rcu_note_context_switch</tt>) and the cond_resched code.

 <table>
 <tr><th>&nbsp;</th></tr>
@ -1372,8 +1311,7 @@ that is, if the CPU is currently idle.
 Accessor Functions</a></h3>

 <p>The following listing shows the
-<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first</tt>,
-<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and
+<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first</tt> and
 <tt>rcu_for_each_leaf_node()</tt> function and macros:

 <pre>
@ -1386,13 +1324,9 @@ Accessor Functions</a></h3>
  7   for ((rnp) = &amp;(rsp)-&gt;node[0]; \
  8        (rnp) &lt; &amp;(rsp)-&gt;node[NUM_RCU_NODES]; (rnp)++)
  9
- 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- 11   for ((rnp) = &amp;(rsp)-&gt;node[0]; \
- 12        (rnp) &lt; (rsp)-&gt;level[NUM_RCU_LVLS - 1]; (rnp)++)
- 13
- 14 #define rcu_for_each_leaf_node(rsp, rnp) \
- 15   for ((rnp) = (rsp)-&gt;level[NUM_RCU_LVLS - 1]; \
- 16        (rnp) &lt; &amp;(rsp)-&gt;node[NUM_RCU_NODES]; (rnp)++)
+ 10 #define rcu_for_each_leaf_node(rsp, rnp) \
+ 11   for ((rnp) = (rsp)-&gt;level[NUM_RCU_LVLS - 1]; \
+ 12        (rnp) &lt; &amp;(rsp)-&gt;node[NUM_RCU_NODES]; (rnp)++)
 </pre>

 <p>The <tt>rcu_get_root()</tt> simply returns a pointer to the
@ -1405,10 +1339,7 @@ macro takes advantage of the layout of the <tt>rcu_node</tt>
 structures in the <tt>rcu_state</tt> structure's
 <tt>-&gt;node[]</tt> array, performing a breadth-first traversal by
 simply traversing the array in order.
-The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates
-similarly, but traverses only the first part of the array, thus excluding
-the leaf <tt>rcu_node</tt> structures.
-Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only
+Similarly, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only
 the last part of the array, thus traversing only the leaf
 <tt>rcu_node</tt> structures.

@ -1416,15 +1347,14 @@ the last part of the array, thus traversing only the leaf
 <tr><th>&nbsp;</th></tr>
 <tr><th align="left">Quick Quiz:</th></tr>
 <tr><td>
-	What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and
+	What does
 	<tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree
 	contains only a single node?
 </td></tr>
 <tr><th align="left">Answer:</th></tr>
 <tr><td bgcolor="#ffffff"><font color="ffffff">
 	In the single-node case,
-	<tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op
-	and <tt>rcu_for_each_leaf_node()</tt> traverses the single node.
+	<tt>rcu_for_each_leaf_node()</tt> traverses the single node.
 </font></td></tr>
 <tr><td>&nbsp;</td></tr>
 </table>
@ -1432,11 +1362,11 @@ the last part of the array, thus traversing only the leaf
 <h3><a name="Summary">
 Summary</a></h3>

-So each flavor of RCU is represented by an <tt>rcu_state</tt> structure,
+So the state of RCU is represented by an <tt>rcu_state</tt> structure,
 which contains a combining tree of <tt>rcu_node</tt> and
 <tt>rcu_data</tt> structures.
 Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle
-state is tracked by an <tt>rcu_dynticks</tt> structure.
+state is tracked by dynticks-related fields in the <tt>rcu_data</tt> structure.

 If you made it this far, you are well prepared to read the code
 walkthroughs in the other articles in this series.
--- a/Documentation/RCU/Design/Data-Structures/blkd_task.svg
+++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/ExpSchedFlow.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/ExpSchedFlow.svg
@ -328,13 +328,13 @@
     inkscape:window-height="1148"
     id="namedview90"
     showgrid="true"
-     inkscape:zoom="0.80021373"
-     inkscape:cx="462.49289"
-     inkscape:cy="473.6718"
+     inkscape:zoom="0.69092787"
+     inkscape:cx="476.34085"
+     inkscape:cy="712.80957"
     inkscape:window-x="770"
     inkscape:window-y="24"
     inkscape:window-maximized="0"
-     inkscape:current-layer="g4114-9-3-9"
+     inkscape:current-layer="g4"
     inkscape:snap-grids="false"
     fit-margin-top="5"
     fit-margin-right="5"
@ -813,14 +813,18 @@
      <text
         sodipodi:linespacing="125%"
         id="text4110-5-7-6-2-4-0"
-         y="841.88086"
+         y="670.74316"
         x="1460.1007"
         style="font-size:267.24359131px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
         xml:space="preserve"><tspan
-           y="841.88086"
+           y="670.74316"
           x="1460.1007"
           sodipodi:role="line"
-           id="tspan4925-1-2-4-5">reched_cpu()</tspan></text>
+           id="tspan4925-1-2-4-5">Request</tspan><tspan
+           y="1004.7976"
+           x="1460.1007"
+           sodipodi:role="line"
+           id="tspan3100">context switch</tspan></text>
    </g>
  </g>
 </svg>
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@ -12,10 +12,9 @@ high efficiency and minimal disturbance, expedited grace periods accept
 lower efficiency and significant disturbance to attain shorter latencies.

 <p>
-There are three flavors of RCU (RCU-bh, RCU-preempt, and RCU-sched),
-but only two flavors of expedited grace periods because the RCU-bh
-expedited grace period maps onto the RCU-sched expedited grace period.
-Each of the remaining two implementations is covered in its own section.
+There are two flavors of RCU (RCU-preempt and RCU-sched), with an earlier
+third RCU-bh flavor having been implemented in terms of the other two.
+Each of the two implementations is covered in its own section.

 <ol>
 <li>	<a href="#Expedited Grace Period Design">
@ -73,10 +72,10 @@ will ignore it because idle and offline CPUs are already residing
 in quiescent states.
 Otherwise, the expedited grace period will use
 <tt>smp_call_function_single()</tt> to send the CPU an IPI, which
-is handled by <tt>sync_rcu_exp_handler()</tt>.
+is handled by <tt>rcu_exp_handler()</tt>.

 <p>
-However, because this is preemptible RCU, <tt>sync_rcu_exp_handler()</tt>
+However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt>
 can check to see if the CPU is currently running in an RCU read-side
 critical section.
 If not, the handler can immediately report a quiescent state.
@ -146,24 +145,23 @@ expedited grace period is shown in the following diagram:
 <p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%">

 <p>
-As with RCU-preempt's <tt>synchronize_rcu_expedited()</tt>,
+As with RCU-preempt, RCU-sched's
 <tt>synchronize_sched_expedited()</tt> ignores offline and
 idle CPUs, again because they are in remotely detectable
 quiescent states.
-However, the <tt>synchronize_rcu_expedited()</tt> handler
-is <tt>sync_sched_exp_handler()</tt>, and because the
+However, because the
 <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
 leave no trace of their invocation, in general it is not possible to tell
 whether or not the current CPU is in an RCU read-side critical section.
-The best that <tt>sync_sched_exp_handler()</tt> can do is to check
+The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check
 for idle, on the off-chance that the CPU went idle while the IPI
 was in flight.
-If the CPU is idle, then tt>sync_sched_exp_handler()</tt> reports
+If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports
 the quiescent state.

-<p>
-Otherwise, the handler invokes <tt>resched_cpu()</tt>, which forces
-a future context switch.
+<p> Otherwise, the handler forces a future context switch by setting the
+NEED_RESCHED flag of the current task's thread flag and the CPU preempt
+counter.
 At the time of the context switch, the CPU reports the quiescent state.
 Should the CPU go offline first, it will report the quiescent state
 at that time.
@ -299,19 +297,18 @@ Instead, the task pushing the grace period forward will include the
 idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>.

 <p>
-For RCU-sched, there is an additional check for idle in the IPI
-handler, <tt>sync_sched_exp_handler()</tt>.
+For RCU-sched, there is an additional check:
 If the IPI has interrupted the idle loop, then
-<tt>sync_sched_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
+<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
 to report the corresponding quiescent state.

 <p>
 For RCU-preempt, there is no specific check for idle in the
-IPI handler (<tt>sync_rcu_exp_handler()</tt>), but because
+IPI handler (<tt>rcu_exp_handler()</tt>), but because
 RCU read-side critical sections are not permitted within the
-idle loop, if <tt>sync_rcu_exp_handler()</tt> sees that the CPU is within
+idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within
 RCU read-side critical section, the CPU cannot possibly be idle.
-Otherwise, <tt>sync_rcu_exp_handler()</tt> invokes
+Otherwise, <tt>rcu_exp_handler()</tt> invokes
 <tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent
 state, regardless of whether or not that quiescent state was due to
 the CPU being idle.
@ -626,6 +623,8 @@ checks, but only during the mid-boot dead zone.
 <p>
 With this refinement, synchronous grace periods can now be used from
 task context pretty much any time during the life of the kernel.
+That is, aside from some points in the suspend, hibernate, or shutdown
+code path.

 <h3><a name="Summary">
 Summary</a></h3>
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
@ -77,7 +77,7 @@ The key point is that the lock-acquisition functions, including
 <tt>smp_mb__after_unlock_lock()</tt> immediately after successful
 acquisition of the lock.

-<p>Therefore, for any given <tt>rcu_node</tt> struction, any access
+<p>Therefore, for any given <tt>rcu_node</tt> structure, any access
 happening before one of the above lock-release functions will be seen
 by all CPUs as happening before any access happening after a later
 one of the above lock-acquisition functions.
@ -485,13 +485,13 @@ section that the grace period must wait on.
 noted by <tt>rcu_node_context_switch()</tt> on the left.
 On the other hand, if the CPU takes a scheduler-clock interrupt
 while executing in usermode, a quiescent state will be noted by
-<tt>rcu_check_callbacks()</tt> on the right.
+<tt>rcu_sched_clock_irq()</tt> on the right.
 Either way, the passage through a quiescent state will be noted
 in a per-CPU variable.

 <p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on
 this CPU (for example, after the next scheduler-clock
-interrupt), <tt>__rcu_process_callbacks()</tt> will invoke
+interrupt), <tt>rcu_core()</tt> will invoke
 <tt>rcu_check_quiescent_state()</tt>, which will notice the
 recorded quiescent state, and invoke
 <tt>rcu_report_qs_rdp()</tt>.
@ -651,7 +651,7 @@ to end.
 These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
 which is usually invoked by <tt>__note_gp_changes()</tt>.
 As shown in the diagram below, this invocation can be triggered by
-the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on
+the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
 the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
 the right, but only for kernels build with
 <tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg
@ -349,7 +349,7 @@
       font-weight="bold"
       font-size="192"
       id="text202-7-5"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
    <rect
       x="7069.6187"
       y="5087.4678"
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
@ -3902,7 +3902,7 @@
         font-style="normal"
         y="-4418.6582"
         x="3745.7725"
-         xml:space="preserve">rcu_check_callbacks()</text>
+         xml:space="preserve">rcu_sched_clock_irq()</text>
    </g>
    <g
       transform="translate(-850.30204,55463.106)"
@ -3924,7 +3924,7 @@
         font-style="normal"
         y="-4418.6582"
         x="3745.7725"
-         xml:space="preserve">rcu_process_callbacks()</text>
+         xml:space="preserve">rcu_core()</text>
      <text
         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
         id="text202-7-5-3-27-0"
@ -3933,7 +3933,7 @@
         font-style="normal"
         y="-4165.7954"
         x="3745.7725"
-         xml:space="preserve">rcu_check_quiescent_state())</text>
+         xml:space="preserve">rcu_check_quiescent_state()</text>
      <text
         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
         id="text202-7-5-3-27-0-9"
@ -4968,7 +4968,7 @@
       font-weight="bold"
       font-size="192"
       id="text202-7-5-19"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
    <rect
       x="5314.2671"
       y="82817.688"
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg
@ -775,7 +775,7 @@
         font-style="normal"
         y="-4418.6582"
         x="3745.7725"
-         xml:space="preserve">rcu_check_callbacks()</text>
+         xml:space="preserve">rcu_sched_clock_irq()</text>
    </g>
    <g
       transform="translate(399.7744,828.86448)"
@ -797,7 +797,7 @@
         font-style="normal"
         y="-4418.6582"
         x="3745.7725"
-         xml:space="preserve">rcu_process_callbacks()</text>
+         xml:space="preserve">rcu_core()</text>
      <text
         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
         id="text202-7-5-3-27-0"
@ -806,7 +806,7 @@
         font-style="normal"
         y="-4165.7954"
         x="3745.7725"
-         xml:space="preserve">rcu_check_quiescent_state())</text>
+         xml:space="preserve">rcu_check_quiescent_state()</text>
      <text
         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
         id="text202-7-5-3-27-0-9"
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@ -900,8 +900,6 @@ Except where otherwise noted, these non-guarantees were premeditated.
 	Grace Periods Don't Partition Read-Side Critical Sections</a>
 <li>	<a href="#Read-Side Critical Sections Don't Partition Grace Periods">
 	Read-Side Critical Sections Don't Partition Grace Periods</a>
-<li>	<a href="#Disabling Preemption Does Not Block Grace Periods">
-	Disabling Preemption Does Not Block Grace Periods</a>
 </ol>

 <h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
@ -1259,56 +1257,6 @@ of RCU grace periods.
 <tr><td>&nbsp;</td></tr>
 </table>

-<h3><a name="Disabling Preemption Does Not Block Grace Periods">
-Disabling Preemption Does Not Block Grace Periods</a></h3>
-
-<p>
-There was a time when disabling preemption on any given CPU would block
-subsequent grace periods.
-However, this was an accident of implementation and is not a requirement.
-And in the current Linux-kernel implementation, disabling preemption
-on a given CPU in fact does not block grace periods, as Oleg Nesterov
-<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
-
-<p>
-If you need a preempt-disable region to block grace periods, you need to add
-<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
-as follows:
-
-<blockquote>
-<pre>
- 1 preempt_disable();
- 2 rcu_read_lock();
- 3 do_something();
- 4 rcu_read_unlock();
- 5 preempt_enable();
- 6
- 7 /* Spinlocks implicitly disable preemption. */
- 8 spin_lock(&amp;mylock);
- 9 rcu_read_lock();
-10 do_something();
-11 rcu_read_unlock();
-12 spin_unlock(&amp;mylock);
-</pre>
-</blockquote>
-
-<p>
-In theory, you could enter the RCU read-side critical section first,
-but it is more efficient to keep the entire RCU read-side critical
-section contained in the preempt-disable region as shown above.
-Of course, RCU read-side critical sections that extend outside of
-preempt-disable regions will work correctly, but such critical sections
-can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
-more work.
-And no, this is <i>not</i> an invitation to enclose all of your RCU
-read-side critical sections within preempt-disable regions, because
-doing so would degrade real-time response.
-
-<p>
-This non-requirement appeared with preemptible RCU.
-If you need a grace period that waits on non-preemptible code regions, use
-<a href="#Sched Flavor">RCU-sched</a>.
-
 <h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>

 <p>
@ -1383,6 +1331,7 @@ Classes of quality-of-implementation requirements are as follows:
 <ol>
 <li>	<a href="#Specialization">Specialization</a>
 <li>	<a href="#Performance and Scalability">Performance and Scalability</a>
+<li>	<a href="#Forward Progress">Forward Progress</a>
 <li>	<a href="#Composability">Composability</a>
 <li>	<a href="#Corner Cases">Corner Cases</a>
 </ol>
@ -1647,7 +1596,7 @@ used in place of <tt>synchronize_rcu()</tt> as follows:
 16   struct foo *p;
 17
 18   spin_lock(&amp;gp_lock);
-19   p = rcu_dereference(gp);
+19   p = rcu_access_pointer(gp);
 20   if (!p) {
 21     spin_unlock(&amp;gp_lock);
 22     return false;
@ -1824,6 +1773,106 @@ so it is too early to tell whether they will stand the test of time.
 RCU thus provides a range of tools to allow updaters to strike the
 required tradeoff between latency, flexibility and CPU overhead.

+<h3><a name="Forward Progress">Forward Progress</a></h3>
+
+<p>
+In theory, delaying grace-period completion and callback invocation
+is harmless.
+In practice, not only are memory sizes finite but also callbacks sometimes
+do wakeups, and sufficiently deferred wakeups can be difficult
+to distinguish from system hangs.
+Therefore, RCU must provide a number of mechanisms to promote forward
+progress.
+
+<p>
+These mechanisms are not foolproof, nor can they be.
+For one simple example, an infinite loop in an RCU read-side critical
+section must by definition prevent later grace periods from ever completing.
+For a more involved example, consider a 64-CPU system built with
+<tt>CONFIG_RCU_NOCB_CPU=y</tt> and booted with <tt>rcu_nocbs=1-63</tt>,
+where CPUs&nbsp;1 through&nbsp;63 spin in tight loops that invoke
+<tt>call_rcu()</tt>.
+Even if these tight loops also contain calls to <tt>cond_resched()</tt>
+(thus allowing grace periods to complete), CPU&nbsp;0 simply will
+not be able to invoke callbacks as fast as the other 63 CPUs can
+register them, at least not until the system runs out of memory.
+In both of these examples, the Spiderman principle applies:  With great
+power comes great responsibility.
+However, short of this level of abuse, RCU is required to
+ensure timely completion of grace periods and timely invocation of
+callbacks.
+
+<p>
+RCU takes the following steps to encourage timely completion of
+grace periods:
+
+<ol>
+<li>	If a grace period fails to complete within 100&nbsp;milliseconds,
+	RCU causes future invocations of <tt>cond_resched()</tt> on
+	the holdout CPUs to provide an RCU quiescent state.
+	RCU also causes those CPUs' <tt>need_resched()</tt> invocations
+	to return <tt>true</tt>, but only after the corresponding CPU's
+	next scheduling-clock.
+<li>	CPUs mentioned in the <tt>nohz_full</tt> kernel boot parameter
+	can run indefinitely in the kernel without scheduling-clock
+	interrupts, which defeats the above <tt>need_resched()</tt>
+	strategem.
+	RCU will therefore invoke <tt>resched_cpu()</tt> on any
+	<tt>nohz_full</tt> CPUs still holding out after
+	109&nbsp;milliseconds.
+<li>	In kernels built with <tt>CONFIG_RCU_BOOST=y</tt>, if a given
+	task that has been preempted within an RCU read-side critical
+	section is holding out for more than 500&nbsp;milliseconds,
+	RCU will resort to priority boosting.
+<li>	If a CPU is still holding out 10&nbsp;seconds into the grace
+	period, RCU will invoke <tt>resched_cpu()</tt> on it regardless
+	of its <tt>nohz_full</tt> state.
+</ol>
+
+<p>
+The above values are defaults for systems running with <tt>HZ=1000</tt>.
+They will vary as the value of <tt>HZ</tt> varies, and can also be
+changed using the relevant Kconfig options and kernel boot parameters.
+RCU currently does not do much sanity checking of these
+parameters, so please use caution when changing them.
+Note that these forward-progress measures are provided only for RCU,
+not for
+<a href="#Sleepable RCU">SRCU</a> or
+<a href="#Tasks RCU">Tasks RCU</a>.
+
+<p>
+RCU takes the following steps in <tt>call_rcu()</tt> to encourage timely
+invocation of callbacks when any given non-<tt>rcu_nocbs</tt> CPU has
+10,000 callbacks, or has 10,000 more callbacks than it had the last time
+encouragement was provided:
+
+<ol>
+<li>	Starts a grace period, if one is not already in progress.
+<li>	Forces immediate checking for quiescent states, rather than
+	waiting for three milliseconds to have elapsed since the
+	beginning of the grace period.
+<li>	Immediately tags the CPU's callbacks with their grace period
+	completion numbers, rather than waiting for the <tt>RCU_SOFTIRQ</tt>
+	handler to get around to it.
+<li>	Lifts callback-execution batch limits, which speeds up callback
+	invocation at the expense of degrading realtime response.
+</ol>
+
+<p>
+Again, these are default values when running at <tt>HZ=1000</tt>,
+and can be overridden.
+Again, these forward-progress measures are provided only for RCU,
+not for
+<a href="#Sleepable RCU">SRCU</a> or
+<a href="#Tasks RCU">Tasks RCU</a>.
+Even for RCU, callback-invocation forward progress for <tt>rcu_nocbs</tt>
+CPUs is much less well-developed, in part because workloads benefiting
+from <tt>rcu_nocbs</tt> CPUs tend to invoke <tt>call_rcu()</tt>
+relatively infrequently.
+If workloads emerge that need both <tt>rcu_nocbs</tt> CPUs and high
+<tt>call_rcu()</tt> invocation rates, then additional forward-progress
+work will be required.
+
 <h3><a name="Composability">Composability</a></h3>

 <p>
@ -2165,14 +2214,9 @@ however, this is not a panacea because there would be severe restrictions
 on what operations those callbacks could invoke.

 <p>
-Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
-<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
-(<a href="#Bottom-Half Flavor">discussed below</a>),
-<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
+Perhaps surprisingly, <tt>synchronize_rcu()</tt> and
 <tt>synchronize_rcu_expedited()</tt>,
-<tt>synchronize_rcu_bh_expedited()</tt>, and
-<tt>synchronize_sched_expedited()</tt>
-will all operate normally
+will operate normally
 during very early boot, the reason being that there is only one CPU
 and preemption is disabled.
 This means that the call <tt>synchronize_rcu()</tt> (or friends)
@ -2269,12 +2313,23 @@ Thankfully, RCU update-side primitives, including
 The name notwithstanding, some Linux-kernel architectures
 can have nested NMIs, which RCU must handle correctly.
 Andy Lutomirski
-<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
+<a href="https://lkml.kernel.org/r/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
 with this requirement;
 he also kindly surprised me with
-<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
+<a href="https://lkml.kernel.org/r/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
 that meets this requirement.

+<p>
+Furthermore, NMI handlers can be interrupted by what appear to RCU
+to be normal interrupts.
+One way that this can happen is for code that directly invokes
+<tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt> to be called
+from an NMI handler.
+This astonishing fact of life prompted the current code structure,
+which has <tt>rcu_irq_enter()</tt> invoking <tt>rcu_nmi_enter()</tt>
+and <tt>rcu_irq_exit()</tt> invoking <tt>rcu_nmi_exit()</tt>.
+And yes, I also learned of this requirement the hard way.
+
 <h3><a name="Loadable Modules">Loadable Modules</a></h3>

 <p>
@ -2290,7 +2345,7 @@ via <tt>del_timer_sync()</tt> or similar.
 <p>
 Unfortunately, there is no way to cancel an RCU callback;
 once you invoke <tt>call_rcu()</tt>, the callback function is
-going to eventually be invoked, unless the system goes down first.
+eventually going to be invoked, unless the system goes down first.
 Because it is normally considered socially irresponsible to crash the system
 in response to a module unload request, we need some other way
 to deal with in-flight RCU callbacks.
@ -2394,30 +2449,9 @@ when invoked from a CPU-hotplug notifier.
 <p>
 RCU depends on the scheduler, and the scheduler uses RCU to
 protect some of its data structures.
-This means the scheduler is forbidden from acquiring
-the runqueue locks and the priority-inheritance locks
-in the middle of an outermost RCU read-side critical section unless either
-(1)&nbsp;it releases them before exiting that same
-RCU read-side critical section, or
-(2)&nbsp;interrupts are disabled across
-that entire RCU read-side critical section.
-This same prohibition also applies (recursively!) to any lock that is acquired
-while holding any lock to which this prohibition applies.
-Adhering to this rule prevents preemptible RCU from invoking
-<tt>rcu_read_unlock_special()</tt> while either runqueue or
-priority-inheritance locks are held, thus avoiding deadlock.
-
-<p>
-Prior to v4.4, it was only necessary to disable preemption across
-RCU read-side critical sections that acquired scheduler locks.
-In v4.4, expedited grace periods started using IPIs, and these
-IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
-Therefore, this expedited-grace-period change required disabling of
-interrupts, not just preemption.
-
-<p>
-For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
-implementation must be written carefully to avoid similar deadlocks.
+The preemptible-RCU <tt>rcu_read_unlock()</tt>
+implementation must therefore be written carefully to avoid deadlocks
+involving the scheduler's runqueue and priority-inheritance locks.
 In particular, <tt>rcu_read_unlock()</tt> must tolerate an
 interrupt where the interrupt handler invokes both
 <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
@ -2426,7 +2460,7 @@ negative nesting levels to avoid destructive recursion via
 interrupt handler's use of RCU.

 <p>
-This pair of mutual scheduler-RCU requirements came as a
+This scheduler-RCU requirement came as a
 <a href="https://lwn.net/Articles/453002/">complete surprise</a>.

 <p>
@ -2437,9 +2471,42 @@ when running context-switch-heavy workloads when built with
 <tt>CONFIG_NO_HZ_FULL=y</tt>
 <a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
 RCU has made good progress towards meeting this requirement, even
-for context-switch-have <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
+for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
 but there is room for further improvement.

+<p>
+It is forbidden to hold any of scheduler's runqueue or priority-inheritance
+spinlocks across an <tt>rcu_read_unlock()</tt> unless interrupts have been
+disabled across the entire RCU read-side critical section, that is,
+up to and including the matching <tt>rcu_read_lock()</tt>.
+Violating this restriction can result in deadlocks involving these
+scheduler spinlocks.
+There was hope that this restriction might be lifted when interrupt-disabled
+calls to <tt>rcu_read_unlock()</tt> started deferring the reporting of
+the resulting RCU-preempt quiescent state until the end of the corresponding
+interrupts-disabled region.
+Unfortunately, timely reporting of the corresponding quiescent state
+to expedited grace periods requires a call to <tt>raise_softirq()</tt>,
+which can acquire these scheduler spinlocks.
+In addition, real-time systems using RCU priority boosting
+need this restriction to remain in effect because deferred
+quiescent-state reporting would also defer deboosting, which in turn
+would degrade real-time latencies.
+
+<p>
+In theory, if a given RCU read-side critical section could be
+guaranteed to be less than one second in duration, holding a scheduler
+spinlock across that critical section's <tt>rcu_read_unlock()</tt>
+would require only that preemption be disabled across the entire
+RCU read-side critical section, not interrupts.
+Unfortunately, given the possibility of vCPU preemption, long-running
+interrupts, and so on, it is not possible in practice to guarantee
+that a given RCU read-side critical section will complete in less than
+one second.
+Therefore, as noted above, if scheduler spinlocks are held across
+a given call to <tt>rcu_read_unlock()</tt>, interrupts must be
+disabled across the entire RCU read-side critical section.
+
 <h3><a name="Tracing and RCU">Tracing and RCU</a></h3>

 <p>
@ -2850,15 +2917,22 @@ The other four flavors are listed below, with requirements for each
 described in a separate section.

 <ol>
-<li>	<a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
-<li>	<a href="#Sched Flavor">Sched Flavor</a>
+<li>	<a href="#Bottom-Half Flavor">Bottom-Half Flavor (Historical)</a>
+<li>	<a href="#Sched Flavor">Sched Flavor (Historical)</a>
 <li>	<a href="#Sleepable RCU">Sleepable RCU</a>
 <li>	<a href="#Tasks RCU">Tasks RCU</a>
-<li>	<a href="#Waiting for Multiple Grace Periods">
-	Waiting for Multiple Grace Periods</a>
 </ol>

-<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
+<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor (Historical)</a></h3>
+
+<p>
+The RCU-bh flavor of RCU has since been expressed in terms of
+the other RCU flavors as part of a consolidation of the three
+flavors into a single flavor.
+The read-side API remains, and continues to disable softirq and to
+be accounted for by lockdep.
+Much of the material in this section is therefore strictly historical
+in nature.

 <p>
 The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
@ -2918,8 +2992,20 @@ includes
 <tt>call_rcu_bh()</tt>,
 <tt>rcu_barrier_bh()</tt>, and
 <tt>rcu_read_lock_bh_held()</tt>.
+However, the update-side APIs are now simple wrappers for other RCU
+flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt
+otherwise.

-<h3><a name="Sched Flavor">Sched Flavor</a></h3>
+<h3><a name="Sched Flavor">Sched Flavor (Historical)</a></h3>
+
+<p>
+The RCU-sched flavor of RCU has since been expressed in terms of
+the other RCU flavors as part of a consolidation of the three
+flavors into a single flavor.
+The read-side API remains, and continues to disable preemption and to
+be accounted for by lockdep.
+Much of the material in this section is therefore strictly historical
+in nature.

 <p>
 Before preemptible RCU, waiting for an RCU grace period had the
@ -3013,7 +3099,7 @@ If you block forever in one of a given domain's SRCU read-side critical
 sections, then that domain's grace periods will also be blocked forever.
 Of course, one good way to block forever is to deadlock, which can
 happen if any operation in a given domain's SRCU read-side critical
-section can block waiting, either directly or indirectly, for that domain's
+section can wait, either directly or indirectly, for that domain's
 grace period to elapse.
 For example, this results in a self-deadlock:

@ -3053,12 +3139,18 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
 guarantees a full memory barrier.

 <p>
-Also unlike other RCU flavors, SRCU's callbacks-wait function
-<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
-though this is not necessarily a good idea.
-The reason that this is possible is that SRCU is insensitive
-to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
-need not exclude CPU-hotplug operations.
+Also unlike other RCU flavors, <tt>synchronize_srcu()</tt> may <b>not</b>
+be invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
+periods make use of timers and the possibility of timers being temporarily
+&ldquo;stranded&rdquo; on the outgoing CPU.
+This stranding of timers means that timers posted to the outgoing CPU
+will not fire until late in the CPU-hotplug process.
+The problem is that if a notifier is waiting on an SRCU grace period,
+that grace period is waiting on a timer, and that timer is stranded on the
+outgoing CPU, then the notifier will never be awakened, in other words,
+deadlock has occurred.
+This same situation of course also prohibits <tt>srcu_barrier()</tt>
+from being invoked from CPU-hotplug notifiers.

 <p>
 SRCU also differs from other RCU flavors in that SRCU's expedited and
@ -3139,94 +3231,14 @@ The tasks-RCU API is quite compact, consisting only of
 <tt>call_rcu_tasks()</tt>,
 <tt>synchronize_rcu_tasks()</tt>, and
 <tt>rcu_barrier_tasks()</tt>.
-
-<h3><a name="Waiting for Multiple Grace Periods">
-Waiting for Multiple Grace Periods</a></h3>
-
-<p>
-Perhaps you have an RCU protected data structure that is accessed from
-RCU read-side critical sections, from softirq handlers, and from
-hardware interrupt handlers.
-That is three flavors of RCU, the normal flavor, the bottom-half flavor,
-and the sched flavor.
-How to wait for a compound grace period?
-
-<p>
-The best approach is usually to &ldquo;just say no!&rdquo; and
-insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>
-around each RCU read-side critical section, regardless of what
-environment it happens to be in.
-But suppose that some of the RCU read-side critical sections are
-on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt>
-is not a viable option, so that <tt>rcu_read_lock()</tt> and
-<tt>rcu_read_unlock()</tt> are not free.
-What then?
-
-<p>
-You <i>could</i> wait on all three grace periods in succession, as follows:
-
-<blockquote>
-<pre>
- 1 synchronize_rcu();
- 2 synchronize_rcu_bh();
- 3 synchronize_sched();
-</pre>
-</blockquote>
-
-<p>
-This works, but triples the update-side latency penalty.
-In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt>
-may be used to wait on all three flavors of grace period concurrently:
-
-<blockquote>
-<pre>
- 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
-</pre>
-</blockquote>
-
-<p>
-But what if it is necessary to also wait on SRCU?
-This can be done as follows:
-
-<blockquote>
-<pre>
- 1 static void call_my_srcu(struct rcu_head *head,
- 2        void (*func)(struct rcu_head *head))
- 3 {
- 4   call_srcu(&amp;my_srcu, head, func);
- 5 }
- 6
- 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
-</pre>
-</blockquote>
-
-<p>
-If you needed to wait on multiple different flavors of SRCU
-(but why???), you would need to create a wrapper function resembling
-<tt>call_my_srcu()</tt> for each SRCU flavor.
-
-<table>
-<tr><th>&nbsp;</th></tr>
-<tr><th align="left">Quick Quiz:</th></tr>
-<tr><td>
-	But what if I need to wait for multiple RCU flavors, but I also need
-	the grace periods to be expedited?
-</td></tr>
-<tr><th align="left">Answer:</th></tr>
-<tr><td bgcolor="#ffffff"><font color="ffffff">
-	If you are using expedited grace periods, there should be less penalty
-	for waiting on them in succession.
-	But if that is nevertheless a problem, you can use workqueues
-	or multiple kthreads to wait on the various expedited grace
-	periods concurrently.
-</font></td></tr>
-<tr><td>&nbsp;</td></tr>
-</table>
-
-<p>
-Again, it is usually better to adjust the RCU read-side critical sections
-to use a single flavor of RCU, but when this is not feasible, you can use
-<tt>synchronize_rcu_mult()</tt>.
+In <tt>CONFIG_PREEMPT=n</tt> kernels, trampolines cannot be preempted,
+so these APIs map to
+<tt>call_rcu()</tt>,
+<tt>synchronize_rcu()</tt>, and
+<tt>rcu_barrier()</tt>, respectively.
+In <tt>CONFIG_PREEMPT=y</tt> kernels, trampolines can be preempted,
+and these three APIs are therefore implemented by separate functions
+that check for voluntary context switches.

 <h2><a name="Possible Future Changes">Possible Future Changes</a></h2>

@ -3237,12 +3249,6 @@ If this becomes a serious problem, it will be necessary to rework the
 grace-period state machine so as to avoid the need for the additional
 latency.

-<p>
-Expedited grace periods scan the CPUs, so their latency and overhead
-increases with increasing numbers of CPUs.
-If this becomes a serious problem on large systems, it will be necessary
-to do some redesign to avoid this scalability problem.
-
 <p>
 RCU disables CPU hotplug in a few places, perhaps most notably in the
 <tt>rcu_barrier()</tt> operations.
@ -3287,11 +3293,6 @@ Please note that arrangements that require RCU to remap CPU numbers will
 require extremely good demonstration of need and full exploration of
 alternatives.

-<p>
-There is an embarrassingly large number of flavors of RCU, and this
-number has been increasing over time.
-Perhaps it will be possible to combine some at some future date.
-
 <p>
 RCU's various kthreads are reasonably recent additions.
 It is quite likely that adjustments will be required to more gracefully
@ -3303,6 +3304,11 @@ For example, RCU callback overhead might be charged back to the
 originating <tt>call_rcu()</tt> instance, though probably not
 in production kernels.

+<p>
+Additional work may be required to provide reasonable forward-progress
+guarantees under heavy load for grace periods and for callback
+invocation.
+
 <h2><a name="Summary">Summary</a></h2>

 <p>
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome!
 	pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
 	rcu_read_lock_sched(), or by the appropriate update-side lock.
 	Disabling of preemption can serve as rcu_read_lock_sched(), but
-	is less readable.
+	is less readable and prevents lockdep from detecting locking issues.

 	Letting RCU-protected pointers "leak" out of an RCU read-side
 	critical section is every bid as bad as letting them leak out
@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome!
 		here is that superuser already has lots of ways to crash
 		the machine.

-	d.	Use call_rcu_bh() rather than call_rcu(), in order to take
-		advantage of call_rcu_bh()'s faster grace periods.  (This
-		is only a partial solution, though.)
-
-	e.	Periodically invoke synchronize_rcu(), permitting a limited
+	d.	Periodically invoke synchronize_rcu(), permitting a limited
 		number of updates per grace period.

 	The same cautions apply to call_rcu_bh(), call_rcu_sched(),
@ -324,37 +320,14 @@ over a rather long period of time, but improvements are always welcome!
 	will break Alpha, cause aggressive compilers to generate bad code,
 	and confuse people trying to read your code.

-11.	Note that synchronize_rcu() -only- guarantees to wait until
-	all currently executing rcu_read_lock()-protected RCU read-side
-	critical sections complete.  It does -not- necessarily guarantee
-	that all currently running interrupts, NMIs, preempt_disable()
-	code, or idle loops will complete.  Therefore, if your
-	read-side critical sections are protected by something other
-	than rcu_read_lock(), do -not- use synchronize_rcu().
-
-	Similarly, disabling preemption is not an acceptable substitute
-	for rcu_read_lock().  Code that attempts to use preemption
-	disabling where it should be using rcu_read_lock() will break
-	in CONFIG_PREEMPT=y kernel builds.
-
-	If you want to wait for interrupt handlers, NMI handlers, and
-	code under the influence of preempt_disable(), you instead
-	need to use synchronize_irq() or synchronize_sched().
-
-	This same limitation also applies to synchronize_rcu_bh()
-	and synchronize_srcu(), as well as to the asynchronous and
-	expedited forms of the three primitives, namely call_rcu(),
-	call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
-	synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
-
-12.	Any lock acquired by an RCU callback must be acquired elsewhere
+11.	Any lock acquired by an RCU callback must be acquired elsewhere
 	with softirq disabled, e.g., via spin_lock_irqsave(),
 	spin_lock_bh(), etc.  Failing to disable irq on a given
 	acquisition of that lock will result in deadlock as soon as
 	the RCU softirq handler happens to run your RCU callback while
 	interrupting that acquisition's critical section.

-13.	RCU callbacks can be and are executed in parallel.  In many cases,
+12.	RCU callbacks can be and are executed in parallel.  In many cases,
 	the callback code simply wrappers around kfree(), so that this
 	is not an issue (or, more accurately, to the extent that it is
 	an issue, the memory-allocator locking handles it).  However,
@ -370,7 +343,7 @@ over a rather long period of time, but improvements are always welcome!
 	not the case, a self-spawning RCU callback would prevent the
 	victim CPU from ever going offline.)

-14.	Unlike other forms of RCU, it -is- permissible to block in an
+13.	Unlike other forms of RCU, it -is- permissible to block in an
 	SRCU read-side critical section (demarked by srcu_read_lock()
 	and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
 	Please note that if you don't need to sleep in read-side critical
@ -414,7 +387,7 @@ over a rather long period of time, but improvements are always welcome!
 	Note that rcu_dereference() and rcu_assign_pointer() relate to
 	SRCU just as they do to other forms of RCU.

-15.	The whole point of call_rcu(), synchronize_rcu(), and friends
+14.	The whole point of call_rcu(), synchronize_rcu(), and friends
 	is to wait until all pre-existing readers have finished before
 	carrying out some otherwise-destructive operation.  It is
 	therefore critically important to -first- remove any path
@ -426,13 +399,13 @@ over a rather long period of time, but improvements are always welcome!
 	is the caller's responsibility to guarantee that any subsequent
 	readers will execute safely.

-16.	The various RCU read-side primitives do -not- necessarily contain
+15.	The various RCU read-side primitives do -not- necessarily contain
 	memory barriers.  You should therefore plan for the CPU
 	and the compiler to freely reorder code into and out of RCU
 	read-side critical sections.  It is the responsibility of the
 	RCU update-side primitives to deal with this.

-17.	Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
+16.	Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
 	__rcu sparse checks to validate your RCU code.	These can help
 	find problems as follows:

@ -455,7 +428,7 @@ over a rather long period of time, but improvements are always welcome!
 	These debugging aids can help you find problems that are
 	otherwise extremely difficult to spot.

-18.	If you register a callback using call_rcu(), call_rcu_bh(),
+17.	If you register a callback using call_rcu(), call_rcu_bh(),
 	call_rcu_sched(), or call_srcu(), and pass in a function defined
 	within a loadable module, then it in necessary to wait for
 	all pending callbacks to be invoked after the last invocation
@ -469,8 +442,8 @@ over a rather long period of time, but improvements are always welcome!
 	You instead need to use one of the barrier functions:

 	o	call_rcu() -> rcu_barrier()
-	o	call_rcu_bh() -> rcu_barrier_bh()
-	o	call_rcu_sched() -> rcu_barrier_sched()
+	o	call_rcu_bh() -> rcu_barrier()
+	o	call_rcu_sched() -> rcu_barrier()
 	o	call_srcu() -> srcu_barrier()

 	However, these barrier functions are absolutely -not- guaranteed
--- a/Documentation/RCU/lockdep-splat.txt
+++ b/Documentation/RCU/lockdep-splat.txt
@ -14,9 +14,9 @@ being the real world and all that.
 So let's look at an example RCU lockdep splat from 3.0-rc5, one that
 has long since been fixed:

-===============================
-[ INFO: suspicious RCU usage. ]
-------------------------------
+=============================
+WARNING: suspicious RCU usage
+-----------------------------
 block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!

 other info that might help us debug this:
@ -24,11 +24,11 @@ other info that might help us debug this:

 rcu_scheduler_active = 1, debug_locks = 0
 3 locks held by scsi_scan_6/1552:
- #0:  (&shost->scan_mutex){+.+.+.}, at: [<ffffffff8145efca>]
+ #0:  (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
 scsi_scan_host_selected+0x5a/0x150
- #1:  (&eq->sysfs_lock){+.+...}, at: [<ffffffff812a5032>]
+ #1:  (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
 elevator_exit+0x22/0x60
- #2:  (&(&q->__queue_lock)->rlock){-.-...}, at: [<ffffffff812b6233>]
+ #2:  (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
 cfq_exit_queue+0x43/0x190

 stack backtrace:
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@ -87,7 +87,3 @@ o	Where can I find more information on RCU?

 	See the RTFP.txt file in this directory.
 	Or point your browser at http://www.rdrop.com/users/paulmck/RCU/.
-
-o	What are all these files in this directory?
-
-	See 00-INDEX for the list.
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@ -16,12 +16,9 @@ o	A CPU looping in an RCU read-side critical section.

 o	A CPU looping with interrupts disabled.

-o	A CPU looping with preemption disabled.  This condition can
-	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
-	stalls.
+o	A CPU looping with preemption disabled.

-o	A CPU looping with bottom halves disabled.  This condition can
-	result in RCU-sched and RCU-bh stalls.
+o	A CPU looping with bottom halves disabled.

 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 	without invoking schedule().  If the looping in the kernel is
@ -87,9 +84,9 @@ o	A hardware failure.  This is quite unlikely, but has occurred
 	This resulted in a series of RCU CPU stall warnings, eventually
 	leading the realization that the CPU had failed.

-The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
-warning.  Note that SRCU does -not- have CPU stall warnings.  Please note
-that RCU only detects CPU stalls when there is a grace period in progress.
+The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
+Note that SRCU does -not- have CPU stall warnings.  Please note that
+RCU only detects CPU stalls when there is a grace period in progress.
 No grace period, no CPU stall warnings.

 To diagnose the cause of the stall, inspect the stack traces.
@ -179,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched.  This message
 will normally be followed by stack dumps for each CPU.  Please note that
 PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
 the tasks will be indicated by PID, for example, "P3421".  It is even
-possible for a rcu_preempt_state stall to be caused by both CPUs -and-
-tasks, in which case the offending CPUs and tasks will all be called
-out in the list.
+possible for an rcu_state stall to be caused by both CPUs -and- tasks,
+in which case the offending CPUs and tasks will all be called out in the list.

 CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
 the RCU core for the past three grace periods.  In contrast, CPU 16's "(0
@ -209,7 +205,7 @@ handlers are no longer able to execute on this CPU.  This can happen if
 the stalled CPU is spinning with interrupts are disabled, or, in -rt
 kernels, if a high-priority process is starving RCU's softirq handler.

-The "fps=" shows the number of force-quiescent-state idle/offline
+The "fqs=" shows the number of force-quiescent-state idle/offline
 detection passes that the grace-period kthread has made across this
 CPU since the last time that this CPU noted the beginning of a grace
 period.
@ -223,17 +219,18 @@ an estimate of the total number of RCU callbacks queued across all CPUs
 In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
 for each CPU:

-	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
+	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D

 The "last_accelerate:" prints the low-order 16 bits (in hex) of the
 jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
 from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle().  The "nonlazy_posted:" prints the number
-of non-lazy callbacks posted since the last call to rcu_needs_cpu().
-Finally, an "L" indicates that there are currently no non-lazy callbacks
-("." is printed otherwise, as shown above) and "D" indicates that
-dyntick-idle processing is enabled ("." is printed otherwise, for example,
-if disabled via the "nohz=" kernel boot parameter).
+rcu_prepare_for_idle().  The "Nonlazy posted:" indicates lazy-callback
+status, so that an "l" indicates that all callbacks were lazy at the start
+of the last idle period and an "L" indicates that there are currently
+no non-lazy callbacks (in both cases, "." is printed otherwise, as
+shown above) and "D" indicates that dyntick-idle processing is enabled
+("." is printed otherwise, for example, if disabled via the "nohz="
+kernel boot parameter).

 If the grace period ends just as the stall warning starts printing,
 there will be a spurious stall-warning message, which will include
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@ -10,173 +10,8 @@ status messages via printk(), which can be examined via the dmesg
 command (perhaps grepping for "torture").  The test is started
 when the module is loaded, and stops when the module is unloaded.

-
-MODULE PARAMETERS
-
-This module has the following parameters:
-
-fqs_duration	Duration (in microseconds) of artificially induced bursts
-		of force_quiescent_state() invocations.  In RCU
-		implementations having force_quiescent_state(), these
-		bursts help force races between forcing a given grace
-		period and that grace period ending on its own.
-
-fqs_holdoff	Holdoff time (in microseconds) between consecutive calls
-		to force_quiescent_state() within a burst.
-
-fqs_stutter	Wait time (in seconds) between consecutive bursts
-		of calls to force_quiescent_state().
-
-gp_normal	Make the fake writers use normal synchronous grace-period
-		primitives.
-
-gp_exp		Make the fake writers use expedited synchronous grace-period
-		primitives.  If both gp_normal and gp_exp are set, or
-		if neither gp_normal nor gp_exp are set, then randomly
-		choose the primitive so that about 50% are normal and
-		50% expedited.  By default, neither are set, which
-		gives best overall test coverage.
-
-irqreader	Says to invoke RCU readers from irq level.  This is currently
-		done via timers.  Defaults to "1" for variants of RCU that
-		permit this.  (Or, more accurately, variants of RCU that do
-		-not- permit this know to ignore this variable.)
-
-n_barrier_cbs	If this is nonzero, RCU barrier testing will be conducted,
-		in which case n_barrier_cbs specifies the number of
-		RCU callbacks (and corresponding kthreads) to use for
-		this testing.  The value cannot be negative.  If you
-		specify this to be non-zero when torture_type indicates a
-		synchronous RCU implementation (one for which a member of
-		the synchronize_rcu() rather than the call_rcu() family is
-		used -- see the documentation for torture_type below), an
-		error will be reported and no testing will be carried out.
-
-nfakewriters	This is the number of RCU fake writer threads to run.  Fake
-		writer threads repeatedly use the synchronous "wait for
-		current readers" function of the interface selected by
-		torture_type, with a delay between calls to allow for various
-		different numbers of writers running in parallel.
-		nfakewriters defaults to 4, which provides enough parallelism
-		to trigger special cases caused by multiple writers, such as
-		the synchronize_srcu() early return optimization.
-
-nreaders	This is the number of RCU reading threads supported.
-		The default is twice the number of CPUs.  Why twice?
-		To properly exercise RCU implementations with preemptible
-		read-side critical sections.
-
-onoff_interval
-		The number of seconds between each attempt to execute a
-		randomly selected CPU-hotplug operation.  Defaults to
-		zero, which disables CPU hotplugging.  In HOTPLUG_CPU=n
-		kernels, rcutorture will silently refuse to do any
-		CPU-hotplug operations regardless of what value is
-		specified for onoff_interval.
-
-onoff_holdoff	The number of seconds to wait until starting CPU-hotplug
-		operations.  This would normally only be used when
-		rcutorture was built into the kernel and started
-		automatically at boot time, in which case it is useful
-		in order to avoid confusing boot-time code with CPUs
-		coming and going.
-
-shuffle_interval
-		The number of seconds to keep the test threads affinitied
-		to a particular subset of the CPUs, defaults to 3 seconds.
-		Used in conjunction with test_no_idle_hz.
-
-shutdown_secs	The number of seconds to run the test before terminating
-		the test and powering off the system.  The default is
-		zero, which disables test termination and system shutdown.
-		This capability is useful for automated testing.
-
-stall_cpu	The number of seconds that a CPU should be stalled while
-		within both an rcu_read_lock() and a preempt_disable().
-		This stall happens only once per rcutorture run.
-		If you need multiple stalls, use modprobe and rmmod to
-		repeatedly run rcutorture.  The default for stall_cpu
-		is zero, which prevents rcutorture from stalling a CPU.
-
-		Note that attempts to rmmod rcutorture while the stall
-		is ongoing will hang, so be careful what value you
-		choose for this module parameter!  In addition, too-large
-		values for stall_cpu might well induce failures and
-		warnings in other parts of the kernel.  You have been
-		warned!
-
-stall_cpu_holdoff
-		The number of seconds to wait after rcutorture starts
-		before stalling a CPU.  Defaults to 10 seconds.
-
-stat_interval	The number of seconds between output of torture
-		statistics (via printk()).  Regardless of the interval,
-		statistics are printed when the module is unloaded.
-		Setting the interval to zero causes the statistics to
-		be printed -only- when the module is unloaded, and this
-		is the default.
-
-stutter		The length of time to run the test before pausing for this
-		same period of time.  Defaults to "stutter=5", so as
-		to run and pause for (roughly) five-second intervals.
-		Specifying "stutter=0" causes the test to run continuously
-		without pausing, which is the old default behavior.
-
-test_boost	Whether or not to test the ability of RCU to do priority
-		boosting.  Defaults to "test_boost=1", which performs
-		RCU priority-inversion testing only if the selected
-		RCU implementation supports priority boosting.  Specifying
-		"test_boost=0" never performs RCU priority-inversion
-		testing.  Specifying "test_boost=2" performs RCU
-		priority-inversion testing even if the selected RCU
-		implementation does not support RCU priority boosting,
-		which can be used to test rcutorture's ability to
-		carry out RCU priority-inversion testing.
-
-test_boost_interval
-		The number of seconds in an RCU priority-inversion test
-		cycle.	Defaults to "test_boost_interval=7".  It is
-		usually wise for this value to be relatively prime to
-		the value selected for "stutter".
-
-test_boost_duration
-		The number of seconds to do RCU priority-inversion testing
-		within any given "test_boost_interval".  Defaults to
-		"test_boost_duration=4".
-
-test_no_idle_hz	Whether or not to test the ability of RCU to operate in
-		a kernel that disables the scheduling-clock interrupt to
-		idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
-		Defaults to omitting this test.
-
-torture_type	The type of RCU to test, with string values as follows:
-
-		"rcu":  rcu_read_lock(), rcu_read_unlock() and call_rcu(),
-			along with expedited, synchronous, and polling
-			variants.
-
-		"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
-			call_rcu_bh(), along with expedited and synchronous
-			variants.
-
-		"rcu_busted": This tests an intentionally incorrect version
-			of RCU in order to help test rcutorture itself.
-
-		"srcu": srcu_read_lock(), srcu_read_unlock() and
-			call_srcu(), along with expedited and
-			synchronous variants.
-
-		"sched": preempt_disable(), preempt_enable(), and
-			call_rcu_sched(), along with expedited,
-			synchronous, and polling variants.
-
-		"tasks": voluntary context switch and call_rcu_tasks(),
-			along with expedited and synchronous variants.
-
-		Defaults to "rcu".
-
-verbose		Enable debug printk()s.  Default is disabled.
-
+Module parameters are prefixed by "rcutorture." in
+Documentation/admin-guide/kernel-parameters.txt.

 OUTPUT

--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@ -266,7 +266,7 @@ rcu_dereference()
 	unnecessary overhead on Alpha CPUs.

 	Note that the value returned by rcu_dereference() is valid
-	only within the enclosing RCU read-side critical section.
+	only within the enclosing RCU read-side critical section [1].
 	For example, the following is -not- legal:

 		rcu_read_lock();
@ -292,6 +292,19 @@ rcu_dereference()
 	typically used indirectly, via the _rcu list-manipulation
 	primitives, such as list_for_each_entry_rcu().

+	[1] The variant rcu_dereference_protected() can be used outside
+	of an RCU read-side critical section as long as the usage is
+	protected by locks acquired by the update-side code.  This variant
+	avoids the lockdep warning that would happen when using (for
+	example) rcu_dereference() without rcu_read_lock() protection.
+	Using rcu_dereference_protected() also has the advantage
+	of permitting compiler optimizations that rcu_dereference()
+	must prohibit.	The rcu_dereference_protected() variant takes
+	a lockdep expression to indicate which locks must be acquired
+	by the caller. If the indicated protection is not provided,
+	a lockdep splat is emitted.  See RCU/Design/Requirements/Requirements.html
+	and the API's code comments for more details and example usage.
+
 The following diagram shows how each API communicates among the
 reader, updater, and reclaimer.

@ -322,28 +335,27 @@ to their callers and (2) call_rcu() callbacks may be invoked.  Efficient
 implementations of the RCU infrastructure make heavy use of batching in
 order to amortize their overhead over many uses of the corresponding APIs.

-There are no fewer than three RCU mechanisms in the Linux kernel; the
-diagram above shows the first one, which is by far the most commonly used.
-The rcu_dereference() and rcu_assign_pointer() primitives are used for
-all three mechanisms, but different defer and protect primitives are
-used as follows:
+There are at least three flavors of RCU usage in the Linux kernel. The diagram
+above shows the most common one. On the updater side, the rcu_assign_pointer(),
+sychronize_rcu() and call_rcu() primitives used are the same for all three
+flavors. However for protection (on the reader side), the primitives used vary
+depending on the flavor:

-	Defer			Protect
+a.	rcu_read_lock() / rcu_read_unlock()
+	rcu_dereference()

-a.	synchronize_rcu()	rcu_read_lock() / rcu_read_unlock()
-	call_rcu()		rcu_dereference()
+b.	rcu_read_lock_bh() / rcu_read_unlock_bh()
+	local_bh_disable() / local_bh_enable()
+	rcu_dereference_bh()

-b.	synchronize_rcu_bh()	rcu_read_lock_bh() / rcu_read_unlock_bh()
-	call_rcu_bh()		rcu_dereference_bh()
+c.	rcu_read_lock_sched() / rcu_read_unlock_sched()
+	preempt_disable() / preempt_enable()
+	local_irq_save() / local_irq_restore()
+	hardirq enter / hardirq exit
+	NMI enter / NMI exit
+	rcu_dereference_sched()

-c.	synchronize_sched()	rcu_read_lock_sched() / rcu_read_unlock_sched()
-	call_rcu_sched()	preempt_disable() / preempt_enable()
-				local_irq_save() / local_irq_restore()
-				hardirq enter / hardirq exit
-				NMI enter / NMI exit
-				rcu_dereference_sched()
-
-These three mechanisms are used as follows:
+These three flavors are used as follows:

 a.	RCU applied to normal data structures.

@ -548,7 +560,7 @@ presents two such "toy" implementations of RCU, one that is implemented
 in terms of familiar locking primitives, and another that more closely
 resembles "classic" RCU.  Both are way too simple for real-world use,
 lacking both functionality and performance.  However, they are useful
-in getting a feel for how RCU works.  See kernel/rcupdate.c for a
+in getting a feel for how RCU works.  See kernel/rcu/update.c for a
 production-quality implementation, and see:

 	http://www.rdrop.com/users/paulmck/RCU
@ -867,18 +879,20 @@ RCU:	Critical sections	Grace period		Barrier

 bh:	Critical sections	Grace period		Barrier

-	rcu_read_lock_bh	call_rcu_bh		rcu_barrier_bh
-	rcu_read_unlock_bh	synchronize_rcu_bh
-	rcu_dereference_bh	synchronize_rcu_bh_expedited
+	rcu_read_lock_bh	call_rcu		rcu_barrier
+	rcu_read_unlock_bh	synchronize_rcu
+	[local_bh_disable]	synchronize_rcu_expedited
+	[and friends]
+	rcu_dereference_bh
 	rcu_dereference_bh_check
 	rcu_dereference_bh_protected
 	rcu_read_lock_bh_held

 sched:	Critical sections	Grace period		Barrier

-	rcu_read_lock_sched	synchronize_sched	rcu_barrier_sched
-	rcu_read_unlock_sched	call_rcu_sched
-	[preempt_disable]	synchronize_sched_expedited
+	rcu_read_lock_sched	call_rcu		rcu_barrier
+	rcu_read_unlock_sched	synchronize_rcu
+	[preempt_disable]	synchronize_rcu_expedited
 	[and friends]
 	rcu_read_lock_sched_notrace
 	rcu_read_unlock_sched_notrace
@ -890,8 +904,8 @@ sched:	Critical sections	Grace period		Barrier

 SRCU:	Critical sections	Grace period		Barrier

-	srcu_read_lock		synchronize_srcu	srcu_barrier
-	srcu_read_unlock	call_srcu
+	srcu_read_lock		call_srcu		srcu_barrier
+	srcu_read_unlock	synchronize_srcu
 	srcu_dereference	synchronize_srcu_expedited
 	srcu_dereference_check
 	srcu_read_lock_held
@ -934,7 +948,8 @@ c.	Do you need to treat NMI handlers, hardirq handlers,
 d.	Do you need RCU grace periods to complete even in the face
 	of softirq monopolization of one or more of the CPUs?  For
 	example, is your code subject to network-based denial-of-service
-	attacks?  If so, you need RCU-bh.
+	attacks?  If so, you should disable softirq across your readers,
+	for example, by using rcu_read_lock_bh().

 e.	Is your workload too update-intensive for normal use of
 	RCU, but inappropriate for other synchronization mechanisms?
@ -1033,7 +1048,7 @@ Answer:		Just as PREEMPT_RT permits preemption of spinlock
 		spinlocks blocking while in RCU read-side critical
 		sections.

-		Why the apparent inconsistency?  Because it is it
+		Why the apparent inconsistency?  Because it is
 		possible to use priority boosting to keep the RCU
 		grace periods short if need be (for example, if running
 		short of memory).  In contrast, if blocking waiting
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@ -0,0 +1,73 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO:
+
+some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios (in %) are tracked as recent trends over ten, sixty, and
+three hundred second windows, which gives insight into short term events
+as well as medium and long term trends. The total absolute stall time
+(in us) is tracked and exported as well, to allow detection of latency
+spikes which wouldn't necessarily make a dent in the time averages,
+or to average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
--- a/Documentation/acpi/aml-debugger.txt
+++ b/Documentation/acpi/aml-debugger.txt
@ -23,7 +23,7 @@ kernel.

   The resultant userspace tool binary is then located at:

-     tools/acpi/power/acpi/acpidbg/acpidbg
+     tools/power/acpi/acpidbg

   It can be installed to system directories by running "make install" (as a
   sufficiently privileged user).
@ -35,7 +35,7 @@ kernel.

   # mount -t debugfs none /sys/kernel/debug
   # modprobe acpi_dbg
-   # tools/acpi/power/acpi/acpidbg/acpidbg
+   # tools/power/acpi/acpidbg

   That spawns the interactive AML debugger environment where you can execute
   debugger commands.
--- a/Documentation/acpi/initrd_table_override.txt
+++ b/Documentation/acpi/initrd_table_override.txt
@ -14,6 +14,10 @@ upgrade the ACPI execution environment that is defined by the ACPI tables
 via upgrading the ACPI tables provided by the BIOS with an instrumented,
 modified, more recent version one, or installing brand new ACPI tables.

+When building initrd with kernel in a single image, option
+ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
+feature to work.
+
 For a full list of ACPI tables that can be upgraded/installed, take a look
 at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
 drivers/acpi/tables.c.
--- a/Documentation/admin-guide/LSM/SELinux.rst
+++ b/Documentation/admin-guide/LSM/SELinux.rst
@ -6,7 +6,7 @@ If you want to use SELinux, chances are you will want
 to use the distro-provided policies, or install the
 latest reference policy release from

-	http://oss.tresys.com/projects/refpolicy
+	https://github.com/SELinuxProject/refpolicy

 However, if you want to install a dummy policy for
 testing, you can do using ``mdp`` provided under
--- a/Documentation/admin-guide/LSM/SafeSetID.rst
+++ b/Documentation/admin-guide/LSM/SafeSetID.rst
@ -0,0 +1,107 @@
+=========
+SafeSetID
+=========
+SafeSetID is an LSM module that gates the setid family of syscalls to restrict
+UID/GID transitions from a given UID/GID to only those approved by a
+system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
+from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
+allowing a user to set up user namespace UID mappings.
+
+
+Background
+==========
+In absence of file capabilities, processes spawned on a Linux system that need
+to switch to a different user must be spawned with CAP_SETUID privileges.
+CAP_SETUID is granted to programs running as root or those running as a non-root
+user that have been explicitly given the CAP_SETUID runtime capability. It is
+often preferable to use Linux runtime capabilities rather than file
+capabilities, since using file capabilities to run a program with elevated
+privileges opens up possible security holes since any user with access to the
+file can exec() that program to gain the elevated privileges.
+
+While it is possible to implement a tree of processes by giving full
+CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
+tree of processes under non-root user(s) in the first place. Specifically,
+since CAP_SETUID allows changing to any user on the system, including the root
+user, it is an overpowered capability for what is needed in this scenario,
+especially since programs often only call setuid() to drop privileges to a
+lesser-privileged user -- not elevate privileges. Unfortunately, there is no
+generally feasible way in Linux to restrict the potential UIDs that a user can
+switch to through setuid() beyond allowing a switch to any user on the system.
+This SafeSetID LSM seeks to provide a solution for restricting setid
+capabilities in such a way.
+
+The main use case for this LSM is to allow a non-root program to transition to
+other untrusted uids without full blown CAP_SETUID capabilities. The non-root
+program would still need CAP_SETUID to do any kind of transition, but the
+additional restrictions imposed by this LSM would mean it is a "safer" version
+of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
+do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
+namespace). The higher level goal is to allow for uid-based sandboxing of system
+services without having to give out CAP_SETUID all over the place just so that
+non-root programs can drop to even-lesser-privileged uids. This is especially
+relevant when one non-root daemon on the system should be allowed to spawn other
+processes as different uids, but its undesirable to give the daemon a
+basically-root-equivalent CAP_SETUID.
+
+
+Other Approaches Considered
+===========================
+
+Solve this problem in userspace
+-------------------------------
+For candidate applications that would like to have restricted setid capabilities
+as implemented in this LSM, an alternative option would be to simply take away
+setid capabilities from the application completely and refactor the process
+spawning semantics in the application (e.g. by using a privileged helper program
+to do process spawning and UID/GID transitions). Unfortunately, there are a
+number of semantics around process spawning that would be affected by this, such
+as fork() calls where the program doesn???t immediately call exec() after the
+fork(), parent processes specifying custom environment variables or command line
+args for spawned child processes, or inheritance of file handles across a
+fork()/exec(). Because of this, as solution that uses a privileged helper in
+userspace would likely be less appealing to incorporate into existing projects
+that rely on certain process-spawning semantics in Linux.
+
+Use user namespaces
+-------------------
+Another possible approach would be to run a given process tree in its own user
+namespace and give programs in the tree setid capabilities. In this way,
+programs in the tree could change to any desired UID/GID in the context of their
+own user namespace, and only approved UIDs/GIDs could be mapped back to the
+initial system user namespace, affectively preventing privilege escalation.
+Unfortunately, it is not generally feasible to use user namespaces in isolation,
+without pairing them with other namespace types, which is not always an option.
+Linux checks for capabilities based off of the user namespace that ???owns??? some
+entity. For example, Linux has the notion that network namespaces are owned by
+the user namespace in which they were created. A consequence of this is that
+capability checks for access to a given network namespace are done by checking
+whether a task has the given capability in the context of the user namespace
+that owns the network namespace -- not necessarily the user namespace under
+which the given task runs. Therefore spawning a process in a new user namespace
+effectively prevents it from accessing the network namespace owned by the
+initial namespace. This is a deal-breaker for any application that expects to
+retain the CAP_NET_ADMIN capability for the purpose of adjusting network
+configurations. Using user namespaces in isolation causes problems regarding
+other system interactions, including use of pid namespaces and device creation.
+
+Use an existing LSM
+-------------------
+None of the other in-tree LSMs have the capability to gate setid transitions, or
+even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
+"Since setuid only affects the current process, and since the SELinux controls
+are not based on the Linux identity attributes, SELinux does not need to control
+this operation."
+
+
+Directions for use
+==================
+This LSM hooks the setid syscalls to make sure transitions are allowed if an
+applicable restriction policy is in place. Policies are configured through
+securityfs by writing to the safesetid/add_whitelist_policy and
+safesetid/flush_whitelist_policies files at the location where securityfs is
+mounted. The format for adding a policy is '<UID>:<UID>', using literal
+numbers, such as '123:456'. To flush the policies, any write to the file is
+sufficient. Again, configuring a policy for a UID will prevent that UID from
+obtaining auxiliary setid privileges, such as allowing a user to set up user
+namespace UID mappings.
--- a/Documentation/admin-guide/LSM/Smack.rst
+++ b/Documentation/admin-guide/LSM/Smack.rst
@ -818,6 +818,10 @@ Smack supports some mount options:
 	specifies a label to which all labels set on the
 	filesystem must have read access. Not yet enforced.

+  smackfstransmute=label:
+	behaves exactly like smackfsroot except that it also
+	sets the transmute flag on the root of the mount
+
 These mount options apply to all file system types.

 Smack auditing
--- a/Documentation/admin-guide/LSM/Yama.rst
+++ b/Documentation/admin-guide/LSM/Yama.rst
@ -64,8 +64,8 @@ The sysctl settings (writable only with ``CAP_SYS_PTRACE``) are:
    Using ``PTRACE_TRACEME`` is unchanged.

 2 - admin-only attach:
-    only processes with ``CAP_SYS_PTRACE`` may use ptrace
-    with ``PTRACE_ATTACH``, or through children calling ``PTRACE_TRACEME``.
+    only processes with ``CAP_SYS_PTRACE`` may use ptrace, either with
+    ``PTRACE_ATTACH`` or through children calling ``PTRACE_TRACEME``.

 3 - no attach:
    no processes may use ptrace with ``PTRACE_ATTACH`` nor via
--- a/Documentation/admin-guide/LSM/index.rst
+++ b/Documentation/admin-guide/LSM/index.rst
@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
 specific changes to system operation when these tweaks are not available
 in the core functionality of Linux itself.

-Without a specific LSM built into the kernel, the default LSM will be the
-Linux capabilities system. Most LSMs choose to extend the capabilities
-system, building their checks on top of the defined capability hooks.
+The Linux capabilities modules will always be included. This may be
+followed by any number of "minor" modules and at most one "major" module.
 For more details on capabilities, see ``capabilities(7)`` in the Linux
 man-pages project.

@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
 be first, followed by any "minor" modules (e.g. Yama) and then
 the one "major" module (e.g. SELinux) if there is one configured.

+Process attributes associated with "major" security modules should
+be accessed and maintained using the special files in ``/proc/.../attr``.
+A security module may maintain a module specific subdirectory there,
+named after the module. ``/proc/.../attr/smack`` is provided by the Smack
+security module and contains all its special files. The files directly
+in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
+subdirectories.
+
 .. toctree::
   :maxdepth: 1

@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
   Smack
   tomoyo
   Yama
+   SafeSetID
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@ -1,9 +1,9 @@
 .. _readme:

-Linux kernel release 4.x <http://kernel.org/>
+Linux kernel release 5.x <http://kernel.org/>
 =============================================

-These are the release notes for Linux version 4.  Read them carefully,
+These are the release notes for Linux version 5.  Read them carefully,
 as they tell you what this is all about, explain how to install the
 kernel, and what to do if something goes wrong.

@ -51,8 +51,7 @@ Documentation

 - There are various README files in the Documentation/ subdirectory:
   these typically contain kernel-specific installation notes for some
-   drivers for example. See Documentation/00-INDEX for a list of what
-   is contained in each file.  Please read the
+   drivers for example. Please read the
   :ref:`Documentation/process/changes.rst <changes>` file, as it
   contains information about the problems, which may result by upgrading
   your kernel.
@ -64,7 +63,7 @@ Installing the kernel source
   directory where you have permissions (e.g. your home directory) and
   unpack it::

-     xz -cd linux-4.X.tar.xz | tar xvf -
+     xz -cd linux-5.x.tar.xz | tar xvf -

   Replace "X" with the version number of the latest kernel.

@ -73,26 +72,26 @@ Installing the kernel source
   files.  They should match the library, and not get messed up by
   whatever the kernel-du-jour happens to be.

- - You can also upgrade between 4.x releases by patching.  Patches are
+ - You can also upgrade between 5.x releases by patching.  Patches are
   distributed in the xz format.  To install by patching, get all the
   newer patch files, enter the top level directory of the kernel source
-   (linux-4.X) and execute::
+   (linux-5.x) and execute::

-     xz -cd ../patch-4.x.xz | patch -p1
+     xz -cd ../patch-5.x.xz | patch -p1

-   Replace "x" for all versions bigger than the version "X" of your current
+   Replace "x" for all versions bigger than the version "x" of your current
   source tree, **in_order**, and you should be ok.  You may want to remove
   the backup files (some-file-name~ or some-file-name.orig), and make sure
   that there are no failed patches (some-file-name# or some-file-name.rej).
   If there are, either you or I have made a mistake.

-   Unlike patches for the 4.x kernels, patches for the 4.x.y kernels
+   Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
   (also known as the -stable kernels) are not incremental but instead apply
-   directly to the base 4.x kernel.  For example, if your base kernel is 4.0
-   and you want to apply the 4.0.3 patch, you must not first apply the 4.0.1
-   and 4.0.2 patches. Similarly, if you are running kernel version 4.0.2 and
-   want to jump to 4.0.3, you must first reverse the 4.0.2 patch (that is,
-   patch -R) **before** applying the 4.0.3 patch. You can read more on this in
+   directly to the base 5.x kernel.  For example, if your base kernel is 5.0
+   and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
+   and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
+   want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
+   patch -R) **before** applying the 5.0.3 patch. You can read more on this in
   :ref:`Documentation/process/applying-patches.rst <applying_patches>`.

   Alternatively, the script patch-kernel can be used to automate this
@ -115,7 +114,7 @@ Installing the kernel source
 Software requirements
 ---------------------

-   Compiling and running the 4.x kernels requires up-to-date
+   Compiling and running the 5.x kernels requires up-to-date
   versions of various software packages.  Consult
   :ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
   required and how to get updates for these packages.  Beware that using
@ -133,12 +132,12 @@ Build directory for the kernel
   place for the output files (including .config).
   Example::

-     kernel source code: /usr/src/linux-4.X
+     kernel source code: /usr/src/linux-5.x
     build directory:    /home/name/build/kernel

   To configure and build the kernel, use::

-     cd /usr/src/linux-4.X
+     cd /usr/src/linux-5.x
     make O=/home/name/build/kernel menuconfig
     make O=/home/name/build/kernel
     sudo make O=/home/name/build/kernel modules_install install
@ -252,7 +251,7 @@ Configuring the kernel
 Compiling the kernel
 --------------------

- - Make sure you have at least gcc 3.2 available.
+ - Make sure you have at least gcc 4.6 available.
   For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.

   Please note that you can still run a.out user programs with this kernel.
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
         5-3-3-2. IO Latency Interface Files
     5-4. PID
       5-4-1. PID Interface Files
-     5-5. Device
-     5-6. RDMA
-       5-6-1. RDMA Interface Files
-     5-7. Misc
-       5-7-1. perf_event
+     5-5. Cpuset
+       5.5-1. Cpuset Interface Files
+     5-6. Device
+     5-7. RDMA
+       5-7-1. RDMA Interface Files
+     5-8. Misc
+       5-8-1. perf_event
     5-N. Non-normative information
       5-N-1. CPU controller root cgroup process behaviour
       5-N-2. IO controller root cgroup process behaviour
@ -966,6 +968,12 @@ All time durations are in microseconds.
 	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
 	one number is written, $MAX is updated.

+  cpu.pressure
+	A read-only nested-key file which exists on non-root cgroups.
+
+	Shows pressure stall information for CPU. See
+	Documentation/accounting/psi.txt for details.
+

 Memory
 ------
@ -1127,6 +1135,10 @@ PAGE_SIZE multiple when read back.
 		disk readahead.  For now OOM in memory cgroup kills
 		tasks iff shortage has happened inside page fault.

+		This event is not raised if the OOM killer is not
+		considered as an option, e.g. for failed high-order
+		allocations.
+
 	  oom_kill
 		The number of processes belonging to this cgroup
 		killed by any kind of OOM killer.
@ -1177,6 +1189,10 @@ PAGE_SIZE multiple when read back.
 		Amount of cached filesystem data that was modified and
 		is currently being written back to disk

+	  anon_thp
+		Amount of memory used in anonymous mappings backed by
+		transparent hugepages
+
 	  inactive_anon, active_anon, inactive_file, active_file, unevictable
 		Amount of memory, swap-backed and filesystem-backed,
 		on the internal memory management lists used by the
@ -1236,6 +1252,18 @@ PAGE_SIZE multiple when read back.

 		Amount of reclaimed lazyfree pages

+	  thp_fault_alloc
+
+		Number of transparent hugepages which were allocated to satisfy
+		a page fault, including COW faults. This counter is not present
+		when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
+	  thp_collapse_alloc
+
+		Number of transparent hugepages which were allocated to allow
+		collapsing an existing range of pages. This counter is not
+		present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
  memory.swap.current
 	A read-only single value file which exists on non-root
 	cgroups.
@ -1271,6 +1299,12 @@ PAGE_SIZE multiple when read back.
 	higher than the limit for an extended period of time.  This
 	reduces the impact on the workload and memory management.

+  memory.pressure
+	A read-only nested-key file which exists on non-root cgroups.
+
+	Shows pressure stall information for memory. See
+	Documentation/accounting/psi.txt for details.
+

 Usage Guidelines
 ~~~~~~~~~~~~~~~~
@ -1408,6 +1442,12 @@ IO Interface Files

 	  8:16 rbps=2097152 wbps=max riops=max wiops=max

+  io.pressure
+	A read-only nested-key file which exists on non-root cgroups.
+
+	Shows pressure stall information for IO. See
+	Documentation/accounting/psi.txt for details.
+

 Writeback
 ~~~~~~~~~
@ -1479,7 +1519,7 @@ protected workload.

 The limits are only applied at the peer level in the hierarchy.  This means that
 in the diagram below, only groups A, B, and C will influence each other, and
-groups D and F will influence each other.  Group G will influence nobody.
+groups D and F will influence each other.  Group G will influence nobody::

 			[root]
 		/	   |		\
@ -1588,6 +1628,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
 of a new process would cause a cgroup policy to be violated.


+Cpuset
+------
+
+The "cpuset" controller provides a mechanism for constraining
+the CPU and memory node placement of tasks to only the resources
+specified in the cpuset interface files in a task's current cgroup.
+This is especially valuable on large NUMA systems where placing jobs
+on properly sized subsets of the systems with careful processor and
+memory placement to reduce cross-node memory access and contention
+can improve overall system performance.
+
+The "cpuset" controller is hierarchical.  That means the controller
+cannot use CPUs or memory nodes not allowed in its parent.
+
+
+Cpuset Interface Files
+~~~~~~~~~~~~~~~~~~~~~~
+
+  cpuset.cpus
+	A read-write multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the requested CPUs to be used by tasks within this
+	cgroup.  The actual list of CPUs to be granted, however, is
+	subjected to constraints imposed by its parent and can differ
+	from the requested CPUs.
+
+	The CPU numbers are comma-separated numbers or ranges.
+	For example:
+
+	  # cat cpuset.cpus
+	  0-4,6,8-10
+
+	An empty value indicates that the cgroup is using the same
+	setting as the nearest cgroup ancestor with a non-empty
+	"cpuset.cpus" or all the available CPUs if none is found.
+
+	The value of "cpuset.cpus" stays constant until the next update
+	and won't be affected by any CPU hotplug events.
+
+  cpuset.cpus.effective
+	A read-only multiple values file which exists on all
+	cpuset-enabled cgroups.
+
+	It lists the onlined CPUs that are actually granted to this
+	cgroup by its parent.  These CPUs are allowed to be used by
+	tasks within the current cgroup.
+
+	If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
+	all the CPUs from the parent cgroup that can be available to
+	be used by this cgroup.  Otherwise, it should be a subset of
+	"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
+	can be granted.  In this case, it will be treated just like an
+	empty "cpuset.cpus".
+
+	Its value will be affected by CPU hotplug events.
+
+  cpuset.mems
+	A read-write multiple values file which exists on non-root
+	cpuset-enabled cgroups.
+
+	It lists the requested memory nodes to be used by tasks within
+	this cgroup.  The actual list of memory nodes granted, however,
+	is subjected to constraints imposed by its parent and can differ
+	from the requested memory nodes.
+
+	The memory node numbers are comma-separated numbers or ranges.
+	For example:
+
+	  # cat cpuset.mems
+	  0-1,3
+
+	An empty value indicates that the cgroup is using the same
+	setting as the nearest cgroup ancestor with a non-empty
+	"cpuset.mems" or all the available memory nodes if none
+	is found.
+
+	The value of "cpuset.mems" stays constant until the next update
+	and won't be affected by any memory nodes hotplug events.
+
+  cpuset.mems.effective
+	A read-only multiple values file which exists on all
+	cpuset-enabled cgroups.
+
+	It lists the onlined memory nodes that are actually granted to
+	this cgroup by its parent. These memory nodes are allowed to
+	be used by tasks within the current cgroup.
+
+	If "cpuset.mems" is empty, it shows all the memory nodes from the
+	parent cgroup that will be available to be used by this cgroup.
+	Otherwise, it should be a subset of "cpuset.mems" unless none of
+	the memory nodes listed in "cpuset.mems" can be granted.  In this
+	case, it will be treated just like an empty "cpuset.mems".
+
+	Its value will be affected by memory nodes hotplug events.
+
+  cpuset.cpus.partition
+	A read-write single value file which exists on non-root
+	cpuset-enabled cgroups.  This flag is owned by the parent cgroup
+	and is not delegatable.
+
+        It accepts only the following input values when written to.
+
+        "root"   - a paritition root
+        "member" - a non-root member of a partition
+
+	When set to be a partition root, the current cgroup is the
+	root of a new partition or scheduling domain that comprises
+	itself and all its descendants except those that are separate
+	partition roots themselves and their descendants.  The root
+	cgroup is always a partition root.
+
+	There are constraints on where a partition root can be set.
+	It can only be set in a cgroup if all the following conditions
+	are true.
+
+	1) The "cpuset.cpus" is not empty and the list of CPUs are
+	   exclusive, i.e. they are not shared by any of its siblings.
+	2) The parent cgroup is a partition root.
+	3) The "cpuset.cpus" is also a proper subset of the parent's
+	   "cpuset.cpus.effective".
+	4) There is no child cgroups with cpuset enabled.  This is for
+	   eliminating corner cases that have to be handled if such a
+	   condition is allowed.
+
+	Setting it to partition root will take the CPUs away from the
+	effective CPUs of the parent cgroup.  Once it is set, this
+	file cannot be reverted back to "member" if there are any child
+	cgroups with cpuset enabled.
+
+	A parent partition cannot distribute all its CPUs to its
+	child partitions.  There must be at least one cpu left in the
+	parent partition.
+
+	Once becoming a partition root, changes to "cpuset.cpus" is
+	generally allowed as long as the first condition above is true,
+	the change will not take away all the CPUs from the parent
+	partition and the new "cpuset.cpus" value is a superset of its
+	children's "cpuset.cpus" values.
+
+	Sometimes, external factors like changes to ancestors'
+	"cpuset.cpus" or cpu hotplug can cause the state of the partition
+	root to change.  On read, the "cpuset.sched.partition" file
+	can show the following values.
+
+	"member"       Non-root member of a partition
+	"root"         Partition root
+	"root invalid" Invalid partition root
+
+	It is a partition root if the first 2 partition root conditions
+	above are true and at least one CPU from "cpuset.cpus" is
+	granted by the parent cgroup.
+
+	A partition root can become invalid if none of CPUs requested
+	in "cpuset.cpus" can be granted by the parent cgroup or the
+	parent cgroup is no longer a partition root itself.  In this
+	case, it is not a real partition even though the restriction
+	of the first partition root condition above will still apply.
+	The cpu affinity of all the tasks in the cgroup will then be
+	associated with CPUs in the nearest ancestor partition.
+
+	An invalid partition root can be transitioned back to a
+	real partition root if at least one of the requested CPUs
+	can now be granted by its parent.  In this case, the cpu
+	affinity of all the tasks in the formerly invalid partition
+	will be associated to the CPUs of the newly formed partition.
+	Changing the partition state of an invalid partition root to
+	"member" is always allowed even if child cpusets are present.
+
+
 Device controller
 -----------------

@ -1857,8 +2067,10 @@ following two functions.

  wbc_init_bio(@wbc, @bio)
 	Should be called for each bio carrying writeback data and
-	associates the bio with the inode's owner cgroup.  Can be
-	called anytime between bio allocation and submission.
+	associates the bio with the inode's owner cgroup and the
+	corresponding request queue.  This must be called after
+	a queue (device) has been associated with the bio and
+	before submission.

  wbc_account_io(@wbc, @page, @bytes)
 	Should be called for each data segment being written out.
@ -1877,7 +2089,7 @@ the configuration, the bio may be executed at a lower priority and if
 the writeback session is holding shared resources, e.g. a journal
 entry, may lead to priority inversion.  There is no one easy solution
 for the problem.  Filesystems can try to work around specific problem
-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+cases by skipping wbc_init_bio() and using bio_associate_blkg()
 directly.


--- a/Documentation/admin-guide/devices.rst
+++ b/Documentation/admin-guide/devices.rst
@ -1,3 +1,4 @@
+.. _admin_devices:

 Linux allocated devices (4.x+ version)
 ======================================
--- a/Documentation/admin-guide/dynamic-debug-howto.rst
+++ b/Documentation/admin-guide/dynamic-debug-howto.rst
@ -110,8 +110,8 @@ If your query set is big, you can batch them too::

  ~# cat query-batch-file > <debugfs>/dynamic_debug/control

-A another way is to use wildcard. The match rule support ``*`` (matches
-zero or more characters) and ``?`` (matches exactly one character).For
+Another way is to use wildcards. The match rule supports ``*`` (matches
+zero or more characters) and ``?`` (matches exactly one character). For
 example, you can match all usb drivers::

  ~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
@ -258,7 +258,7 @@ this boot parameter for debugging purposes.

 If ``foo`` module is not built-in, ``foo.dyndbg`` will still be processed at
 boot time, without effect, but will be reprocessed when module is
-loaded later. ``dyndbg_query=`` and bare ``dyndbg=`` are only processed at
+loaded later. ``ddebug_query=`` and bare ``dyndbg=`` are only processed at
 boot.


@ -301,7 +301,7 @@ The ``dyndbg`` option is a "fake" module parameter, which means:

 For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
 enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
-the sysfs interface if the debug messages are no longer needed::
+the debugfs interface if the debug messages are no longer needed::

   echo "module module_name -p" > <debugfs>/dynamic_debug/control

--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@ -0,0 +1,574 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+ext4 General Information
+========================
+
+Ext4 is an advanced level of the ext3 filesystem which incorporates
+scalability and reliability enhancements for supporting large filesystems
+(64 bit) in keeping with increasing disk capacities and state-of-the-art
+feature requirements.
+
+Mailing list:	linux-ext4@vger.kernel.org
+Web site:	http://ext4.wiki.kernel.org
+
+
+Quick usage instructions
+========================
+
+Note: More extensive information for getting started with ext4 can be
+found at the ext4 wiki site at the URL:
+http://ext4.wiki.kernel.org/index.php/Ext4_Howto
+
+  - The latest version of e2fsprogs can be found at:
+
+    https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
+
+	or
+
+    http://sourceforge.net/project/showfiles.php?group_id=2406
+
+	or grab the latest git repository from:
+
+   https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+  - Create a new filesystem using the ext4 filesystem type:
+
+        # mke2fs -t ext4 /dev/hda1
+
+    Or to configure an existing ext3 filesystem to support extents:
+
+	# tune2fs -O extents /dev/hda1
+
+    If the filesystem was created with 128 byte inodes, it can be
+    converted to use 256 byte for greater efficiency via:
+
+        # tune2fs -I 256 /dev/hda1
+
+  - Mounting:
+
+	# mount -t ext4 /dev/hda1 /wherever
+
+  - When comparing performance with other filesystems, it's always
+    important to try multiple workloads; very often a subtle change in a
+    workload parameter can completely change the ranking of which
+    filesystems do well compared to others.  When comparing versus ext3,
+    note that ext4 enables write barriers by default, while ext3 does
+    not enable write barriers by default.  So it is useful to use
+    explicitly specify whether barriers are enabled or not when via the
+    '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
+    for a fair comparison.  When tuning ext3 for best benchmark numbers,
+    it is often worthwhile to try changing the data journaling mode; '-o
+    data=writeback' can be faster for some workloads.  (Note however that
+    running mounted with data=writeback can potentially leave stale data
+    exposed in recently written files in case of an unclean shutdown,
+    which could be a security exposure in some situations.)  Configuring
+    the filesystem with a large journal can also be helpful for
+    metadata-intensive workloads.
+
+Features
+========
+
+Currently Available
+-------------------
+
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
+* extent format reduces metadata overhead (RAM, IO for access, transactions)
+* extent format more robust in face of on-disk corruption due to magics,
+* internal redundancy in tree
+* improved file allocation (multi-block alloc)
+* lift 32000 subdirectory limit imposed by i_links_count[1]
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+  flex_bg feature
+* large file support
+* inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
+  the ordering)
+
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
+Options
+=======
+
+When mounting an ext4 filesystem, the following option are accepted:
+(*) == default
+
+  ro
+        Mount filesystem read only. Note that ext4 will replay the journal (and
+        thus write to the partition) even when mounted "read only". The mount
+        options "ro,noload" can be used to prevent writes to the filesystem.
+
+  journal_checksum
+        Enable checksumming of the journal transactions.  This will allow the
+        recovery code in e2fsck and the kernel to detect corruption in the
+        kernel.  It is a compatible change and will be ignored by older
+        kernels.
+
+  journal_async_commit
+        Commit block can be written to disk without waiting for descriptor
+        blocks. If enabled older kernels cannot mount the device. This will
+        enable 'journal_checksum' internally.
+
+  journal_path=path, journal_dev=devnum
+        When the external journal device's major/minor numbers have changed,
+        these options allow the user to specify the new journal location.  The
+        journal device is identified through either its new major/minor numbers
+        encoded in devnum, or via a path to the device.
+
+  norecovery, noload
+        Don't load the journal on mounting.  Note that if the filesystem was
+        not unmounted cleanly, skipping the journal replay will lead to the
+        filesystem containing inconsistencies that can lead to any number of
+        problems.
+
+  data=journal
+        All data are committed into the journal prior to being written into the
+        main file system.  Enabling this mode will disable delayed allocation
+        and O_DIRECT support.
+
+  data=ordered	(*)
+        All data are forced directly out to the main file system prior to its
+        metadata being committed to the journal.
+
+  data=writeback
+        Data ordering is not preserved, data may be written into the main file
+        system after its metadata has been committed to the journal.
+
+  commit=nrsec	(*)
+        Ext4 can be told to sync all its data and metadata every 'nrsec'
+        seconds. The default value is 5 seconds.  This means that if you lose
+        your power, you will lose as much as the latest 5 seconds of work (your
+        filesystem will not be damaged though, thanks to the journaling).  This
+        default value (or any low value) will hurt performance, but it's good
+        for data-safety.  Setting it to 0 will have the same effect as leaving
+        it at the default (5 seconds).  Setting it to very large values will
+        improve performance.
+
+  barrier=<0|1(*)>, barrier(*), nobarrier
+        This enables/disables the use of write barriers in the jbd code.
+        barrier=0 disables, barrier=1 enables.  This also requires an IO stack
+        which can support barriers, and if jbd gets an error on a barrier
+        write, it will disable again with a warning.  Write barriers enforce
+        proper on-disk ordering of journal commits, making volatile disk write
+        caches safe to use, at some performance penalty.  If your disks are
+        battery-backed in one way or another, disabling barriers may safely
+        improve performance.  The mount options "barrier" and "nobarrier" can
+        also be used to enable or disable barriers, for consistency with other
+        ext4 mount options.
+
+  inode_readahead_blks=n
+        This tuning parameter controls the maximum number of inode table blocks
+        that ext4's inode table readahead algorithm will pre-read into the
+        buffer cache.  The default value is 32 blocks.
+
+  nouser_xattr
+        Disables Extended User Attributes.  See the attr(5) manual page for
+        more information about extended attributes.
+
+  noacl
+        This option disables POSIX Access Control List support. If ACL support
+        is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL
+        is enabled by default on mount. See the acl(5) manual page for more
+        information about acl.
+
+  bsddf	(*)
+        Make 'df' act like BSD.
+
+  minixdf
+        Make 'df' act like Minix.
+
+  debug
+        Extra debugging information is sent to syslog.
+
+  abort
+        Simulate the effects of calling ext4_abort() for debugging purposes.
+        This is normally used while remounting a filesystem which is already
+        mounted.
+
+  errors=remount-ro
+        Remount the filesystem read-only on an error.
+
+  errors=continue
+        Keep going on a filesystem error.
+
+  errors=panic
+        Panic and halt the machine if an error occurs.  (These mount options
+        override the errors behavior specified in the superblock, which can be
+        configured using tune2fs)
+
+  data_err=ignore(*)
+        Just print an error message if an error occurs in a file data buffer in
+        ordered mode.
+  data_err=abort
+        Abort the journal if an error occurs in a file data buffer in ordered
+        mode.
+
+  grpid | bsdgroups
+        New objects have the group ID of their parent.
+
+  nogrpid (*) | sysvgroups
+        New objects have the group ID of their creator.
+
+  resgid=n
+        The group ID which may use the reserved blocks.
+
+  resuid=n
+        The user ID which may use the reserved blocks.
+
+  sb=
+        Use alternate superblock at this location.
+
+  quota, noquota, grpquota, usrquota
+        These options are ignored by the filesystem. They are used only by
+        quota tools to recognize volumes where quota should be turned on. See
+        documentation in the quota-tools package for more details
+        (http://sourceforge.net/projects/linuxquota).
+
+  jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file>
+        These options tell filesystem details about quota so that quota
+        information can be properly updated during journal replay. They replace
+        the above quota options. See documentation in the quota-tools package
+        for more details (http://sourceforge.net/projects/linuxquota).
+
+  stripe=n
+        Number of filesystem blocks that mballoc will try to use for allocation
+        size and alignment. For RAID5/6 systems this should be the number of
+        data disks *  RAID chunk size in file system blocks.
+
+  delalloc	(*)
+        Defer block allocation until just before ext4 writes out the block(s)
+        in question.  This allows ext4 to better allocation decisions more
+        efficiently.
+
+  nodelalloc
+        Disable delayed allocation.  Blocks are allocated when the data is
+        copied from userspace to the page cache, either via the write(2) system
+        call or when an mmap'ed page which was previously unallocated is
+        written for the first time.
+
+  max_batch_time=usec
+        Maximum amount of time ext4 should wait for additional filesystem
+        operations to be batch together with a synchronous write operation.
+        Since a synchronous write operation is going to force a commit and then
+        a wait for the I/O complete, it doesn't cost much, and can be a huge
+        throughput win, we wait for a small amount of time to see if any other
+        transactions can piggyback on the synchronous write.   The algorithm
+        used is designed to automatically tune for the speed of the disk, by
+        measuring the amount of time (on average) that it takes to finish
+        committing a transaction.  Call this time the "commit time".  If the
+        time that the transaction has been running is less than the commit
+        time, ext4 will try sleeping for the commit time to see if other
+        operations will join the transaction.   The commit time is capped by
+        the max_batch_time, which defaults to 15000us (15ms).   This
+        optimization can be turned off entirely by setting max_batch_time to 0.
+
+  min_batch_time=usec
+        This parameter sets the commit time (as described above) to be at least
+        min_batch_time.  It defaults to zero microseconds.  Increasing this
+        parameter may improve the throughput of multi-threaded, synchronous
+        workloads on very fast disks, at the cost of increasing latency.
+
+  journal_ioprio=prio
+        The I/O priority (from 0 to 7, where 0 is the highest priority) which
+        should be used for I/O operations submitted by kjournald2 during a
+        commit operation.  This defaults to 3, which is a slightly higher
+        priority than the default I/O priority.
+
+  auto_da_alloc(*), noauto_da_alloc
+        Many broken applications don't use fsync() when replacing existing
+        files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/
+        rename("foo.new", "foo"), or worse yet, fd = open("foo",
+        O_TRUNC)/write(fd,..)/close(fd).  If auto_da_alloc is enabled, ext4
+        will detect the replace-via-rename and replace-via-truncate patterns
+        and force that any delayed allocation blocks are allocated such that at
+        the next journal commit, in the default data=ordered mode, the data
+        blocks of the new file are forced to disk before the rename() operation
+        is committed.  This provides roughly the same level of guarantees as
+        ext3, and avoids the "zero-length" problem that can happen when a
+        system crashes before the delayed allocation blocks are forced to disk.
+
+  noinit_itable
+        Do not initialize any uninitialized inode table blocks in the
+        background.  This feature may be used by installation CD's so that the
+        install process can complete as quickly as possible; the inode table
+        initialization process would then be deferred until the next time the
+        file system is unmounted.
+
+  init_itable=n
+        The lazy itable init code will wait n times the number of milliseconds
+        it took to zero out the previous block group's inode table.  This
+        minimizes the impact on the system performance while file system's
+        inode table is being initialized.
+
+  discard, nodiscard(*)
+        Controls whether ext4 should issue discard/TRIM commands to the
+        underlying block device when blocks are freed.  This is useful for SSD
+        devices and sparse/thinly-provisioned LUNs, but it is off by default
+        until sufficient testing has been done.
+
+  nouid32
+        Disables 32-bit UIDs and GIDs.  This is for interoperability  with
+        older kernels which only store and expect 16-bit values.
+
+  block_validity(*), noblock_validity
+        These options enable or disable the in-kernel facility for tracking
+        filesystem metadata blocks within internal data structures.  This
+        allows multi- block allocator and other routines to notice bugs or
+        corrupted allocation bitmaps which cause blocks to be allocated which
+        overlap with filesystem metadata blocks.
+
+  dioread_lock, dioread_nolock
+        Controls whether or not ext4 should use the DIO read locking. If the
+        dioread_nolock option is specified ext4 will allocate uninitialized
+        extent before buffer write and convert the extent to initialized after
+        IO completes. This approach allows ext4 code to avoid using inode
+        mutex, which improves scalability on high speed storages. However this
+        does not work with data journaling and dioread_nolock option will be
+        ignored with kernel warning. Note that dioread_nolock code path is only
+        used for extent-based files.  Because of the restrictions this options
+        comprises it is off by default (e.g. dioread_lock).
+
+  max_dir_size_kb=n
+        This limits the size of directories so that any attempt to expand them
+        beyond the specified limit in kilobytes will cause an ENOSPC error.
+        This is useful in memory constrained environments, where a very large
+        directory can cause severe performance problems or even provoke the Out
+        Of Memory killer.  (For example, if there is only 512mb memory
+        available, a 176mb directory may seriously cramp the system's style.)
+
+  i_version
+        Enable 64-bit inode version support. This option is off by default.
+
+  dax
+        Use direct access (no page cache).  See
+        Documentation/filesystems/dax.txt.  Note that this option is
+        incompatible with data=journal.
+
+Data Mode
+=========
+There are 3 different data modes:
+
+* writeback mode
+
+  In data=writeback mode, ext4 does not journal data at all.  This mode provides
+  a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
+  mode - metadata journaling.  A crash+recovery can cause incorrect data to
+  appear in files which were written shortly before the crash.  This mode will
+  typically provide the best ext4 performance.
+
+* ordered mode
+
+  In data=ordered mode, ext4 only officially journals metadata, but it logically
+  groups metadata information related to data changes with the data blocks into
+  a single unit called a transaction.  When it's time to write the new metadata
+  out to disk, the associated data blocks are written first.  In general, this
+  mode performs slightly slower than writeback but significantly faster than
+  journal mode.
+
+* journal mode
+
+  data=journal mode provides full data and metadata journaling.  All new data is
+  written to the journal first, and then to its final location.  In the event of
+  a crash, the journal can be replayed, bringing both data and metadata into a
+  consistent state.  This mode is the slowest except when data needs to be read
+  from and written to disk at the same time where it outperforms all others
+  modes.  Enabling this mode will disable delayed allocation and O_DIRECT
+  support.
+
+/proc entries
+=============
+
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4.  Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0).   The files in each per-device directory are shown
+in table below.
+
+Files in /proc/fs/ext4/<devname>
+
+  mb_groups
+        details of multiblock allocator buddy cache of free blocks
+
+/sys entries
+============
+
+Information about mounted ext4 file systems can be found in
+/sys/fs/ext4.  Each mounted filesystem will have a directory in
+/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or
+/sys/fs/ext4/dm-0).   The files in each per-device directory are shown
+in table below.
+
+Files in /sys/fs/ext4/<devname>:
+
+(see also Documentation/ABI/testing/sysfs-fs-ext4)
+
+  delayed_allocation_blocks
+        This file is read-only and shows the number of blocks that are dirty in
+        the page cache, but which do not have their location in the filesystem
+        allocated yet.
+
+  inode_goal
+        Tuning parameter which (if non-zero) controls the goal inode used by
+        the inode allocator in preference to all other allocation heuristics.
+        This is intended for debugging use only, and should be 0 on production
+        systems.
+
+  inode_readahead_blks
+        Tuning parameter which controls the maximum number of inode table
+        blocks that ext4's inode table readahead algorithm will pre-read into
+        the buffer cache.
+
+  lifetime_write_kbytes
+        This file is read-only and shows the number of kilobytes of data that
+        have been written to this filesystem since it was created.
+
+  max_writeback_mb_bump
+        The maximum number of megabytes the writeback code will try to write
+        out before move on to another inode.
+
+  mb_group_prealloc
+        The multiblock allocator will round up allocation requests to a
+        multiple of this tuning parameter if the stripe size is not set in the
+        ext4 superblock
+
+  mb_max_to_scan
+        The maximum number of extents the multiblock allocator will search to
+        find the best extent.
+
+  mb_min_to_scan
+        The minimum number of extents the multiblock allocator will search to
+        find the best extent.
+
+  mb_order2_req
+        Tuning parameter which controls the minimum size for requests (as a
+        power of 2) where the buddy cache is used.
+
+  mb_stats
+        Controls whether the multiblock allocator should collect statistics,
+        which are shown during the unmount. 1 means to collect statistics, 0
+        means not to collect statistics.
+
+  mb_stream_req
+        Files which have fewer blocks than this tunable parameter will have
+        their blocks allocated out of a block group specific preallocation
+        pool, so that small files are packed closely together.  Each large file
+        will have its blocks allocated out of its own unique preallocation
+        pool.
+
+  session_write_kbytes
+        This file is read-only and shows the number of kilobytes of data that
+        have been written to this filesystem since it was mounted.
+
+  reserved_clusters
+        This is RW file and contains number of reserved clusters in the file
+        system which will be used in the specific situations to avoid costly
+        zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or
+        4096 clusters, whichever is smaller and this can be changed however it
+        can never exceed number of clusters in the file system. If there is not
+        enough space for the reserved space when mounting the file mount will
+        _not_ fail.
+
+Ioctls
+======
+
+There is some Ext4 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all Ext4 specific ioctls are
+shown in the table below.
+
+Table of Ext4 specific ioctls
+
+  EXT4_IOC_GETFLAGS
+        Get additional attributes associated with inode.  The ioctl argument is
+        an integer bitfield, with bit values described in ext4.h. This ioctl is
+        an alias for FS_IOC_GETFLAGS.
+
+  EXT4_IOC_SETFLAGS
+        Set additional attributes associated with inode.  The ioctl argument is
+        an integer bitfield, with bit values described in ext4.h. This ioctl is
+        an alias for FS_IOC_SETFLAGS.
+
+  EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD
+        Get the inode i_generation number stored for each inode. The
+        i_generation number is normally changed only when new inode is created
+        and it is particularly useful for network filesystems. The '_OLD'
+        version of this ioctl is an alias for FS_IOC_GETVERSION.
+
+  EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD
+        Set the inode i_generation number stored for each inode. The '_OLD'
+        version of this ioctl is an alias for FS_IOC_SETVERSION.
+
+  EXT4_IOC_GROUP_EXTEND
+        This ioctl has the same purpose as the resize mount option. It allows
+        to resize filesystem to the end of the last existing block group,
+        further resize has to be done with resize2fs, either online, or
+        offline. The argument points to the unsigned logn number representing
+        the filesystem new block count.
+
+  EXT4_IOC_MOVE_EXT
+        Move the block extents from orig_fd (the one this ioctl is pointing to)
+        to the donor_fd (the one specified in move_extent structure passed as
+        an argument to this ioctl). Then, exchange inode metadata between
+        orig_fd and donor_fd.  This is especially useful for online
+        defragmentation, because the allocator has the opportunity to allocate
+        moved blocks better, ideally into one contiguous extent.
+
+  EXT4_IOC_GROUP_ADD
+        Add a new group descriptor to an existing or new group descriptor
+        block. The new group descriptor is described by ext4_new_group_input
+        structure, which is passed as an argument to this ioctl. This is
+        especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which
+        allows online resize of the filesystem to the end of the last existing
+        block group.  Those two ioctls combined is used in userspace online
+        resize tool (e.g. resize2fs).
+
+  EXT4_IOC_MIGRATE
+        This ioctl operates on the filesystem itself.  It converts (migrates)
+        ext3 indirect block mapped inode to ext4 extent mapped inode by walking
+        through indirect block mapping of the original inode and converting
+        contiguous block ranges into ext4 extents of the temporary inode. Then,
+        inodes are swapped. This ioctl might help, when migrating from ext3 to
+        ext4 filesystem, however suggestion is to create fresh ext4 filesystem
+        and copy data from the backup. Note, that filesystem has to support
+        extents for this ioctl to work.
+
+  EXT4_IOC_ALLOC_DA_BLKS
+        Force all of the delay allocated blocks to be allocated to preserve
+        application-expected ext3 behaviour. Note that this will also start
+        triggering a write of the data blocks, but this behaviour may change in
+        the future as it is not necessary and has been done this way only for
+        sake of simplicity.
+
+  EXT4_IOC_RESIZE_FS
+        Resize the filesystem to a new size.  The number of blocks of resized
+        filesystem is passed in via 64 bit integer argument.  The kernel
+        allocates bitmaps and inode table, the userspace tool thus just passes
+        the new number of blocks.
+
+  EXT4_IOC_SWAP_BOOT
+        Swap i_blocks and associated attributes (like i_blocks, i_size,
+        i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO
+        (#5). This is typically used to store a boot loader in a secure part of
+        the filesystem, where it can't be changed by a normal user by accident.
+        The data blocks of the previous boot loader will be associated with the
+        given inode.
+
+References
+==========
+
+kernel source:	<file:fs/ext4/>
+		<file:fs/jbd2/>
+
+programs:	http://e2fsprogs.sourceforge.net/
+
+useful links:	http://fedoraproject.org/wiki/ext3-devel
+		http://www.bullopensource.org/ext4/
+		http://ext4.wiki.kernel.org/index.php/Main_Page
+		http://fedoraproject.org/wiki/Features/Ext4
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@ -71,10 +71,12 @@ configure specific aspects of kernel behavior to your liking.
   java
   ras
   bcache
+   ext4
   pm/index
   thunderbolt
   LSM/index
   mm/index
+   perf-security

 .. only::  subproject and html

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@ -331,7 +331,7 @@
 			APC and your system crashes randomly.

 	apic=		[APIC,X86] Advanced Programmable Interrupt Controller
-			Change the output verbosity whilst booting
+			Change the output verbosity while booting
 			Format: { quiet (default) | verbose | debug }
 			Change the amount of debugging information output
 			when initialising the APIC and IO-APIC components.
@ -461,6 +461,11 @@
 			possible to determine what the correct size should be.
 			This option provides an override for these situations.

+	carrier_timeout=
+			[NET] Specifies amount of time (in seconds) that
+			the kernel should wait for a network carrier. By default
+			it waits 120 seconds.
+
 	ca_keys=	[KEYS] This parameter identifies a specific key(s) on
 			the system trusted keyring to be used for certificate
 			trust validation.
@ -486,10 +491,14 @@
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}

-	cgroup_no_v1=	[KNL] Disable one, multiple, all cgroup controllers in v1
-			Format: { controller[,controller...] | "all" }
+	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
+			Format: { { controller | "all" | "named" }
+			          [,{ controller | "all" | "named" }...] }
 			Like cgroup_disable, but only applies to cgroup v1;
 			the blacklisted controllers remain available in cgroup2.
+			"all" blacklists all controllers and "named" disables
+			named mounts. Specifying both "all" and "named" disables
+			all v1 hierarchies.

 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
@ -674,6 +683,9 @@
 	cpuidle.off=1	[CPU_IDLE]
 			disable the cpuidle sub-system

+	cpuidle.governor=
+			[CPU_IDLE] Name of the cpuidle governor to use.
+
 	cpufreq.off=1	[CPU_FREQ]
 			disable the cpufreq sub-system

@ -856,6 +868,12 @@
 			causing system reset or hang due to sending
 			INIT from AP to BSP.

+	perf_v4_pmi=	[X86,INTEL]
+			Format: <bool>
+			Disable Intel PMU counter freezing feature.
+			The feature only exists starting from
+			Arch Perfmon v4 (Skylake and newer).
+
 	disable_ddw	[PPC/PSERIES]
 			Disable Dynamic DMA Window support. Use this if
 			to workaround buggy firmware.
@ -897,6 +915,10 @@
 			The filter can be disabled or changed to another
 			driver later using sysfs.

+	driver_async_probe=  [KNL]
+			List of driver names to be probed asynchronously.
+			Format: <driver_name1>,<driver_name2>...
+
 	drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
 			Broken monitors, graphic adapters, KVMs and EDIDless
 			panels may send no or incorrect EDID data sets.
@ -1015,6 +1037,12 @@
 			specified address. The serial port must already be
 			setup and configured. Options are not yet supported.

+		rda,<addr>
+			Start an early, polled-mode console on a serial port
+			of an RDA Micro SoC, such as RDA8810PL, at the
+			specified address. The serial port must already be
+			setup and configured. Options are not yet supported.
+
 		smh	Use ARM semihosting calls for early console.

 		s3c2410,<addr>
@ -1054,16 +1082,22 @@
 			specified address. The serial port must already be
 			setup and configured. Options are not yet supported.

+		efifb,[options]
+			Start an early, unaccelerated console on the EFI
+			memory mapped framebuffer (if available). On cache
+			coherent non-x86 systems that use system memory for
+			the framebuffer, pass the 'ram' option so that it is
+			mapped with the correct attributes.
+
 	earlyprintk=	[X86,SH,ARM,M68k,S390]
 			earlyprintk=vga
-			earlyprintk=efi
 			earlyprintk=sclp
 			earlyprintk=xen
 			earlyprintk=serial[,ttySn[,baudrate]]
 			earlyprintk=serial[,0x...[,baudrate]]
 			earlyprintk=ttySn[,baudrate]
 			earlyprintk=dbgp[debugController#]
-			earlyprintk=pciserial,bus:device.function[,baudrate]
+			earlyprintk=pciserial[,force],bus:device.function[,baudrate]
 			earlyprintk=xdbc[xhciController#]

 			earlyprintk is useful when the kernel crashes before
@ -1095,6 +1129,10 @@

 			The sclp output can only be used on s390.

+			The optional "force" to "pciserial" enables use of a
+			PCI device even when its classcode is not of the
+			UART class.
+
 	edac_report=	[HW,EDAC] Control how to report EDAC event
 			Format: {"on" | "off" | "force"}
 			on: enable EDAC to report H/W event. May be overridden
@ -1159,9 +1197,10 @@
 			arch/x86/kernel/cpu/cpufreq/elanfreq.c.

 	elevator=	[IOSCHED]
-			Format: {"cfq" | "deadline" | "noop"}
-			See Documentation/block/cfq-iosched.txt and
-			Documentation/block/deadline-iosched.txt for details.
+			Format: { "mq-deadline" | "kyber" | "bfq" }
+			See Documentation/block/deadline-iosched.txt,
+			Documentation/block/kyber-iosched.txt and
+			Documentation/block/bfq-iosched.txt for details.

 	elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
 			Specifies physical address of start of kernel core
@ -1385,6 +1424,11 @@
 	hvc_iucv_allow=	[S390]	Comma-separated list of z/VM user IDs.
 				If specified, z/VM IUCV HVC accepts connections
 				from listed z/VM user IDs only.
+
+	hv_nopvspin	[X86,HYPER_V] Disables the paravirt spinlock optimizations
+				      which allow the hypervisor to 'idle' the
+				      guest on lock contention.
+
 	keep_bootcon	[KNL]
 			Do not unregister boot console at start. This is only
 			useful for debugging when something happens in the window
@ -1668,12 +1712,11 @@
 			By default, super page will be supported if Intel IOMMU
 			has the capability. With this option, super page will
 			not be supported.
-		ecs_off [Default Off]
-			By default, extended context tables will be supported if
-			the hardware advertises that it has support both for the
-			extended tables themselves, and also PASID support. With
-			this option set, extended tables will not be used even
-			on hardware which claims to support them.
+		sm_on [Default Off]
+			By default, scalable mode will be disabled even if the
+			hardware advertises that it has support for the scalable
+			mode translation. With this option set, scalable mode
+			will be used on hardware which claims to support it.
 		tboot_noforce [Default Off]
 			Do not force the Intel IOMMU enabled under tboot.
 			By default, tboot will force Intel IOMMU on, which
@ -1749,12 +1792,24 @@
 		nobypass	[PPC/POWERNV]
 			Disable IOMMU bypass, using IOMMU for PCI devices.

+	iommu.strict=	[ARM64] Configure TLB invalidation behaviour
+			Format: { "0" | "1" }
+			0 - Lazy mode.
+			  Request that DMA unmap operations use deferred
+			  invalidation of hardware TLBs, for increased
+			  throughput at the cost of reduced device isolation.
+			  Will fall back to strict mode if not supported by
+			  the relevant IOMMU driver.
+			1 - Strict mode (default).
+			  DMA unmap operations invalidate IOMMU hardware TLBs
+			  synchronously.
+
 	iommu.passthrough=
 			[ARM64] Configure DMA to bypass the IOMMU by default.
 			Format: { "0" | "1" }
 			0 - Use IOMMU translation for DMA.
 			1 - Bypass the IOMMU for DMA.
-			unset - Use IOMMU translation for DMA.
+			unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.

 	io7=		[HW] IO7 for Marvel based alpha systems
 			See comment before marvel_specify_io7 in
@ -1791,6 +1846,11 @@
 			to let secondary kernels in charge of setting up
 			LPIs.

+	irqchip.gicv3_pseudo_nmi= [ARM64]
+			Enables support for pseudo-NMIs in the kernel. This
+			requires the kernel to be built with
+			CONFIG_ARM64_PSEUDO_NMI.
+
 	irqfixup	[HW]
 			When an interrupt is not handled search all handlers
 			for it. Intended to get systems with badly broken
@ -1942,6 +2002,12 @@
 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
 			the default is off.

+	kpti=		[ARM64] Control page table isolation of user
+			and kernel address spaces.
+			Default: enabled on cores which need mitigation.
+			0: force disabled
+			1: force enabled
+
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)

@ -2069,6 +2135,9 @@
 			off
 				Disables hypervisor mitigations and doesn't
 				emit any warnings.
+				It also drops the swap size and available
+				RAM limit restriction on both hypervisor and
+				bare metal.

 			Default is 'flush'.

@ -2274,6 +2343,12 @@
 	ltpc=		[NET]
 			Format: <io>,<irq>,<dma>

+	lsm.debug	[SECURITY] Enable LSM initialization debugging output.
+
+	lsm=lsm1,...,lsmN
+			[SECURITY] Choose order of LSM initialization. This
+			overrides CONFIG_LSM, and the "security=" parameter.
+
 	machvec=	[IA-64] Force the use of a particular machine-vector
 			(machvec) in a generic kernel.
 			Example: machvec=hpzx1_swiotlb
@ -2404,7 +2479,7 @@
 			seconds.  Use this parameter to check at some
 			other rate.  0 disables periodic checking.

-	memtest=	[KNL,X86,ARM] Enable memtest
+	memtest=	[KNL,X86,ARM,PPC] Enable memtest
 			Format: <integer>
 			default : 0 <disable>
 			Specifies the number of memtest passes to be
@ -2798,7 +2873,7 @@
 			check bypass). With this option data leaks are possible
 			in the system.

-	nospectre_v2	[X86] Disable all mitigations for the Spectre variant 2
+	nospectre_v2	[X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
 			(indirect branch prediction) vulnerability. System may
 			allow data leaks with this option, which is equivalent
 			to spectre_v2=off.
@ -3053,6 +3128,14 @@
 			timeout < 0: reboot immediately
 			Format: <timeout>

+	panic_print=	Bitmask for printing system info when panic happens.
+			User can chose combination of the following bits:
+			bit 0: print all tasks info
+			bit 1: print system memory info
+			bit 2: print timer info
+			bit 3: print locks info if CONFIG_LOCKDEP is on
+			bit 4: print ftrace buffer
+
 	panic_on_warn	panic() instead of WARN().  Useful to cause kdump
 			on a WARN().

@ -3476,6 +3559,10 @@
 			before loading.
 			See Documentation/blockdev/ramdisk.txt.

+	psi=		[KNL] Enable or disable pressure stall information
+			tracking.
+			Format: <bool>
+
 	psmouse.proto=	[HW,MOUSE] Highest PS2 mouse protocol extension to
 			probe for; one of (bare|imps|exps|lifebook|any).
 	psmouse.rate=	[HW,MOUSE] Set desired mouse report rate, in reports
@ -3540,14 +3627,14 @@

 			In kernels built with CONFIG_RCU_NOCB_CPU=y, set
 			the specified list of CPUs to be no-callback CPUs.
-			Invocation of these CPUs' RCU callbacks will
-			be offloaded to "rcuox/N" kthreads created for
-			that purpose, where "x" is "b" for RCU-bh, "p"
-			for RCU-preempt, and "s" for RCU-sched, and "N"
-			is the CPU number.  This reduces OS jitter on the
-			offloaded CPUs, which can be useful for HPC and
-			real-time workloads.  It can also improve energy
-			efficiency for asymmetric multiprocessors.
+			Invocation of these CPUs' RCU callbacks will be
+			offloaded to "rcuox/N" kthreads created for that
+			purpose, where "x" is "p" for RCU-preempt, and
+			"s" for RCU-sched, and "N" is the CPU number.
+			This reduces OS jitter on the offloaded CPUs,
+			which can be useful for HPC and real-time
+			workloads.  It can also improve energy efficiency
+			for asymmetric multiprocessors.

 	rcu_nocb_poll	[KNL]
 			Rather than requiring that offloaded CPUs
@ -3597,12 +3684,6 @@
 			latencies, which will choose a value aligned
 			with the appropriate hardware boundaries.

-	rcutree.jiffies_till_sched_qs= [KNL]
-			Set required age in jiffies for a
-			given grace period before RCU starts
-			soliciting quiescent-state help from
-			rcu_note_context_switch().
-
 	rcutree.jiffies_till_first_fqs= [KNL]
 			Set delay from grace-period initialization to
 			first attempt to force quiescent states.
@ -3614,6 +3695,20 @@
 			quiescent states.  Units are jiffies, minimum
 			value is one, and maximum value is HZ.

+	rcutree.jiffies_till_sched_qs= [KNL]
+			Set required age in jiffies for a
+			given grace period before RCU starts
+			soliciting quiescent-state help from
+			rcu_note_context_switch() and cond_resched().
+			If not specified, the kernel will calculate
+			a value based on the most recent settings
+			of rcutree.jiffies_till_first_fqs
+			and rcutree.jiffies_till_next_fqs.
+			This calculated value may be viewed in
+			rcutree.jiffies_to_sched_qs.  Any attempt to set
+			rcutree.jiffies_to_sched_qs will be cheerfully
+			overwritten.
+
 	rcutree.kthread_prio= 	 [KNL,BOOT]
 			Set the SCHED_FIFO priority of the RCU per-CPU
 			kthreads (rcuc/N). This value is also used for
@ -3657,6 +3752,11 @@
 			This wake_up() will be accompanied by a
 			WARN_ONCE() splat and an ftrace_dump().

+	rcutree.sysrq_rcu= [KNL]
+			Commandeer a sysrq key to dump out Tree RCU's
+			rcu_node tree with an eye towards determining
+			why a new grace period has not yet started.
+
 	rcuperf.gp_async= [KNL]
 			Measure performance of asynchronous
 			grace-period primitives such as call_rcu().
@ -3708,24 +3808,6 @@
 			in microseconds.  The default of zero says
 			no holdoff.

-	rcutorture.cbflood_inter_holdoff= [KNL]
-			Set holdoff time (jiffies) between successive
-			callback-flood tests.
-
-	rcutorture.cbflood_intra_holdoff= [KNL]
-			Set holdoff time (jiffies) between successive
-			bursts of callbacks within a given callback-flood
-			test.
-
-	rcutorture.cbflood_n_burst= [KNL]
-			Set the number of bursts making up a given
-			callback-flood test.  Set this to zero to
-			disable callback-flood testing.
-
-	rcutorture.cbflood_n_per_burst= [KNL]
-			Set the number of callbacks to be registered
-			in a given burst of a callback-flood test.
-
 	rcutorture.fqs_duration= [KNL]
 			Set duration of force_quiescent_state bursts
 			in microseconds.
@ -3738,6 +3820,23 @@
 			Set wait time between force_quiescent_state bursts
 			in seconds.

+	rcutorture.fwd_progress= [KNL]
+			Enable RCU grace-period forward-progress testing
+			for the types of RCU supporting this notion.
+
+	rcutorture.fwd_progress_div= [KNL]
+			Specify the fraction of a CPU-stall-warning
+			period to do tight-loop forward-progress testing.
+
+	rcutorture.fwd_progress_holdoff= [KNL]
+			Number of seconds to wait between successive
+			forward-progress tests.
+
+	rcutorture.fwd_progress_need_resched= [KNL]
+			Enclose cond_resched() calls within checks for
+			need_resched() during tight-loop forward-progress
+			testing.
+
 	rcutorture.gp_cond= [KNL]
 			Use conditional/asynchronous update-side
 			primitives, if available.
@ -3869,12 +3968,6 @@
 	rcupdate.rcu_self_test= [KNL]
 			Run the RCU early boot self tests

-	rcupdate.rcu_self_test_bh= [KNL]
-			Run the RCU bh early boot self tests
-
-	rcupdate.rcu_self_test_sched= [KNL]
-			Run the RCU sched early boot self tests
-
 	rdinit=		[KNL]
 			Format: <full_path>
 			Run specified binary instead of /init from the ramdisk,
@ -4033,11 +4126,9 @@
 			Note: increases power consumption, thus should only be
 			enabled if running jitter sensitive (HPC/RT) workloads.

-	security=	[SECURITY] Choose a security module to enable at boot.
-			If this boot parameter is not specified, only the first
-			security module asking for security registration will be
-			loaded. An invalid security module name will be treated
-			as if no module has been chosen.
+	security=	[SECURITY] Choose a legacy "major" security module to
+			enable at boot. This has been deprecated by the
+			"lsm=" parameter.

 	selinux=	[SELINUX] Disable or enable SELinux at boot time.
 			Format: { "0" | "1" }
@ -4165,9 +4256,13 @@

 	spectre_v2=	[X86] Control mitigation of Spectre variant 2
 			(indirect branch speculation) vulnerability.
+			The default operation protects the kernel from
+			user space attacks.

-			on   - unconditionally enable
-			off  - unconditionally disable
+			on   - unconditionally enable, implies
+			       spectre_v2_user=on
+			off  - unconditionally disable, implies
+			       spectre_v2_user=off
 			auto - kernel detects whether your CPU model is
 			       vulnerable

@ -4177,6 +4272,12 @@
 			CONFIG_RETPOLINE configuration option, and the
 			compiler with which the kernel was built.

+			Selecting 'on' will also enable the mitigation
+			against user space to user space task attacks.
+
+			Selecting 'off' will disable both the kernel and
+			the user space protections.
+
 			Specific mitigations can also be selected manually:

 			retpoline	  - replace indirect branches
@ -4186,6 +4287,48 @@
 			Not specifying this option is equivalent to
 			spectre_v2=auto.

+	spectre_v2_user=
+			[X86] Control mitigation of Spectre variant 2
+		        (indirect branch speculation) vulnerability between
+		        user space tasks
+
+			on	- Unconditionally enable mitigations. Is
+				  enforced by spectre_v2=on
+
+			off     - Unconditionally disable mitigations. Is
+				  enforced by spectre_v2=off
+
+			prctl   - Indirect branch speculation is enabled,
+				  but mitigation can be enabled via prctl
+				  per thread.  The mitigation control state
+				  is inherited on fork.
+
+			prctl,ibpb
+				- Like "prctl" above, but only STIBP is
+				  controlled per thread. IBPB is issued
+				  always when switching between different user
+				  space processes.
+
+			seccomp
+				- Same as "prctl" above, but all seccomp
+				  threads will enable the mitigation unless
+				  they explicitly opt out.
+
+			seccomp,ibpb
+				- Like "seccomp" above, but only STIBP is
+				  controlled per thread. IBPB is issued
+				  always when switching between different
+				  user space processes.
+
+			auto    - Kernel selects the mitigation depending on
+				  the available CPU features and vulnerability.
+
+			Default mitigation:
+			If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
+
+			Not specifying this option is equivalent to
+			spectre_v2_user=auto.
+
 	spec_store_bypass_disable=
 			[HW] Control Speculative Store Bypass (SSB) Disable mitigation
 			(Speculative Store Bypass vulnerability)
@ -4589,7 +4732,8 @@
 	usbcore.authorized_default=
 			[USB] Default USB device authorization:
 			(default -1 = authorized except for wireless USB,
-			0 = not authorized, 1 = authorized)
+			0 = not authorized, 1 = authorized, 2 = authorized
+			if device connected to internal port)

 	usbcore.autosuspend=
 			[USB] The autosuspend time delay (in seconds) used
@ -4610,7 +4754,8 @@

 	usbcore.old_scheme_first=
 			[USB] Start with the old device initialization
-			scheme (default 0 = off).
+			scheme,  applies only to low and full-speed devices
+			 (default 0 = off).

 	usbcore.usbfs_memory_mb=
 			[USB] Memory limit (in MB) for buffers allocated by
@ -4683,6 +4828,8 @@
 					prevent spurious wakeup);
 				n = USB_QUIRK_DELAY_CTRL_MSG (Device needs a
 					pause after every control message);
+				o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
+					delay after resetting its port);
 			Example: quirks=0781:5580:bk,0a5c:5834:gij

 	usbhid.mousepoll=
@ -4825,6 +4972,18 @@
 			This is actually a boot loader parameter; the value is
 			passed to the kernel using a special protocol.

+	vm_debug[=options]	[KNL] Available with CONFIG_DEBUG_VM=y.
+			May slow down system boot speed, especially when
+			enabled on systems with a large amount of memory.
+			All options are enabled by default, and this
+			interface is meant to allow for selectively
+			enabling or disabling specific virtual memory
+			debugging features.
+
+			Available options are:
+			  P	Enable page structure init time poisoning
+			  -	Disable all of the above options
+
 	vmalloc=nn[KMG]	[KNL,BOOT] Forces the vmalloc area to have an exact
 			size of <nn>. This can be used to increase the
 			minimum size (128MB on x86). It can also be used to
@ -4919,6 +5078,14 @@
 			or other driver-specific files in the
 			Documentation/watchdog/ directory.

+	watchdog_thresh=
+			[KNL]
+			Set the hard lockup detector stall duration
+			threshold in seconds. The soft lockup detector
+			threshold is set to twice the value. A value of 0
+			disables both lockup detectors. Default is 10
+			seconds.
+
 	workqueue.watchdog_thresh=
 			If CONFIG_WQ_WATCHDOG is configured, workqueue can
 			warn stall conditions and dump internal state to
--- a/Documentation/admin-guide/l1tf.rst
+++ b/Documentation/admin-guide/l1tf.rst
@ -405,6 +405,9 @@ time with the option "l1tf=". The valid arguments for this option are:

  off		Disables hypervisor mitigations and doesn't emit any
 		warnings.
+		It also drops the swap size and available RAM limit restrictions
+		on both hypervisor and bare metal.
+
  ============  =============================================================

 The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
@ -553,7 +556,7 @@ When nested virtualization is in use, three operating systems are involved:
 the bare metal hypervisor, the nested hypervisor and the nested virtual
 machine.  VMENTER operations from the nested hypervisor into the nested
 guest will always be processed by the bare metal hypervisor. If KVM is the
-bare metal hypervisor it wiil:
+bare metal hypervisor it will:

 - Flush the L1D cache on every switch from the nested hypervisor to the
   nested virtual machine, so that the nested hypervisor's secrets are not
@ -576,7 +579,8 @@ Default mitigations
  The kernel default mitigations for vulnerable processors are:

  - PTE inversion to protect against malicious user space. This is done
-    unconditionally and cannot be controlled.
+    unconditionally and cannot be controlled. The swap storage is limited
+    to ~16TB.

  - L1D conditional flushing on VMENTER when EPT is enabled for
    a guest.
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@ -756,3 +756,6 @@ These currently include:
      The cache mode for raid5. raid5 could include an extra disk for
      caching. The mode can be "write-throuth" and "write-back". The
      default is "write-through".
+
+  ppl_write_hint
+      NVMe stream ID to be set for each PPL write request.
--- a/Documentation/admin-guide/mm/concepts.rst
+++ b/Documentation/admin-guide/mm/concepts.rst
@ -4,13 +4,13 @@
 Concepts overview
 =================

-The memory management in Linux is complex system that evolved over the
-years and included more and more functionality to support variety of
+The memory management in Linux is a complex system that evolved over the
+years and included more and more functionality to support a variety of
 systems from MMU-less microcontrollers to supercomputers. The memory
-management for systems without MMU is called ``nommu`` and it
+management for systems without an MMU is called ``nommu`` and it
 definitely deserves a dedicated document, which hopefully will be
 eventually written. Yet, although some of the concepts are the same,
-here we assume that MMU is available and CPU can translate a virtual
+here we assume that an MMU is available and a CPU can translate a virtual
 address to a physical address.

 .. contents:: :local:
@ -21,10 +21,10 @@ Virtual Memory Primer
 The physical memory in a computer system is a limited resource and
 even for systems that support memory hotplug there is a hard limit on
 the amount of memory that can be installed. The physical memory is not
-necessary contiguous, it might be accessible as a set of distinct
+necessarily contiguous; it might be accessible as a set of distinct
 address ranges. Besides, different CPU architectures, and even
-different implementations of the same architecture have different view
-how these address ranges defined.
+different implementations of the same architecture have different views
+of how these address ranges are defined.

 All this makes dealing directly with physical memory quite complex and
 to avoid this complexity a concept of virtual memory was developed.
@ -48,8 +48,8 @@ appropriate kernel configuration option.

 Each physical memory page can be mapped as one or more virtual
 pages. These mappings are described by page tables that allow
-translation from virtual address used by programs to real address in
-the physical memory. The page tables organized hierarchically.
+translation from a virtual address used by programs to the physical
+memory address. The page tables are organized hierarchically.

 The tables at the lowest level of the hierarchy contain physical
 addresses of actual pages used by the software. The tables at higher
@ -121,8 +121,8 @@ Nodes
 Many multi-processor machines are NUMA - Non-Uniform Memory Access -
 systems. In such systems the memory is arranged into banks that have
 different access latency depending on the "distance" from the
-processor. Each bank is referred as `node` and for each node Linux
-constructs an independent memory management subsystem. A node has it's
+processor. Each bank is referred to as a `node` and for each node Linux
+constructs an independent memory management subsystem. A node has its
 own set of zones, lists of free and used pages and various statistics
 counters. You can find more details about NUMA in
 :ref:`Documentation/vm/numa.rst <numa>` and in
@ -149,9 +149,9 @@ for program's stack and heap or by explicit calls to mmap(2) system
 call. Usually, the anonymous mappings only define virtual memory areas
 that the program is allowed to access. The read accesses will result
 in creation of a page table entry that references a special physical
-page filled with zeroes. When the program performs a write, regular
+page filled with zeroes. When the program performs a write, a regular
 physical page will be allocated to hold the written data. The page
-will be marked dirty and if the kernel will decide to repurpose it,
+will be marked dirty and if the kernel decides to repurpose it,
 the dirty page will be swapped out.

 Reclaim
@ -181,8 +181,8 @@ pressure.
 The process of freeing the reclaimable physical memory pages and
 repurposing them is called (surprise!) `reclaim`. Linux can reclaim
 pages either asynchronously or synchronously, depending on the state
-of the system. When system is not loaded, most of the memory is free
-and allocation request will be satisfied immediately from the free
+of the system. When the system is not loaded, most of the memory is free
+and allocation requests will be satisfied immediately from the free
 pages supply. As the load increases, the amount of the free pages goes
 down and when it reaches a certain threshold (high watermark), an
 allocation request will awaken the ``kswapd`` daemon. It will
@ -190,7 +190,7 @@ asynchronously scan memory pages and either just free them if the data
 they contain is available elsewhere, or evict to the backing storage
 device (remember those dirty pages?). As memory usage increases even
 more and reaches another threshold - min watermark - an allocation
-will trigger the `direct reclaim`. In this case allocation is stalled
+will trigger `direct reclaim`. In this case allocation is stalled
 until enough memory pages are reclaimed to satisfy the request.

 Compaction
@ -200,7 +200,7 @@ As the system runs, tasks allocate and free the memory and it becomes
 fragmented. Although with virtual memory it is possible to present
 scattered physical pages as virtually contiguous range, sometimes it is
 necessary to allocate large physically contiguous memory areas. Such
-need may arise, for instance, when a device driver requires large
+need may arise, for instance, when a device driver requires a large
 buffer for DMA, or when THP allocates a huge page. Memory `compaction`
 addresses the fragmentation issue. This mechanism moves occupied pages
 from the lower part of a memory zone to free pages in the upper part
@ -208,15 +208,16 @@ of the zone. When a compaction scan is finished free pages are grouped
 together at the beginning of the zone and allocations of large
 physically contiguous areas become possible.

-Like reclaim, the compaction may happen asynchronously in ``kcompactd``
-daemon or synchronously as a result of memory allocation request.
+Like reclaim, the compaction may happen asynchronously in the ``kcompactd``
+daemon or synchronously as a result of a memory allocation request.

 OOM killer
 ==========

-It may happen, that on a loaded machine memory will be exhausted. When
-the kernel detects that the system runs out of memory (OOM) it invokes
-`OOM killer`. Its mission is simple: all it has to do is to select a
-task to sacrifice for the sake of the overall system health. The
-selected task is killed in a hope that after it exits enough memory
-will be freed to continue normal operation.
+It is possible that on a loaded machine memory will be exhausted and the
+kernel will be unable to reclaim enough memory to continue to operate. In
+order to save the rest of the system, it invokes the `OOM killer`.
+
+The `OOM killer` selects a task to sacrifice for the sake of the overall
+system health. The selected task is killed in a hope that after it exits
+enough memory will be freed to continue normal operation.
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@ -29,6 +29,7 @@ the Linux memory management.
   hugetlbpage
   idle_page_tracking
   ksm
+   memory-hotplug
   numa_memory_policy
   pagemap
   soft-dirty
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@ -0,0 +1,444 @@
+.. _admin_guide_memory_hotplug:
+
+==============
+Memory Hotplug
+==============
+
+:Created:							Jul 28 2007
+:Updated: Add some details about locking internals:		Aug 20 2018
+
+This document is about memory hotplug including how-to-use and current status.
+Because Memory Hotplug is still under development, contents of this text will
+be changed often.
+
+.. contents:: :local:
+
+.. note::
+
+    (1) x86_64's has special implementation for memory hotplug.
+        This text does not describe it.
+    (2) This text assumes that sysfs is mounted at ``/sys``.
+
+
+Introduction
+============
+
+Purpose of memory hotplug
+-------------------------
+
+Memory Hotplug allows users to increase/decrease the amount of memory.
+Generally, there are two purposes.
+
+(A) For changing the amount of memory.
+    This is to allow a feature like capacity on demand.
+(B) For installing/removing DIMMs or NUMA-nodes physically.
+    This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
+
+(A) is required by highly virtualized environments and (B) is required by
+hardware which supports memory power management.
+
+Linux memory hotplug is designed for both purpose.
+
+Phases of memory hotplug
+------------------------
+
+There are 2 phases in Memory Hotplug:
+
+  1) Physical Memory Hotplug phase
+  2) Logical Memory Hotplug phase.
+
+The First phase is to communicate hardware/firmware and make/erase
+environment for hotplugged memory. Basically, this phase is necessary
+for the purpose (B), but this is good phase for communication between
+highly virtualized environments too.
+
+When memory is hotplugged, the kernel recognizes new memory, makes new memory
+management tables, and makes sysfs files for new memory's operation.
+
+If firmware supports notification of connection of new memory to OS,
+this phase is triggered automatically. ACPI can notify this event. If not,
+"probe" operation by system administration is used instead.
+(see :ref:`memory_hotplug_physical_mem`).
+
+Logical Memory Hotplug phase is to change memory state into
+available/unavailable for users. Amount of memory from user's view is
+changed by this phase. The kernel makes all memory in it as free pages
+when a memory range is available.
+
+In this document, this phase is described as online/offline.
+
+Logical Memory Hotplug phase is triggered by write of sysfs file by system
+administrator. For the hot-add case, it must be executed after Physical Hotplug
+phase by hand.
+(However, if you writes udev's hotplug scripts for memory hotplug, these
+phases can be execute in seamless way.)
+
+Unit of Memory online/offline operation
+---------------------------------------
+
+Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
+into chunks of the same size. These chunks are called "sections". The size of
+a memory section is architecture dependent. For example, power uses 16MiB, ia64
+uses 1GiB.
+
+Memory sections are combined into chunks referred to as "memory blocks". The
+size of a memory block is architecture dependent and represents the logical
+unit upon which memory online/offline operations are to be performed. The
+default size of a memory block is the same as memory section size unless an
+architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.)
+
+To determine the size (in bytes) of a memory block please read this file::
+
+  /sys/devices/system/memory/block_size_bytes
+
+Kernel Configuration
+====================
+
+To use memory hotplug feature, kernel must be compiled with following
+config options.
+
+- For all memory hotplug:
+    - Memory model -> Sparse Memory  (``CONFIG_SPARSEMEM``)
+    - Allow for memory hot-add       (``CONFIG_MEMORY_HOTPLUG``)
+
+- To enable memory removal, the following are also necessary:
+    - Allow for memory hot remove    (``CONFIG_MEMORY_HOTREMOVE``)
+    - Page Migration                 (``CONFIG_MIGRATION``)
+
+- For ACPI memory hotplug, the following are also necessary:
+    - Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``)
+    - This option can be kernel module.
+
+- As a related configuration, if your box has a feature of NUMA-node hotplug
+  via ACPI, then this option is necessary too.
+
+    - ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
+      (``CONFIG_ACPI_CONTAINER``).
+
+     This option can be kernel module too.
+
+
+.. _memory_hotplug_sysfs_files:
+
+sysfs files for memory hotplug
+==============================
+
+All memory blocks have their device information in sysfs.  Each memory block
+is described under ``/sys/devices/system/memory`` as::
+
+	/sys/devices/system/memory/memoryXXX
+
+where XXX is the memory block id.
+
+For the memory block covered by the sysfs directory.  It is expected that all
+memory sections in this range are present and no memory holes exist in the
+range. Currently there is no way to determine if there is a memory hole, but
+the existence of one should not affect the hotplug capabilities of the memory
+block.
+
+For example, assume 1GiB memory block size. A device for a memory starting at
+0x100000000 is ``/sys/device/system/memory/memory4``::
+
+	(0x100000000 / 1Gib = 4)
+
+This device covers address range [0x100000000 ... 0x140000000)
+
+Under each memory block, you can see 5 files:
+
+- ``/sys/devices/system/memory/memoryXXX/phys_index``
+- ``/sys/devices/system/memory/memoryXXX/phys_device``
+- ``/sys/devices/system/memory/memoryXXX/state``
+- ``/sys/devices/system/memory/memoryXXX/removable``
+- ``/sys/devices/system/memory/memoryXXX/valid_zones``
+
+=================== ============================================================
+``phys_index``      read-only and contains memory block id, same as XXX.
+``state``           read-write
+
+                    - at read:  contains online/offline state of memory.
+                    - at write: user can specify "online_kernel",
+
+                    "online_movable", "online", "offline" command
+                    which will be performed on all sections in the block.
+``phys_device``     read-only: designed to show the name of physical memory
+                    device.  This is not well implemented now.
+``removable``       read-only: contains an integer value indicating
+                    whether the memory block is removable or not
+                    removable.  A value of 1 indicates that the memory
+                    block is removable and a value of 0 indicates that
+                    it is not removable. A memory block is removable only if
+                    every section in the block is removable.
+``valid_zones``     read-only: designed to show which zones this memory block
+		    can be onlined to.
+
+		    The first column shows it`s default zone.
+
+		    "memory6/valid_zones: Normal Movable" shows this memoryblock
+		    can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
+		    by online_movable.
+
+		    "memory7/valid_zones: Movable Normal" shows this memoryblock
+		    can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
+		    by online_kernel.
+=================== ============================================================
+
+.. note::
+
+  These directories/files appear after physical memory hotplug phase.
+
+If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
+via symbolic links located in the ``/sys/devices/system/node/node*`` directories.
+
+For example::
+
+	/sys/devices/system/node/node0/memory9 -> ../../memory/memory9
+
+A backlink will also be created::
+
+	/sys/devices/system/memory/memory9/node0 -> ../../node/node0
+
+.. _memory_hotplug_physical_mem:
+
+Physical memory hot-add phase
+=============================
+
+Hardware(Firmware) Support
+--------------------------
+
+On x86_64/ia64 platform, memory hotplug by ACPI is supported.
+
+In general, the firmware (ACPI) which supports memory hotplug defines
+memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80,
+Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev
+script. This will be done automatically.
+
+But scripts for memory hotplug are not contained in generic udev package(now).
+You may have to write it by yourself or online/offline memory by hand.
+Please see :ref:`memory_hotplug_how_to_online_memory` and
+:ref:`memory_hotplug_how_to_offline_memory`.
+
+If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
+"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
+calls hotplug code for all of objects which are defined in it.
+If memory device is found, memory hotplug code will be called.
+
+Notify memory hot-add event by hand
+-----------------------------------
+
+On some architectures, the firmware may not notify the kernel of a memory
+hotplug event.  Therefore, the memory "probe" interface is supported to
+explicitly notify the kernel.  This interface depends on
+CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86
+if hotplug is supported, although for x86 this should be handled by ACPI
+notification.
+
+Probe interface is located at::
+
+	/sys/devices/system/memory/probe
+
+You can tell the physical address of new memory to the kernel by::
+
+	% echo start_address_of_new_memory > /sys/devices/system/memory/probe
+
+Then, [start_address_of_new_memory, start_address_of_new_memory +
+memory_block_size] memory range is hot-added. In this case, hotplug script is
+not called (in current implementation). You'll have to online memory by
+yourself.  Please see :ref:`memory_hotplug_how_to_online_memory`.
+
+Logical Memory hot-add phase
+============================
+
+State of memory
+---------------
+
+To see (online/offline) state of a memory block, read 'state' file::
+
+	% cat /sys/device/system/memory/memoryXXX/state
+
+
+- If the memory block is online, you'll read "online".
+- If the memory block is offline, you'll read "offline".
+
+
+.. _memory_hotplug_how_to_online_memory:
+
+How to online memory
+--------------------
+
+When the memory is hot-added, the kernel decides whether or not to "online"
+it according to the policy which can be read from "auto_online_blocks" file::
+
+	% cat /sys/devices/system/memory/auto_online_blocks
+
+The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+option. If it is disabled the default is "offline" which means the newly added
+memory is not in a ready-to-use state and you have to "online" the newly added
+memory blocks manually. Automatic onlining can be requested by writing "online"
+to "auto_online_blocks" file::
+
+	% echo online > /sys/devices/system/memory/auto_online_blocks
+
+This sets a global policy and impacts all memory blocks that will subsequently
+be hotplugged. Currently offline blocks keep their state. It is possible, under
+certain circumstances, that some memory blocks will be added but will fail to
+online. User space tools can check their "state" files
+(``/sys/devices/system/memory/memoryXXX/state``) and try to online them manually.
+
+If the automatic onlining wasn't requested, failed, or some memory block was
+offlined it is possible to change the individual block's state by writing to the
+"state" file::
+
+	% echo online > /sys/devices/system/memory/memoryXXX/state
+
+This onlining will not change the ZONE type of the target memory block,
+If the memory block doesn't belong to any zone an appropriate kernel zone
+(usually ZONE_NORMAL) will be used unless movable_node kernel command line
+option is specified when ZONE_MOVABLE will be used.
+
+You can explicitly request to associate it with ZONE_MOVABLE by::
+
+	% echo online_movable > /sys/devices/system/memory/memoryXXX/state
+
+.. note:: current limit: this memory block must be adjacent to ZONE_MOVABLE
+
+Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by::
+
+	% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
+
+.. note:: current limit: this memory block must be adjacent to ZONE_NORMAL
+
+An explicit zone onlining can fail (e.g. when the range is already within
+and existing and incompatible zone already).
+
+After this, memory block XXX's state will be 'online' and the amount of
+available memory will be increased.
+
+This may be changed in future.
+
+Logical memory remove
+=====================
+
+Memory offline and ZONE_MOVABLE
+-------------------------------
+
+Memory offlining is more complicated than memory online. Because memory offline
+has to make the whole memory block be unused, memory offline can fail if
+the memory block includes memory which cannot be freed.
+
+In general, memory offline can use 2 techniques.
+
+(1) reclaim and free all memory in the memory block.
+(2) migrate all pages in the memory block.
+
+In the current implementation, Linux's memory offline uses method (2), freeing
+all  pages in the memory block by page migration. But not all pages are
+migratable. Under current Linux, migratable pages are anonymous pages and
+page caches. For offlining a memory block by migration, the kernel has to
+guarantee that the memory block contains only migratable pages.
+
+Now, a boot option for making a memory block which consists of migratable pages
+is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+create ZONE_MOVABLE...a zone which is just used for movable pages.
+(See also Documentation/admin-guide/kernel-parameters.rst)
+
+Assume the system has "TOTAL" amount of memory at boot time, this boot option
+creates ZONE_MOVABLE as following.
+
+1) When kernelcore=YYYY boot option is used,
+   Size of memory not for movable pages (not for offline) is YYYY.
+   Size of memory for movable pages (for offline) is TOTAL-YYYY.
+
+2) When movablecore=ZZZZ boot option is used,
+   Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
+   Size of memory for movable pages (for offline) is ZZZZ.
+
+.. note::
+
+   Unfortunately, there is no information to show which memory block belongs
+   to ZONE_MOVABLE. This is TBD.
+
+.. _memory_hotplug_how_to_offline_memory:
+
+How to offline memory
+---------------------
+
+You can offline a memory block by using the same sysfs interface that was used
+in memory onlining::
+
+	% echo offline > /sys/devices/system/memory/memoryXXX/state
+
+If offline succeeds, the state of the memory block is changed to be "offline".
+If it fails, some error core (like -EBUSY) will be returned by the kernel.
+Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
+it.  If it doesn't contain 'unmovable' memory, you'll get success.
+
+A memory block under ZONE_MOVABLE is considered to be able to be offlined
+easily.  But under some busy state, it may return -EBUSY. Even if a memory
+block cannot be offlined due to -EBUSY, you can retry offlining it and may be
+able to offline it (or not). (For example, a page is referred to by some kernel
+internal call and released soon.)
+
+Consideration:
+  Memory hotplug's design direction is to make the possibility of memory
+  offlining higher and to guarantee unplugging memory under any situation. But
+  it needs more work. Returning -EBUSY under some situation may be good because
+  the user can decide to retry more or not by himself. Currently, memory
+  offlining code does some amount of retry with 120 seconds timeout.
+
+Physical memory remove
+======================
+
+Need more implementation yet....
+ - Notification completion of remove works by OS to firmware.
+ - Guard from remove if not yet.
+
+
+Locking Internals
+=================
+
+When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
+the device_hotplug_lock should be held to:
+
+- synchronize against online/offline requests (e.g. via sysfs). This way, memory
+  block devices can only be accessed (.online/.state attributes) by user
+  space once memory has been fully added. And when removing memory, we
+  know nobody is in critical sections.
+- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC)
+
+Especially, there is a possible lock inversion that is avoided using
+device_hotplug_lock when adding memory and user space tries to online that
+memory faster than expected:
+
+- device_online() will first take the device_lock(), followed by
+  mem_hotplug_lock
+- add_memory_resource() will first take the mem_hotplug_lock, followed by
+  the device_lock() (while creating the devices, during bus_add_device()).
+
+As the device is visible to user space before taking the device_lock(), this
+can result in a lock inversion.
+
+onlining/offlining of memory should be done via device_online()/
+device_offline() - to make sure it is properly synchronized to actions
+via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
+
+When adding/removing/onlining/offlining memory or adding/removing
+heterogeneous/device memory, we should always hold the mem_hotplug_lock in
+write mode to serialise memory hotplug (e.g. access to global/zone
+variables).
+
+In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
+mode allows for a quite efficient get_online_mems/put_online_mems
+implementation, so code accessing memory can protect from that memory
+vanishing.
+
+
+Future Work
+===========
+
+  - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
+    sysctl or new control file.
+  - showing memory block and physical device relationship.
+  - test and make it better memory offlining.
+  - support HugeTLB page migration and offlining.
+  - memmap removing at memory offline.
+  - physical remove memory.
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@ -75,9 +75,10 @@ number of times a page is mapped.
    20. NOPAGE
    21. KSM
    22. THP
-    23. BALLOON
+    23. OFFLINE
    24. ZERO_PAGE
    25. IDLE
+    26. PGTABLE

 * ``/proc/kpagecgroup``.  This file contains a 64-bit inode number of the
   memory cgroup each page is charged to, indexed by PFN. Only available when
@ -118,8 +119,8 @@ Short descriptions to the page flags
    identical memory pages dynamically shared between one or more processes
 22 - THP
    contiguous pages which construct transparent hugepages
-23 - BALLOON
-    balloon compaction page
+23 - OFFLINE
+    page is logically offline
 24 - ZERO_PAGE
    zero page for pfn_zero or huge_zero page
 25 - IDLE
@ -128,6 +129,8 @@ Short descriptions to the page flags
    Note that this flag may be stale in case the page was accessed via
    a PTE. To make sure the flag is up-to-date one has to read
    ``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+    page is in use as a page table

 IO related page flags
 ---------------------
--- a/Documentation/admin-guide/perf-security.rst
+++ b/Documentation/admin-guide/perf-security.rst
@ -0,0 +1,230 @@
+.. _perf_security:
+
+Perf Events and tool security
+=============================
+
+Overview
+--------
+
+Usage of Performance Counters for Linux (perf_events) [1]_ , [2]_ , [3]_
+can impose a considerable risk of leaking sensitive data accessed by
+monitored processes. The data leakage is possible both in scenarios of
+direct usage of perf_events system call API [2]_ and over data files
+generated by Perf tool user mode utility (Perf) [3]_ , [4]_ . The risk
+depends on the nature of data that perf_events performance monitoring
+units (PMU) [2]_ and Perf collect and expose for performance analysis.
+Collected system and performance data may be split into several
+categories:
+
+1. System hardware and software configuration data, for example: a CPU
+   model and its cache configuration, an amount of available memory and
+   its topology, used kernel and Perf versions, performance monitoring
+   setup including experiment time, events configuration, Perf command
+   line parameters, etc.
+
+2. User and kernel module paths and their load addresses with sizes,
+   process and thread names with their PIDs and TIDs, timestamps for
+   captured hardware and software events.
+
+3. Content of kernel software counters (e.g., for context switches, page
+   faults, CPU migrations), architectural hardware performance counters
+   (PMC) [8]_ and machine specific registers (MSR) [9]_ that provide
+   execution metrics for various monitored parts of the system (e.g.,
+   memory controller (IMC), interconnect (QPI/UPI) or peripheral (PCIe)
+   uncore counters) without direct attribution to any execution context
+   state.
+
+4. Content of architectural execution context registers (e.g., RIP, RSP,
+   RBP on x86_64), process user and kernel space memory addresses and
+   data, content of various architectural MSRs that capture data from
+   this category.
+
+Data that belong to the fourth category can potentially contain
+sensitive process data. If PMUs in some monitoring modes capture values
+of execution context registers or data from process memory then access
+to such monitoring capabilities requires to be ordered and secured
+properly. So, perf_events/Perf performance monitoring is the subject for
+security access control management [5]_ .
+
+perf_events/Perf access control
+-------------------------------
+
+To perform security checks, the Linux implementation splits processes
+into two categories [6]_ : a) privileged processes (whose effective user
+ID is 0, referred to as superuser or root), and b) unprivileged
+processes (whose effective UID is nonzero). Privileged processes bypass
+all kernel security permission checks so perf_events performance
+monitoring is fully available to privileged processes without access,
+scope and resource restrictions.
+
+Unprivileged processes are subject to a full security permission check
+based on the process's credentials [5]_ (usually: effective UID,
+effective GID, and supplementary group list).
+
+Linux divides the privileges traditionally associated with superuser
+into distinct units, known as capabilities [6]_ , which can be
+independently enabled and disabled on per-thread basis for processes and
+files of unprivileged users.
+
+Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
+as privileged processes with respect to perf_events performance
+monitoring and bypass *scope* permissions checks in the kernel.
+
+Unprivileged processes using perf_events system call API is also subject
+for PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
+outcome determines whether monitoring is permitted. So unprivileged
+processes provided with CAP_SYS_PTRACE capability are effectively
+permitted to pass the check.
+
+Other capabilities being granted to unprivileged processes can
+effectively enable capturing of additional data required for later
+performance analysis of monitored processes or a system. For example,
+CAP_SYSLOG capability permits reading kernel space memory addresses from
+/proc/kallsyms file.
+
+perf_events/Perf privileged users
+---------------------------------
+
+Mechanisms of capabilities, privileged capability-dumb files [6]_ and
+file system ACLs [10]_ can be used to create a dedicated group of
+perf_events/Perf privileged users who are permitted to execute
+performance monitoring without scope limits. The following steps can be
+taken to create such a group of privileged Perf users.
+
+1. Create perf_users group of privileged Perf users, assign perf_users
+   group to Perf tool executable and limit access to the executable for
+   other users in the system who are not in the perf_users group:
+
+::
+
+   # groupadd perf_users
+   # ls -alhF
+   -rwxr-xr-x  2 root root  11M Oct 19 15:12 perf
+   # chgrp perf_users perf
+   # ls -alhF
+   -rwxr-xr-x  2 root perf_users  11M Oct 19 15:12 perf
+   # chmod o-rwx perf
+   # ls -alhF
+   -rwxr-x---  2 root perf_users  11M Oct 19 15:12 perf
+
+2. Assign the required capabilities to the Perf tool executable file and
+   enable members of perf_users group with performance monitoring
+   privileges [6]_ :
+
+::
+
+   # setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
+   # setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
+   perf: OK
+   # getcap perf
+   perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
+
+As a result, members of perf_users group are capable of conducting
+performance monitoring by using functionality of the configured Perf
+tool executable that, when executes, passes perf_events subsystem scope
+checks.
+
+This specific access control management is only available to superuser
+or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
+capabilities.
+
+perf_events/Perf unprivileged users
+-----------------------------------
+
+perf_events/Perf *scope* and *access* control for unprivileged processes
+is governed by perf_event_paranoid [2]_ setting:
+
+-1:
+     Impose no *scope* and *access* restrictions on using perf_events
+     performance monitoring. Per-user per-cpu perf_event_mlock_kb [2]_
+     locking limit is ignored when allocating memory buffers for storing
+     performance data. This is the least secure mode since allowed
+     monitored *scope* is maximized and no perf_events specific limits
+     are imposed on *resources* allocated for performance monitoring.
+
+>=0:
+     *scope* includes per-process and system wide performance monitoring
+     but excludes raw tracepoints and ftrace function tracepoints
+     monitoring. CPU and system events happened when executing either in
+     user or in kernel space can be monitored and captured for later
+     analysis. Per-user per-cpu perf_event_mlock_kb locking limit is
+     imposed but ignored for unprivileged processes with CAP_IPC_LOCK
+     [6]_ capability.
+
+>=1:
+     *scope* includes per-process performance monitoring only and
+     excludes system wide performance monitoring. CPU and system events
+     happened when executing either in user or in kernel space can be
+     monitored and captured for later analysis. Per-user per-cpu
+     perf_event_mlock_kb locking limit is imposed but ignored for
+     unprivileged processes with CAP_IPC_LOCK capability.
+
+>=2:
+     *scope* includes per-process performance monitoring only. CPU and
+     system events happened when executing in user space only can be
+     monitored and captured for later analysis. Per-user per-cpu
+     perf_event_mlock_kb locking limit is imposed but ignored for
+     unprivileged processes with CAP_IPC_LOCK capability.
+
+perf_events/Perf resource control
+---------------------------------
+
+Open file descriptors
+++++++++++++++++++++
+
+The perf_events system call API [2]_ allocates file descriptors for
+every configured PMU event. Open file descriptors are a per-process
+accountable resource governed by the RLIMIT_NOFILE [11]_ limit
+(ulimit -n), which is usually derived from the login shell process. When
+configuring Perf collection for a long list of events on a large server
+system, this limit can be easily hit preventing required monitoring
+configuration. RLIMIT_NOFILE limit can be increased on per-user basis
+modifying content of the limits.conf file [12]_ . Ordinarily, a Perf
+sampling session (perf record) requires an amount of open perf_event
+file descriptors that is not less than the number of monitored events
+multiplied by the number of monitored CPUs.
+
+Memory allocation
+++++++++++++++++
+
+The amount of memory available to user processes for capturing
+performance monitoring data is governed by the perf_event_mlock_kb [2]_
+setting. This perf_event specific resource setting defines overall
+per-cpu limits of memory allowed for mapping by the user processes to
+execute performance monitoring. The setting essentially extends the
+RLIMIT_MEMLOCK [11]_ limit, but only for memory regions mapped
+specifically for capturing monitored performance events and related data.
+
+For example, if a machine has eight cores and perf_event_mlock_kb limit
+is set to 516 KiB, then a user process is provided with 516 KiB * 8 =
+4128 KiB of memory above the RLIMIT_MEMLOCK limit (ulimit -l) for
+perf_event mmap buffers. In particular, this means that, if the user
+wants to start two or more performance monitoring processes, the user is
+required to manually distribute the available 4128 KiB between the
+monitoring processes, for example, using the --mmap-pages Perf record
+mode option. Otherwise, the first started performance monitoring process
+allocates all available 4128 KiB and the other processes will fail to
+proceed due to the lack of memory.
+
+RLIMIT_MEMLOCK and perf_event_mlock_kb resource constraints are ignored
+for processes with the CAP_IPC_LOCK capability. Thus, perf_events/Perf
+privileged users can be provided with memory above the constraints for
+perf_events/Perf performance monitoring purpose by providing the Perf
+executable with CAP_IPC_LOCK capability.
+
+Bibliography
+------------
+
+.. [1] `<https://lwn.net/Articles/337493/>`_
+.. [2] `<http://man7.org/linux/man-pages/man2/perf_event_open.2.html>`_
+.. [3] `<http://web.eece.maine.edu/~vweaver/projects/perf_events/>`_
+.. [4] `<https://perf.wiki.kernel.org/index.php/Main_Page>`_
+.. [5] `<https://www.kernel.org/doc/html/latest/security/credentials.html>`_
+.. [6] `<http://man7.org/linux/man-pages/man7/capabilities.7.html>`_
+.. [7] `<http://man7.org/linux/man-pages/man2/ptrace.2.html>`_
+.. [8] `<https://en.wikipedia.org/wiki/Hardware_performance_counter>`_
+.. [9] `<https://en.wikipedia.org/wiki/Model-specific_register>`_
+.. [10] `<http://man7.org/linux/man-pages/man5/acl.5.html>`_
+.. [11] `<http://man7.org/linux/man-pages/man2/getrlimit.2.html>`_
+.. [12] `<http://man7.org/linux/man-pages/man5/limits.conf.5.html>`_
+
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@ -150,7 +150,7 @@ data structures necessary to handle the given policy and, possibly, to add
 a governor ``sysfs`` interface to it.  Next, the governor is started by
 invoking its ``->start()`` callback.

-That callback it expected to register per-CPU utilization update callbacks for
+That callback is expected to register per-CPU utilization update callbacks for
 all of the online CPUs belonging to the given policy with the CPU scheduler.
 The utilization update callbacks will be invoked by the CPU scheduler on
 important events, like task enqueue and dequeue, on every iteration of the
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@ -0,0 +1,719 @@
+.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
+.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
+
+========================
+CPU Idle Time Management
+========================
+
+::
+
+ Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+Concepts
+========
+
+Modern processors are generally able to enter states in which the execution of
+a program is suspended and instructions belonging to it are not fetched from
+memory or executed.  Those states are the *idle* states of the processor.
+
+Since part of the processor hardware is not used in idle states, entering them
+generally allows power drawn by the processor to be reduced and, in consequence,
+it is an opportunity to save energy.
+
+CPU idle time management is an energy-efficiency feature concerned about using
+the idle states of processors for this purpose.
+
+Logical CPUs
+------------
+
+CPU idle time management operates on CPUs as seen by the *CPU scheduler* (that
+is the part of the kernel responsible for the distribution of computational
+work in the system).  In its view, CPUs are *logical* units.  That is, they need
+not be separate physical entities and may just be interfaces appearing to
+software as individual single-core processors.  In other words, a CPU is an
+entity which appears to be fetching instructions that belong to one sequence
+(program) from memory and executing them, but it need not work this way
+physically.  Generally, three different cases can be consider here.
+
+First, if the whole processor can only follow one sequence of instructions (one
+program) at a time, it is a CPU.  In that case, if the hardware is asked to
+enter an idle state, that applies to the processor as a whole.
+
+Second, if the processor is multi-core, each core in it is able to follow at
+least one program at a time.  The cores need not be entirely independent of each
+other (for example, they may share caches), but still most of the time they
+work physically in parallel with each other, so if each of them executes only
+one program, those programs run mostly independently of each other at the same
+time.  The entire cores are CPUs in that case and if the hardware is asked to
+enter an idle state, that applies to the core that asked for it in the first
+place, but it also may apply to a larger unit (say a "package" or a "cluster")
+that the core belongs to (in fact, it may apply to an entire hierarchy of larger
+units containing the core).  Namely, if all of the cores in the larger unit
+except for one have been put into idle states at the "core level" and the
+remaining core asks the processor to enter an idle state, that may trigger it
+to put the whole larger unit into an idle state which also will affect the
+other cores in that unit.
+
+Finally, each core in a multi-core processor may be able to follow more than one
+program in the same time frame (that is, each core may be able to fetch
+instructions from multiple locations in memory and execute them in the same time
+frame, but not necessarily entirely in parallel with each other).  In that case
+the cores present themselves to software as "bundles" each consisting of
+multiple individual single-core "processors", referred to as *hardware threads*
+(or hyper-threads specifically on Intel hardware), that each can follow one
+sequence of instructions.  Then, the hardware threads are CPUs from the CPU idle
+time management perspective and if the processor is asked to enter an idle state
+by one of them, the hardware thread (or CPU) that asked for it is stopped, but
+nothing more happens, unless all of the other hardware threads within the same
+core also have asked the processor to enter an idle state.  In that situation,
+the core may be put into an idle state individually or a larger unit containing
+it may be put into an idle state as a whole (if the other cores within the
+larger unit are in idle states already).
+
+Idle CPUs
+---------
+
+Logical CPUs, simply referred to as "CPUs" in what follows, are regarded as
+*idle* by the Linux kernel when there are no tasks to run on them except for the
+special "idle" task.
+
+Tasks are the CPU scheduler's representation of work.  Each task consists of a
+sequence of instructions to execute, or code, data to be manipulated while
+running that code, and some context information that needs to be loaded into the
+processor every time the task's code is run by a CPU.  The CPU scheduler
+distributes work by assigning tasks to run to the CPUs present in the system.
+
+Tasks can be in various states.  In particular, they are *runnable* if there are
+no specific conditions preventing their code from being run by a CPU as long as
+there is a CPU available for that (for example, they are not waiting for any
+events to occur or similar).  When a task becomes runnable, the CPU scheduler
+assigns it to one of the available CPUs to run and if there are no more runnable
+tasks assigned to it, the CPU will load the given task's context and run its
+code (from the instruction following the last one executed so far, possibly by
+another CPU).  [If there are multiple runnable tasks assigned to one CPU
+simultaneously, they will be subject to prioritization and time sharing in order
+to allow them to make some progress over time.]
+
+The special "idle" task becomes runnable if there are no other runnable tasks
+assigned to the given CPU and the CPU is then regarded as idle.  In other words,
+in Linux idle CPUs run the code of the "idle" task called *the idle loop*.  That
+code may cause the processor to be put into one of its idle states, if they are
+supported, in order to save energy, but if the processor does not support any
+idle states, or there is not enough time to spend in an idle state before the
+next wakeup event, or there are strict latency constraints preventing any of the
+available idle states from being used, the CPU will simply execute more or less
+useless instructions in a loop until it is assigned a new task to run.
+
+
+.. _idle-loop:
+
+The Idle Loop
+=============
+
+The idle loop code takes two major steps in every iteration of it.  First, it
+calls into a code module referred to as the *governor* that belongs to the CPU
+idle time management subsystem called ``CPUIdle`` to select an idle state for
+the CPU to ask the hardware to enter.  Second, it invokes another code module
+from the ``CPUIdle`` subsystem, called the *driver*, to actually ask the
+processor hardware to enter the idle state selected by the governor.
+
+The role of the governor is to find an idle state most suitable for the
+conditions at hand.  For this purpose, idle states that the hardware can be
+asked to enter by logical CPUs are represented in an abstract way independent of
+the platform or the processor architecture and organized in a one-dimensional
+(linear) array.  That array has to be prepared and supplied by the ``CPUIdle``
+driver matching the platform the kernel is running on at the initialization
+time.  This allows ``CPUIdle`` governors to be independent of the underlying
+hardware and to work with any platforms that the Linux kernel can run on.
+
+Each idle state present in that array is characterized by two parameters to be
+taken into account by the governor, the *target residency* and the (worst-case)
+*exit latency*.  The target residency is the minimum time the hardware must
+spend in the given state, including the time needed to enter it (which may be
+substantial), in order to save more energy than it would save by entering one of
+the shallower idle states instead.  [The "depth" of an idle state roughly
+corresponds to the power drawn by the processor in that state.]  The exit
+latency, in turn, is the maximum time it will take a CPU asking the processor
+hardware to enter an idle state to start executing the first instruction after a
+wakeup from that state.  Note that in general the exit latency also must cover
+the time needed to enter the given state in case the wakeup occurs when the
+hardware is entering it and it must be entered completely to be exited in an
+ordered manner.
+
+There are two types of information that can influence the governor's decisions.
+First of all, the governor knows the time until the closest timer event.  That
+time is known exactly, because the kernel programs timers and it knows exactly
+when they will trigger, and it is the maximum time the hardware that the given
+CPU depends on can spend in an idle state, including the time necessary to enter
+and exit it.  However, the CPU may be woken up by a non-timer event at any time
+(in particular, before the closest timer triggers) and it generally is not known
+when that may happen.  The governor can only see how much time the CPU actually
+was idle after it has been woken up (that time will be referred to as the *idle
+duration* from now on) and it can use that information somehow along with the
+time until the closest timer to estimate the idle duration in future.  How the
+governor uses that information depends on what algorithm is implemented by it
+and that is the primary reason for having more than one governor in the
+``CPUIdle`` subsystem.
+
+There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_
+and ``ladder``.  Which of them is used by default depends on the configuration
+of the kernel and in particular on whether or not the scheduler tick can be
+`stopped by the idle loop <idle-cpus-and-tick_>`_.  It is possible to change the
+governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has
+been passed to the kernel, but that is not safe in general, so it should not be
+done on production systems (that may change in the future, though).  The name of
+the ``CPUIdle`` governor currently used by the kernel can be read from the
+:file:`current_governor_ro` (or :file:`current_governor` if
+``cpuidle_sysfs_switch`` is present in the kernel command line) file under
+:file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
+
+Which ``CPUIdle`` driver is used, on the other hand, usually depends on the
+platform the kernel is running on, but there are platforms with more than one
+matching driver.  For example, there are two drivers that can work with the
+majority of Intel platforms, ``intel_idle`` and ``acpi_idle``, one with
+hardcoded idle states information and the other able to read that information
+from the system's ACPI tables, respectively.  Still, even in those cases, the
+driver chosen at the system initialization time cannot be replaced later, so the
+decision on which one of them to use has to be made early (on Intel platforms
+the ``acpi_idle`` driver will be used if ``intel_idle`` is disabled for some
+reason or if it does not recognize the processor).  The name of the ``CPUIdle``
+driver currently used by the kernel can be read from the :file:`current_driver`
+file under :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
+
+
+.. _idle-cpus-and-tick:
+
+Idle CPUs and The Scheduler Tick
+================================
+
+The scheduler tick is a timer that triggers periodically in order to implement
+the time sharing strategy of the CPU scheduler.  Of course, if there are
+multiple runnable tasks assigned to one CPU at the same time, the only way to
+allow them to make reasonable progress in a given time frame is to make them
+share the available CPU time.  Namely, in rough approximation, each task is
+given a slice of the CPU time to run its code, subject to the scheduling class,
+prioritization and so on and when that time slice is used up, the CPU should be
+switched over to running (the code of) another task.  The currently running task
+may not want to give the CPU away voluntarily, however, and the scheduler tick
+is there to make the switch happen regardless.  That is not the only role of the
+tick, but it is the primary reason for using it.
+
+The scheduler tick is problematic from the CPU idle time management perspective,
+because it triggers periodically and relatively often (depending on the kernel
+configuration, the length of the tick period is between 1 ms and 10 ms).
+Thus, if the tick is allowed to trigger on idle CPUs, it will not make sense
+for them to ask the hardware to enter idle states with target residencies above
+the tick period length.  Moreover, in that case the idle duration of any CPU
+will never exceed the tick period length and the energy used for entering and
+exiting idle states due to the tick wakeups on idle CPUs will be wasted.
+
+Fortunately, it is not really necessary to allow the tick to trigger on idle
+CPUs, because (by definition) they have no tasks to run except for the special
+"idle" one.  In other words, from the CPU scheduler perspective, the only user
+of the CPU time on them is the idle loop.  Since the time of an idle CPU need
+not be shared between multiple runnable tasks, the primary reason for using the
+tick goes away if the given CPU is idle.  Consequently, it is possible to stop
+the scheduler tick entirely on idle CPUs in principle, even though that may not
+always be worth the effort.
+
+Whether or not it makes sense to stop the scheduler tick in the idle loop
+depends on what is expected by the governor.  First, if there is another
+(non-tick) timer due to trigger within the tick range, stopping the tick clearly
+would be a waste of time, even though the timer hardware may not need to be
+reprogrammed in that case.  Second, if the governor is expecting a non-timer
+wakeup within the tick range, stopping the tick is not necessary and it may even
+be harmful.  Namely, in that case the governor will select an idle state with
+the target residency within the time until the expected wakeup, so that state is
+going to be relatively shallow.  The governor really cannot select a deep idle
+state then, as that would contradict its own expectation of a wakeup in short
+order.  Now, if the wakeup really occurs shortly, stopping the tick would be a
+waste of time and in this case the timer hardware would need to be reprogrammed,
+which is expensive.  On the other hand, if the tick is stopped and the wakeup
+does not occur any time soon, the hardware may spend indefinite amount of time
+in the shallow idle state selected by the governor, which will be a waste of
+energy.  Hence, if the governor is expecting a wakeup of any kind within the
+tick range, it is better to allow the tick trigger.  Otherwise, however, the
+governor will select a relatively deep idle state, so the tick should be stopped
+so that it does not wake up the CPU too early.
+
+In any case, the governor knows what it is expecting and the decision on whether
+or not to stop the scheduler tick belongs to it.  Still, if the tick has been
+stopped already (in one of the previous iterations of the loop), it is better
+to leave it as is and the governor needs to take that into account.
+
+The kernel can be configured to disable stopping the scheduler tick in the idle
+loop altogether.  That can be done through the build-time configuration of it
+(by unsetting the ``CONFIG_NO_HZ_IDLE`` configuration option) or by passing
+``nohz=off`` to it in the command line.  In both cases, as the stopping of the
+scheduler tick is disabled, the governor's decisions regarding it are simply
+ignored by the idle loop code and the tick is never stopped.
+
+The systems that run kernels configured to allow the scheduler tick to be
+stopped on idle CPUs are referred to as *tickless* systems and they are
+generally regarded as more energy-efficient than the systems running kernels in
+which the tick cannot be stopped.  If the given system is tickless, it will use
+the ``menu`` governor by default and if it is not tickless, the default
+``CPUIdle`` governor on it will be ``ladder``.
+
+
+.. _menu-gov:
+
+The ``menu`` Governor
+=====================
+
+The ``menu`` governor is the default ``CPUIdle`` governor for tickless systems.
+It is quite complex, but the basic principle of its design is straightforward.
+Namely, when invoked to select an idle state for a CPU (i.e. an idle state that
+the CPU will ask the processor hardware to enter), it attempts to predict the
+idle duration and uses the predicted value for idle state selection.
+
+It first obtains the time until the closest timer event with the assumption
+that the scheduler tick will be stopped.  That time, referred to as the *sleep
+length* in what follows, is the upper bound on the time before the next CPU
+wakeup.  It is used to determine the sleep length range, which in turn is needed
+to get the sleep length correction factor.
+
+The ``menu`` governor maintains two arrays of sleep length correction factors.
+One of them is used when tasks previously running on the given CPU are waiting
+for some I/O operations to complete and the other one is used when that is not
+the case.  Each array contains several correction factor values that correspond
+to different sleep length ranges organized so that each range represented in the
+array is approximately 10 times wider than the previous one.
+
+The correction factor for the given sleep length range (determined before
+selecting the idle state for the CPU) is updated after the CPU has been woken
+up and the closer the sleep length is to the observed idle duration, the closer
+to 1 the correction factor becomes (it must fall between 0 and 1 inclusive).
+The sleep length is multiplied by the correction factor for the range that it
+falls into to obtain the first approximation of the predicted idle duration.
+
+Next, the governor uses a simple pattern recognition algorithm to refine its
+idle duration prediction.  Namely, it saves the last 8 observed idle duration
+values and, when predicting the idle duration next time, it computes the average
+and variance of them.  If the variance is small (smaller than 400 square
+milliseconds) or it is small relative to the average (the average is greater
+that 6 times the standard deviation), the average is regarded as the "typical
+interval" value.  Otherwise, the longest of the saved observed idle duration
+values is discarded and the computation is repeated for the remaining ones.
+Again, if the variance of them is small (in the above sense), the average is
+taken as the "typical interval" value and so on, until either the "typical
+interval" is determined or too many data points are disregarded, in which case
+the "typical interval" is assumed to equal "infinity" (the maximum unsigned
+integer value).  The "typical interval" computed this way is compared with the
+sleep length multiplied by the correction factor and the minimum of the two is
+taken as the predicted idle duration.
+
+Then, the governor computes an extra latency limit to help "interactive"
+workloads.  It uses the observation that if the exit latency of the selected
+idle state is comparable with the predicted idle duration, the total time spent
+in that state probably will be very short and the amount of energy to save by
+entering it will be relatively small, so likely it is better to avoid the
+overhead related to entering that state and exiting it.  Thus selecting a
+shallower state is likely to be a better option then.   The first approximation
+of the extra latency limit is the predicted idle duration itself which
+additionally is divided by a value depending on the number of tasks that
+previously ran on the given CPU and now they are waiting for I/O operations to
+complete.  The result of that division is compared with the latency limit coming
+from the power management quality of service, or `PM QoS <cpu-pm-qos_>`_,
+framework and the minimum of the two is taken as the limit for the idle states'
+exit latency.
+
+Now, the governor is ready to walk the list of idle states and choose one of
+them.  For this purpose, it compares the target residency of each state with
+the predicted idle duration and the exit latency of it with the computed latency
+limit.  It selects the state with the target residency closest to the predicted
+idle duration, but still below it, and exit latency that does not exceed the
+limit.
+
+In the final step the governor may still need to refine the idle state selection
+if it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_.  That
+happens if the idle duration predicted by it is less than the tick period and
+the tick has not been stopped already (in a previous iteration of the idle
+loop).  Then, the sleep length used in the previous computations may not reflect
+the real time until the closest timer event and if it really is greater than
+that time, the governor may need to select a shallower state with a suitable
+target residency.
+
+
+.. _teo-gov:
+
+The Timer Events Oriented (TEO) Governor
+========================================
+
+The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor
+for tickless systems.  It follows the same basic strategy as the ``menu`` `one
+<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
+given conditions.  However, it applies a different approach to that problem.
+
+First, it does not use sleep length correction factors, but instead it attempts
+to correlate the observed idle duration values with the available idle states
+and use that information to pick up the idle state that is most likely to
+"match" the upcoming CPU idle interval.   Second, it does not take the tasks
+that were running on the given CPU in the past and are waiting on some I/O
+operations to complete now at all (there is no guarantee that they will run on
+the same CPU when they become runnable again) and the pattern detection code in
+it avoids taking timer wakeups into account.  It also only uses idle duration
+values less than the current time till the closest timer (with the scheduler
+tick excluded) for that purpose.
+
+Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
+the *sleep length*, which is the time until the closest timer event with the
+assumption that the scheduler tick will be stopped (that also is the upper bound
+on the time until the next CPU wakeup).  That value is then used to preselect an
+idle state on the basis of three metrics maintained for each idle state provided
+by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
+
+The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
+state will "match" the observed (post-wakeup) idle duration if it "matches" the
+sleep length.  They both are subject to decay (after a CPU wakeup) every time
+the target residency of the idle state corresponding to them is less than or
+equal to the sleep length and the target residency of the next idle state is
+greater than the sleep length (that is, when the idle state corresponding to
+them "matches" the sleep length).  The ``hits`` metric is increased if the
+former condition is satisfied and the target residency of the given idle state
+is less than or equal to the observed idle duration and the target residency of
+the next idle state is greater than the observed idle duration at the same time
+(that is, it is increased when the given idle state "matches" both the sleep
+length and the observed idle duration).  In turn, the ``misses`` metric is
+increased when the given idle state "matches" the sleep length only and the
+observed idle duration is too short for its target residency.
+
+The ``early_hits`` metric measures the likelihood that a given idle state will
+"match" the observed (post-wakeup) idle duration if it does not "match" the
+sleep length.  It is subject to decay on every CPU wakeup and it is increased
+when the idle state corresponding to it "matches" the observed (post-wakeup)
+idle duration and the target residency of the next idle state is less than or
+equal to the sleep length (i.e. the idle state "matching" the sleep length is
+deeper than the given one).
+
+The governor walks the list of idle states provided by the ``CPUIdle`` driver
+and finds the last (deepest) one with the target residency less than or equal
+to the sleep length.  Then, the ``hits`` and ``misses`` metrics of that idle
+state are compared with each other and it is preselected if the ``hits`` one is
+greater (which means that that idle state is likely to "match" the observed idle
+duration after CPU wakeup).  If the ``misses`` one is greater, the governor
+preselects the shallower idle state with the maximum ``early_hits`` metric
+(or if there are multiple shallower idle states with equal ``early_hits``
+metric which also is the maximum, the shallowest of them will be preselected).
+[If there is a wakeup latency constraint coming from the `PM QoS framework
+<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
+target residency within the sleep length, the deepest idle state with the exit
+latency within the constraint is preselected without consulting the ``hits``,
+``misses`` and ``early_hits`` metrics.]
+
+Next, the governor takes several idle duration values observed most recently
+into consideration and if at least a half of them are greater than or equal to
+the target residency of the preselected idle state, that idle state becomes the
+final candidate to ask for.  Otherwise, the average of the most recent idle
+duration values below the target residency of the preselected idle state is
+computed and the governor walks the idle states shallower than the preselected
+one and finds the deepest of them with the target residency within that average.
+That idle state is then taken as the final candidate to ask for.
+
+Still, at this point the governor may need to refine the idle state selection if
+it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_.  That
+generally happens if the target residency of the idle state selected so far is
+less than the tick period and the tick has not been stopped already (in a
+previous iteration of the idle loop).  Then, like in the ``menu`` governor
+`case <menu-gov_>`_, the sleep length used in the previous computations may not
+reflect the real time until the closest timer event and if it really is greater
+than that time, a shallower state with a suitable target residency may need to
+be selected.
+
+
+.. _idle-states-representation:
+
+Representation of Idle States
+=============================
+
+For the CPU idle time management purposes all of the physical idle states
+supported by the processor have to be represented as a one-dimensional array of
+|struct cpuidle_state| objects each allowing an individual (logical) CPU to ask
+the processor hardware to enter an idle state of certain properties.  If there
+is a hierarchy of units in the processor, one |struct cpuidle_state| object can
+cover a combination of idle states supported by the units at different levels of
+the hierarchy.  In that case, the `target residency and exit latency parameters
+of it <idle-loop_>`_, must reflect the properties of the idle state at the
+deepest level (i.e. the idle state of the unit containing all of the other
+units).
+
+For example, take a processor with two cores in a larger unit referred to as
+a "module" and suppose that asking the hardware to enter a specific idle state
+(say "X") at the "core" level by one core will trigger the module to try to
+enter a specific idle state of its own (say "MX") if the other core is in idle
+state "X" already.  In other words, asking for idle state "X" at the "core"
+level gives the hardware a license to go as deep as to idle state "MX" at the
+"module" level, but there is no guarantee that this is going to happen (the core
+asking for idle state "X" may just end up in that state by itself instead).
+Then, the target residency of the |struct cpuidle_state| object representing
+idle state "X" must reflect the minimum time to spend in idle state "MX" of
+the module (including the time needed to enter it), because that is the minimum
+time the CPU needs to be idle to save any energy in case the hardware enters
+that state.  Analogously, the exit latency parameter of that object must cover
+the exit time of idle state "MX" of the module (and usually its entry time too),
+because that is the maximum delay between a wakeup signal and the time the CPU
+will start to execute the first new instruction (assuming that both cores in the
+module will always be ready to execute instructions as soon as the module
+becomes operational as a whole).
+
+There are processors without direct coordination between different levels of the
+hierarchy of units inside them, however.  In those cases asking for an idle
+state at the "core" level does not automatically affect the "module" level, for
+example, in any way and the ``CPUIdle`` driver is responsible for the entire
+handling of the hierarchy.  Then, the definition of the idle state objects is
+entirely up to the driver, but still the physical properties of the idle state
+that the processor hardware finally goes into must always follow the parameters
+used by the governor for idle state selection (for instance, the actual exit
+latency of that idle state must not exceed the exit latency parameter of the
+idle state object selected by the governor).
+
+In addition to the target residency and exit latency idle state parameters
+discussed above, the objects representing idle states each contain a few other
+parameters describing the idle state and a pointer to the function to run in
+order to ask the hardware to enter that state.  Also, for each
+|struct cpuidle_state| object, there is a corresponding
+:c:type:`struct cpuidle_state_usage <cpuidle_state_usage>` one containing usage
+statistics of the given idle state.  That information is exposed by the kernel
+via ``sysfs``.
+
+For each CPU in the system, there is a :file:`/sys/devices/system/cpu<N>/cpuidle/`
+directory in ``sysfs``, where the number ``<N>`` is assigned to the given
+CPU at the initialization time.  That directory contains a set of subdirectories
+called :file:`state0`, :file:`state1` and so on, up to the number of idle state
+objects defined for the given CPU minus one.  Each of these directories
+corresponds to one idle state object and the larger the number in its name, the
+deeper the (effective) idle state represented by it.  Each of them contains
+a number of files (attributes) representing the properties of the idle state
+object corresponding to it, as follows:
+
+``above``
+	Total number of times this idle state had been asked for, but the
+	observed idle duration was certainly too short to match its target
+	residency.
+
+``below``
+	Total number of times this idle state had been asked for, but cerainly
+	a deeper idle state would have been a better match for the observed idle
+	duration.
+
+``desc``
+	Description of the idle state.
+
+``disable``
+	Whether or not this idle state is disabled.
+
+``latency``
+	Exit latency of the idle state in microseconds.
+
+``name``
+	Name of the idle state.
+
+``power``
+	Power drawn by hardware in this idle state in milliwatts (if specified,
+	0 otherwise).
+
+``residency``
+	Target residency of the idle state in microseconds.
+
+``time``
+	Total time spent in this idle state by the given CPU (as measured by the
+	kernel) in microseconds.
+
+``usage``
+	Total number of times the hardware has been asked by the given CPU to
+	enter this idle state.
+
+The :file:`desc` and :file:`name` files both contain strings.  The difference
+between them is that the name is expected to be more concise, while the
+description may be longer and it may contain white space or special characters.
+The other files listed above contain integer numbers.
+
+The :file:`disable` attribute is the only writeable one.  If it contains 1, the
+given idle state is disabled for this particular CPU, which means that the
+governor will never select it for this particular CPU and the ``CPUIdle``
+driver will never ask the hardware to enter it for that CPU as a result.
+However, disabling an idle state for one CPU does not prevent it from being
+asked for by the other CPUs, so it must be disabled for all of them in order to
+never be asked for by any of them.  [Note that, due to the way the ``ladder``
+governor is implemented, disabling an idle state prevents that governor from
+selecting any idle states deeper than the disabled one too.]
+
+If the :file:`disable` attribute contains 0, the given idle state is enabled for
+this particular CPU, but it still may be disabled for some or all of the other
+CPUs in the system at the same time.  Writing 1 to it causes the idle state to
+be disabled for this particular CPU and writing 0 to it allows the governor to
+take it into consideration for the given CPU and the driver to ask for it,
+unless that state was disabled globally in the driver (in which case it cannot
+be used at all).
+
+The :file:`power` attribute is not defined very well, especially for idle state
+objects representing combinations of idle states at different levels of the
+hierarchy of units in the processor, and it generally is hard to obtain idle
+state power numbers for complex hardware, so :file:`power` often contains 0 (not
+available) and if it contains a nonzero number, that number may not be very
+accurate and it should not be relied on for anything meaningful.
+
+The number in the :file:`time` file generally may be greater than the total time
+really spent by the given CPU in the given idle state, because it is measured by
+the kernel and it may not cover the cases in which the hardware refused to enter
+this idle state and entered a shallower one instead of it (or even it did not
+enter any idle state at all).  The kernel can only measure the time span between
+asking the hardware to enter an idle state and the subsequent wakeup of the CPU
+and it cannot say what really happened in the meantime at the hardware level.
+Moreover, if the idle state object in question represents a combination of idle
+states at different levels of the hierarchy of units in the processor,
+the kernel can never say how deep the hardware went down the hierarchy in any
+particular case.  For these reasons, the only reliable way to find out how
+much time has been spent by the hardware in different idle states supported by
+it is to use idle state residency counters in the hardware, if available.
+
+
+.. _cpu-pm-qos:
+
+Power Management Quality of Service for CPUs
+============================================
+
+The power management quality of service (PM QoS) framework in the Linux kernel
+allows kernel code and user space processes to set constraints on various
+energy-efficiency features of the kernel to prevent performance from dropping
+below a required level.  The PM QoS constraints can be set globally, in
+predefined categories referred to as PM QoS classes, or against individual
+devices.
+
+CPU idle time management can be affected by PM QoS in two ways, through the
+global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
+resume latency constraints for individual CPUs.  Kernel code (e.g. device
+drivers) can set both of them with the help of special internal interfaces
+provided by the PM QoS framework.  User space can modify the former by opening
+the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
+a binary value (interpreted as a signed 32-bit integer) to it.  In turn, the
+resume latency constraint for a CPU can be modified by user space by writing a
+string (representing a signed 32-bit integer) to the
+:file:`power/pm_qos_resume_latency_us` file under
+:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
+``<N>`` is allocated at the system initialization time.  Negative values
+will be rejected in both cases and, also in both cases, the written integer
+number will be interpreted as a requested PM QoS constraint in microseconds.
+
+The requested value is not automatically applied as a new constraint, however,
+as it may be less restrictive (greater in this particular case) than another
+constraint previously requested by someone else.  For this reason, the PM QoS
+framework maintains a list of requests that have been made so far in each
+global class and for each device, aggregates them and applies the effective
+(minimum in this particular case) value as the new constraint.
+
+In fact, opening the :file:`cpu_dma_latency` special device file causes a new
+PM QoS request to be created and added to the priority list of requests in the
+``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
+"open" operation represents that request.  If that file descriptor is then
+used for writing, the number written to it will be associated with the PM QoS
+request represented by it as a new requested constraint value.  Next, the
+priority list mechanism will be used to determine the new effective value of
+the entire list of requests and that effective value will be set as a new
+constraint.  Thus setting a new requested constraint value will only change the
+real constraint if the effective "list" value is affected by it.  In particular,
+for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
+it is the minimum of the requested constraints in the list.  The process holding
+a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
+file controls the PM QoS request associated with that file descriptor, but it
+controls this particular PM QoS request only.
+
+Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
+file descriptor obtained while opening it, causes the PM QoS request associated
+with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
+class priority list and destroyed.  If that happens, the priority list mechanism
+will be used, again, to determine the new effective value for the whole list
+and that value will become the new real constraint.
+
+In turn, for each CPU there is only one resume latency PM QoS request
+associated with the :file:`power/pm_qos_resume_latency_us` file under
+:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
+this single PM QoS request to be updated regardless of which user space
+process does that.  In other words, this PM QoS request is shared by the entire
+user space, so access to the file associated with it needs to be arbitrated
+to avoid confusion.  [Arguably, the only legitimate use of this mechanism in
+practice is to pin a process to the CPU in question and let it use the
+``sysfs`` interface to control the resume latency constraint for it.]  It
+still only is a request, however.  It is a member of a priority list used to
+determine the effective value to be set as the resume latency constraint for the
+CPU in question every time the list of requests is updated this way or another
+(there may be other requests coming from kernel code in that list).
+
+CPU idle time governors are expected to regard the minimum of the global
+effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
+resume latency constraint for the given CPU as the upper limit for the exit
+latency of the idle states they can select for that CPU.  They should never
+select any idle states with exit latency beyond that limit.
+
+
+Idle States Control Via Kernel Command Line
+===========================================
+
+In addition to the ``sysfs`` interface allowing individual idle states to be
+`disabled for individual CPUs <idle-states-representation_>`_, there are kernel
+command line parameters affecting CPU idle time management.
+
+The ``cpuidle.off=1`` kernel command line option can be used to disable the
+CPU idle time management entirely.  It does not prevent the idle loop from
+running on idle CPUs, but it prevents the CPU idle time governors and drivers
+from being invoked.  If it is added to the kernel command line, the idle loop
+will ask the hardware to enter idle states on idle CPUs via the CPU architecture
+support code that is expected to provide a default mechanism for this purpose.
+That default mechanism usually is the least common denominator for all of the
+processors implementing the architecture (i.e. CPU instruction set) in question,
+however, so it is rather crude and not very energy-efficient.  For this reason,
+it is not recommended for production use.
+
+The ``cpuidle.governor=`` kernel command line switch allows the ``CPUIdle``
+governor to use to be specified.  It has to be appended with a string matching
+the name of an available governor (e.g. ``cpuidle.governor=menu``) and that
+governor will be used instead of the default one.  It is possible to force
+the ``menu`` governor to be used on the systems that use the ``ladder`` governor
+by default this way, for example.
+
+The other kernel command line parameters controlling CPU idle time management
+described below are only relevant for the *x86* architecture and some of
+them affect Intel processors only.
+
+The *x86* architecture support code recognizes three kernel command line
+options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
+and ``idle=nomwait``.  The first two of them disable the ``acpi_idle`` and
+``intel_idle`` drivers altogether, which effectively causes the entire
+``CPUIdle`` subsystem to be disabled and makes the idle loop invoke the
+architecture support code to deal with idle CPUs.  How it does that depends on
+which of the two parameters is added to the kernel command line.  In the
+``idle=halt`` case, the architecture support code will use the ``HLT``
+instruction of the CPUs (which, as a rule, suspends the execution of the program
+and causes the hardware to attempt to enter the shallowest available idle state)
+for this purpose, and if ``idle=poll`` is used, idle CPUs will execute a
+more or less ``lightweight'' sequence of instructions in a tight loop.  [Note
+that using ``idle=poll`` is somewhat drastic in many cases, as preventing idle
+CPUs from saving almost any energy at all may not be the only effect of it.
+For example, on Intel hardware it effectively prevents CPUs from using
+P-states (see |cpufreq|) that require any number of CPUs in a package to be
+idle, so it very well may hurt single-thread computations performance as well as
+energy-efficiency.  Thus using it for performance reasons may not be a good idea
+at all.]
+
+The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
+``acpi_idle`` to be used (as long as all of the information needed by it is
+there in the system's ACPI tables), but it is not allowed to use the
+``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
+
+In addition to the architecture-level kernel command line options affecting CPU
+idle time management, there are parameters affecting individual ``CPUIdle``
+drivers that can be passed to them via the kernel command line.  Specifically,
+the ``intel_idle.max_cstate=<n>`` and ``processor.max_cstate=<n>`` parameters,
+where ``<n>`` is an idle state index also used in the name of the given
+state's directory in ``sysfs`` (see
+`Representation of Idle States <idle-states-representation_>`_), causes the
+``intel_idle`` and ``acpi_idle`` drivers, respectively, to discard all of the
+idle states deeper than idle state ``<n>``.  In that case, they will never ask
+for any of those idle states or expose them to the governor.  [The behavior of
+the two drivers is different for ``<n>`` equal to ``0``.  Adding
+``intel_idle.max_cstate=0`` to the kernel command line disables the
+``intel_idle`` driver and allows ``acpi_idle`` to be used, whereas
+``processor.max_cstate=0`` is equivalent to ``processor.max_cstate=1``.
+Also, the ``acpi_idle`` driver is part of the ``processor`` kernel module that
+can be loaded separately and ``max_cstate=<n>`` can be passed to it as a module
+parameter when it is loaded.]
--- a/Show more
+++ b/Show more