Merge branch 'linux-mainline' into android-mainline-tmp
Change-Id: I4380c68c3474026a42ffa9f95c525f9a563ba7a3
|
|
@ -72,8 +72,14 @@ ForEachMacros:
|
|||
- 'apei_estatus_for_each_section'
|
||||
- 'ata_for_each_dev'
|
||||
- 'ata_for_each_link'
|
||||
- '__ata_qc_for_each'
|
||||
- 'ata_qc_for_each'
|
||||
- 'ata_qc_for_each_raw'
|
||||
- 'ata_qc_for_each_with_internal'
|
||||
- 'ax25_for_each'
|
||||
- 'ax25_uid_for_each'
|
||||
- '__bio_for_each_bvec'
|
||||
- 'bio_for_each_bvec'
|
||||
- 'bio_for_each_integrity_vec'
|
||||
- '__bio_for_each_segment'
|
||||
- 'bio_for_each_segment'
|
||||
|
|
@ -85,6 +91,7 @@ ForEachMacros:
|
|||
- 'blk_queue_for_each_rl'
|
||||
- 'bond_for_each_slave'
|
||||
- 'bond_for_each_slave_rcu'
|
||||
- 'bpf_for_each_spilled_reg'
|
||||
- 'btree_for_each_safe128'
|
||||
- 'btree_for_each_safe32'
|
||||
- 'btree_for_each_safe64'
|
||||
|
|
@ -103,6 +110,8 @@ ForEachMacros:
|
|||
- 'drm_atomic_crtc_for_each_plane'
|
||||
- 'drm_atomic_crtc_state_for_each_plane'
|
||||
- 'drm_atomic_crtc_state_for_each_plane_state'
|
||||
- 'drm_atomic_for_each_plane_damage'
|
||||
- 'drm_connector_for_each_possible_encoder'
|
||||
- 'drm_for_each_connector_iter'
|
||||
- 'drm_for_each_crtc'
|
||||
- 'drm_for_each_encoder'
|
||||
|
|
@ -111,21 +120,33 @@ ForEachMacros:
|
|||
- 'drm_for_each_legacy_plane'
|
||||
- 'drm_for_each_plane'
|
||||
- 'drm_for_each_plane_mask'
|
||||
- 'drm_for_each_privobj'
|
||||
- 'drm_mm_for_each_hole'
|
||||
- 'drm_mm_for_each_node'
|
||||
- 'drm_mm_for_each_node_in_range'
|
||||
- 'drm_mm_for_each_node_safe'
|
||||
- 'flow_action_for_each'
|
||||
- 'for_each_active_drhd_unit'
|
||||
- 'for_each_active_iommu'
|
||||
- 'for_each_available_child_of_node'
|
||||
- 'for_each_bio'
|
||||
- 'for_each_board_func_rsrc'
|
||||
- 'for_each_bvec'
|
||||
- 'for_each_card_components'
|
||||
- 'for_each_card_links'
|
||||
- 'for_each_card_links_safe'
|
||||
- 'for_each_card_prelinks'
|
||||
- 'for_each_card_rtds'
|
||||
- 'for_each_card_rtds_safe'
|
||||
- 'for_each_cgroup_storage_type'
|
||||
- 'for_each_child_of_node'
|
||||
- 'for_each_clear_bit'
|
||||
- 'for_each_clear_bit_from'
|
||||
- 'for_each_cmsghdr'
|
||||
- 'for_each_compatible_node'
|
||||
- 'for_each_component_dais'
|
||||
- 'for_each_component_dais_safe'
|
||||
- 'for_each_comp_order'
|
||||
- 'for_each_console'
|
||||
- 'for_each_cpu'
|
||||
- 'for_each_cpu_and'
|
||||
|
|
@ -133,10 +154,17 @@ ForEachMacros:
|
|||
- 'for_each_cpu_wrap'
|
||||
- 'for_each_dev_addr'
|
||||
- 'for_each_dma_cap_mask'
|
||||
- 'for_each_dpcm_be'
|
||||
- 'for_each_dpcm_be_rollback'
|
||||
- 'for_each_dpcm_be_safe'
|
||||
- 'for_each_dpcm_fe'
|
||||
- 'for_each_drhd_unit'
|
||||
- 'for_each_dss_dev'
|
||||
- 'for_each_efi_memory_desc'
|
||||
- 'for_each_efi_memory_desc_in_map'
|
||||
- 'for_each_element'
|
||||
- 'for_each_element_extid'
|
||||
- 'for_each_element_id'
|
||||
- 'for_each_endpoint_of_node'
|
||||
- 'for_each_evictable_lru'
|
||||
- 'for_each_fib6_node_rt_rcu'
|
||||
|
|
@ -149,6 +177,7 @@ ForEachMacros:
|
|||
- 'for_each_iommu'
|
||||
- 'for_each_ip_tunnel_rcu'
|
||||
- 'for_each_irq_nr'
|
||||
- 'for_each_link_codecs'
|
||||
- 'for_each_lru'
|
||||
- 'for_each_matching_node'
|
||||
- 'for_each_matching_node_and_match'
|
||||
|
|
@ -160,6 +189,7 @@ ForEachMacros:
|
|||
- 'for_each_mem_range_rev'
|
||||
- 'for_each_migratetype_order'
|
||||
- 'for_each_msi_entry'
|
||||
- 'for_each_msi_entry_safe'
|
||||
- 'for_each_net'
|
||||
- 'for_each_netdev'
|
||||
- 'for_each_netdev_continue'
|
||||
|
|
@ -172,6 +202,7 @@ ForEachMacros:
|
|||
- 'for_each_net_rcu'
|
||||
- 'for_each_new_connector_in_state'
|
||||
- 'for_each_new_crtc_in_state'
|
||||
- 'for_each_new_mst_mgr_in_state'
|
||||
- 'for_each_new_plane_in_state'
|
||||
- 'for_each_new_private_obj_in_state'
|
||||
- 'for_each_node'
|
||||
|
|
@ -183,12 +214,16 @@ ForEachMacros:
|
|||
- 'for_each_node_with_property'
|
||||
- 'for_each_of_allnodes'
|
||||
- 'for_each_of_allnodes_from'
|
||||
- 'for_each_of_cpu_node'
|
||||
- 'for_each_of_pci_range'
|
||||
- 'for_each_old_connector_in_state'
|
||||
- 'for_each_old_crtc_in_state'
|
||||
- 'for_each_old_mst_mgr_in_state'
|
||||
- 'for_each_oldnew_connector_in_state'
|
||||
- 'for_each_oldnew_crtc_in_state'
|
||||
- 'for_each_oldnew_mst_mgr_in_state'
|
||||
- 'for_each_oldnew_plane_in_state'
|
||||
- 'for_each_oldnew_plane_in_state_reverse'
|
||||
- 'for_each_oldnew_private_obj_in_state'
|
||||
- 'for_each_old_plane_in_state'
|
||||
- 'for_each_old_private_obj_in_state'
|
||||
|
|
@ -206,14 +241,21 @@ ForEachMacros:
|
|||
- 'for_each_process'
|
||||
- 'for_each_process_thread'
|
||||
- 'for_each_property_of_node'
|
||||
- 'for_each_registered_fb'
|
||||
- 'for_each_reserved_mem_region'
|
||||
- 'for_each_resv_unavail_range'
|
||||
- 'for_each_rtd_codec_dai'
|
||||
- 'for_each_rtd_codec_dai_rollback'
|
||||
- 'for_each_rtdcom'
|
||||
- 'for_each_rtdcom_safe'
|
||||
- 'for_each_set_bit'
|
||||
- 'for_each_set_bit_from'
|
||||
- 'for_each_sg'
|
||||
- 'for_each_sg_dma_page'
|
||||
- 'for_each_sg_page'
|
||||
- 'for_each_sibling_event'
|
||||
- 'for_each_subelement'
|
||||
- 'for_each_subelement_extid'
|
||||
- 'for_each_subelement_id'
|
||||
- '__for_each_thread'
|
||||
- 'for_each_thread'
|
||||
- 'for_each_zone'
|
||||
|
|
@ -223,6 +265,8 @@ ForEachMacros:
|
|||
- 'fwnode_for_each_child_node'
|
||||
- 'fwnode_graph_for_each_endpoint'
|
||||
- 'gadget_for_each_ep'
|
||||
- 'genradix_for_each'
|
||||
- 'genradix_for_each_from'
|
||||
- 'hash_for_each'
|
||||
- 'hash_for_each_possible'
|
||||
- 'hash_for_each_possible_rcu'
|
||||
|
|
@ -251,6 +295,8 @@ ForEachMacros:
|
|||
- 'hlist_nulls_for_each_entry_from'
|
||||
- 'hlist_nulls_for_each_entry_rcu'
|
||||
- 'hlist_nulls_for_each_entry_safe'
|
||||
- 'i3c_bus_for_each_i2cdev'
|
||||
- 'i3c_bus_for_each_i3cdev'
|
||||
- 'ide_host_for_each_port'
|
||||
- 'ide_port_for_each_dev'
|
||||
- 'ide_port_for_each_present_dev'
|
||||
|
|
@ -259,19 +305,25 @@ ForEachMacros:
|
|||
- 'idr_for_each_entry_ul'
|
||||
- 'inet_bind_bucket_for_each'
|
||||
- 'inet_lhash2_for_each_icsk_rcu'
|
||||
- 'iov_for_each'
|
||||
- 'key_for_each'
|
||||
- 'key_for_each_safe'
|
||||
- 'klp_for_each_func'
|
||||
- 'klp_for_each_func_safe'
|
||||
- 'klp_for_each_func_static'
|
||||
- 'klp_for_each_object'
|
||||
- 'klp_for_each_object_safe'
|
||||
- 'klp_for_each_object_static'
|
||||
- 'kvm_for_each_memslot'
|
||||
- 'kvm_for_each_vcpu'
|
||||
- 'list_for_each'
|
||||
- 'list_for_each_codec'
|
||||
- 'list_for_each_codec_safe'
|
||||
- 'list_for_each_entry'
|
||||
- 'list_for_each_entry_continue'
|
||||
- 'list_for_each_entry_continue_rcu'
|
||||
- 'list_for_each_entry_continue_reverse'
|
||||
- 'list_for_each_entry_from'
|
||||
- 'list_for_each_entry_from_rcu'
|
||||
- 'list_for_each_entry_from_reverse'
|
||||
- 'list_for_each_entry_lockless'
|
||||
- 'list_for_each_entry_rcu'
|
||||
|
|
@ -291,6 +343,9 @@ ForEachMacros:
|
|||
- 'media_device_for_each_intf'
|
||||
- 'media_device_for_each_link'
|
||||
- 'media_device_for_each_pad'
|
||||
- 'mp_bvec_for_each_page'
|
||||
- 'mp_bvec_for_each_segment'
|
||||
- 'nanddev_io_for_each_page'
|
||||
- 'netdev_for_each_lower_dev'
|
||||
- 'netdev_for_each_lower_private'
|
||||
- 'netdev_for_each_lower_private_rcu'
|
||||
|
|
@ -323,10 +378,10 @@ ForEachMacros:
|
|||
- 'protocol_for_each_card'
|
||||
- 'protocol_for_each_dev'
|
||||
- 'queue_for_each_hw_ctx'
|
||||
- 'radix_tree_for_each_contig'
|
||||
- 'radix_tree_for_each_slot'
|
||||
- 'radix_tree_for_each_tagged'
|
||||
- 'rbtree_postorder_for_each_entry_safe'
|
||||
- 'rdma_for_each_port'
|
||||
- 'resource_list_for_each_entry'
|
||||
- 'resource_list_for_each_entry_safe'
|
||||
- 'rhl_for_each_entry_rcu'
|
||||
|
|
@ -341,6 +396,7 @@ ForEachMacros:
|
|||
- 'rht_for_each_rcu'
|
||||
- 'rht_for_each_rcu_continue'
|
||||
- '__rq_for_each_bio'
|
||||
- 'rq_for_each_bvec'
|
||||
- 'rq_for_each_segment'
|
||||
- 'scsi_for_each_prot_sg'
|
||||
- 'scsi_for_each_sg'
|
||||
|
|
@ -358,12 +414,14 @@ ForEachMacros:
|
|||
- 'sk_nulls_for_each'
|
||||
- 'sk_nulls_for_each_from'
|
||||
- 'sk_nulls_for_each_rcu'
|
||||
- 'snd_array_for_each'
|
||||
- 'snd_pcm_group_for_each_entry'
|
||||
- 'snd_soc_dapm_widget_for_each_path'
|
||||
- 'snd_soc_dapm_widget_for_each_path_safe'
|
||||
- 'snd_soc_dapm_widget_for_each_sink_path'
|
||||
- 'snd_soc_dapm_widget_for_each_source_path'
|
||||
- 'tb_property_for_each'
|
||||
- 'tcf_exts_for_each_action'
|
||||
- 'udp_portaddr_for_each_entry'
|
||||
- 'udp_portaddr_for_each_entry_rcu'
|
||||
- 'usb_hub_for_each_child'
|
||||
|
|
@ -372,6 +430,13 @@ ForEachMacros:
|
|||
- 'v4l2_m2m_for_each_dst_buf_safe'
|
||||
- 'v4l2_m2m_for_each_src_buf'
|
||||
- 'v4l2_m2m_for_each_src_buf_safe'
|
||||
- 'virtio_device_for_each_vq'
|
||||
- 'xa_for_each'
|
||||
- 'xa_for_each_marked'
|
||||
- 'xa_for_each_start'
|
||||
- 'xas_for_each'
|
||||
- 'xas_for_each_conflict'
|
||||
- 'xas_for_each_marked'
|
||||
- 'zorro_for_each_dev'
|
||||
|
||||
#IncludeBlocks: Preserve # Unknown to clang-format-5.0
|
||||
|
|
|
|||
1
.gitignore
vendored
|
|
@ -15,6 +15,7 @@
|
|||
*.bin
|
||||
*.bz2
|
||||
*.c.[012]*.*
|
||||
*.dt.yaml
|
||||
*.dtb
|
||||
*.dtb.S
|
||||
*.dwo
|
||||
|
|
|
|||
30
.mailmap
|
|
@ -36,9 +36,10 @@ Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com>
|
|||
Ben Gardner <bgardner@wabtec.com>
|
||||
Ben M Cahill <ben.m.cahill@intel.com>
|
||||
Björn Steinbrink <B.Steinbrink@gmx.de>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <boris.brezillon@free-electrons.com>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon.dev@gmail.com>
|
||||
Boris Brezillon <boris.brezillon@bootlin.com> <b.brezillon@overkiz.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@bootlin.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <boris.brezillon@free-electrons.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon.dev@gmail.com>
|
||||
Boris Brezillon <bbrezillon@kernel.org> <b.brezillon@overkiz.com>
|
||||
Brian Avery <b.avery@hp.com>
|
||||
Brian King <brking@us.ibm.com>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
|
|
@ -47,7 +48,10 @@ Corey Minyard <minyard@acm.org>
|
|||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||
David Brownell <david-b@pacbell.net>
|
||||
David Woodhouse <dwmw2@shinybook.infradead.org>
|
||||
Deng-Cheng Zhu <dengcheng.zhu@mips.com> <dengcheng.zhu@imgtec.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@imgtec.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dczhu@mips.com>
|
||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@gmail.com>
|
||||
Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
|
||||
Domen Puncer <domen@coderock.org>
|
||||
Douglas Gilbert <dougg@torque.net>
|
||||
|
|
@ -119,6 +123,14 @@ Mark Brown <broonie@sirena.org.uk>
|
|||
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
|
||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
|
||||
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
|
||||
Mathieu Othacehe <m.othacehe@gmail.com>
|
||||
Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
|
||||
Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
|
||||
Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
|
||||
Matthew Wilcox <willy@infradead.org> <mawilcox@microsoft.com>
|
||||
Matthew Wilcox <willy@infradead.org> <willy@debian.org>
|
||||
Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
|
||||
Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
|
||||
Matthieu CASTET <castet.matthieu@free.fr>
|
||||
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
|
||||
Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
|
||||
|
|
@ -144,6 +156,8 @@ Morten Welinder <welinder@darter.rentec.com>
|
|||
Morten Welinder <welinder@troll.com>
|
||||
Mythri P K <mythripk@ti.com>
|
||||
Nguyen Anh Quynh <aquynh@gmail.com>
|
||||
Nicolas Pitre <nico@fluxnic.net> <nicolas.pitre@linaro.org>
|
||||
Nicolas Pitre <nico@fluxnic.net> <nico@linaro.org>
|
||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
Paul Burton <paul.burton@mips.com> <paul.burton@imgtec.com>
|
||||
|
|
@ -152,7 +166,13 @@ Peter Oruba <peter@oruba.de>
|
|||
Peter Oruba <peter.oruba@amd.com>
|
||||
Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com>
|
||||
Praveen BP <praveenbp@ti.com>
|
||||
Punit Agrawal <punitagrawal@gmail.com> <punit.agrawal@arm.com>
|
||||
Qais Yousef <qsyousef@gmail.com> <qais.yousef@imgtec.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <bug-track@fisher-privat.net>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
||||
Rajesh Shah <rajesh.shah@intel.com>
|
||||
Ralf Baechle <ralf@linux-mips.org>
|
||||
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
|
||||
|
|
@ -206,3 +226,5 @@ Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
|
|||
Yusuke Goda <goda.yusuke@renesas.com>
|
||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||
Gustavo Padovan <padovan@profusion.mobi>
|
||||
Changbin Du <changbin.du@intel.com> <changbin.du@intel.com>
|
||||
Changbin Du <changbin.du@intel.com> <changbin.du@gmail.com>
|
||||
|
|
|
|||
44
CREDITS
|
|
@ -842,10 +842,9 @@ D: ax25-utils maintainer.
|
|||
|
||||
N: Helge Deller
|
||||
E: deller@gmx.de
|
||||
E: hdeller@redhat.de
|
||||
D: PA-RISC Linux hacker, LASI-, ASP-, WAX-, LCD/LED-driver
|
||||
S: Schimmelsrain 1
|
||||
S: D-69231 Rauenberg
|
||||
W: http://www.parisc-linux.org/
|
||||
D: PA-RISC Linux architecture maintainer
|
||||
D: LASI-, ASP-, WAX-, LCD/LED-driver
|
||||
S: Germany
|
||||
|
||||
N: Jean Delvare
|
||||
|
|
@ -1222,7 +1221,7 @@ S: Brazil
|
|||
|
||||
N: Oded Gabbay
|
||||
E: oded.gabbay@gmail.com
|
||||
D: AMD KFD maintainer
|
||||
D: HabanaLabs and AMD KFD maintainer
|
||||
S: 12 Shraga Raphaeli
|
||||
S: Petah-Tikva, 4906418
|
||||
S: Israel
|
||||
|
|
@ -1361,7 +1360,7 @@ S: Stellenbosch, Western Cape
|
|||
S: South Africa
|
||||
|
||||
N: Grant Grundler
|
||||
E: grundler@parisc-linux.org
|
||||
E: grantgrundler@gmail.com
|
||||
W: http://obmouse.sourceforge.net/
|
||||
W: http://www.parisc-linux.org/
|
||||
D: obmouse - rewrote Olivier Florent's Omnibook 600 "pop-up" mouse driver
|
||||
|
|
@ -2138,6 +2137,10 @@ E: paul@laufernet.com
|
|||
D: Soundblaster driver fixes, ISAPnP quirk
|
||||
S: California, USA
|
||||
|
||||
N: Jarkko Lavinen
|
||||
E: jarkko.lavinen@nokia.com
|
||||
D: OMAP MMC support
|
||||
|
||||
N: Jonathan Layes
|
||||
D: ARPD support
|
||||
|
||||
|
|
@ -2200,6 +2203,16 @@ S: Post Office Box 371
|
|||
S: North Little Rock, Arkansas 72115
|
||||
S: USA
|
||||
|
||||
N: Christopher Li
|
||||
E: sparse@chrisli.org
|
||||
D: Sparse maintainer 2009 - 2018
|
||||
|
||||
N: Shaohua Li
|
||||
D: Worked on many parts of the kernel, from core x86, ACPI, PCI, KVM, MM,
|
||||
D: and much more. He was the maintainer of MD from 2016 to 2018. Shaohua
|
||||
D: passed away late 2018, he will be greatly missed.
|
||||
W: https://www.spinics.net/lists/raid/msg61993.html
|
||||
|
||||
N: Stephan Linz
|
||||
E: linz@mazet.de
|
||||
E: Stephan.Linz@gmx.de
|
||||
|
|
@ -2478,7 +2491,7 @@ S: Syracuse, New York 13206
|
|||
S: USA
|
||||
|
||||
N: Kyle McMartin
|
||||
E: kyle@parisc-linux.org
|
||||
E: kyle@mcmartin.ca
|
||||
D: Linux/PARISC hacker
|
||||
D: AD1889 sound driver
|
||||
S: Ottawa, Canada
|
||||
|
|
@ -2533,6 +2546,10 @@ S: Ormond
|
|||
S: Victoria 3163
|
||||
S: Australia
|
||||
|
||||
N: Eric Miao
|
||||
E: eric.y.miao@gmail.com
|
||||
D: MMP support
|
||||
|
||||
N: Pauline Middelink
|
||||
E: middelin@polyware.nl
|
||||
D: General low-level bug fixes, /proc fixes, identd support
|
||||
|
|
@ -3762,14 +3779,13 @@ S: 21513 Conradia Ct
|
|||
S: Cupertino, CA 95014
|
||||
S: USA
|
||||
|
||||
N: Thibaut Varene
|
||||
E: T-Bone@parisc-linux.org
|
||||
W: http://www.parisc-linux.org/~varenet/
|
||||
P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C FA2F 1E32 C3DA B7D2 F063
|
||||
N: Thibaut Varène
|
||||
E: hacks+kernel@slashdirt.org
|
||||
W: http://hacks.slashdirt.org/
|
||||
D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits
|
||||
D: Some ARM at91rm9200 bits, S1D13XXX FB driver, random patches here and there
|
||||
D: AD1889 sound driver
|
||||
S: Paris, France
|
||||
S: France
|
||||
|
||||
N: Heikki Vatiainen
|
||||
E: hessu@cs.tut.fi
|
||||
|
|
@ -4107,6 +4123,10 @@ S: 1507 145th Place SE #B5
|
|||
S: Bellevue, Washington 98007
|
||||
S: USA
|
||||
|
||||
N: Haojian Zhuang
|
||||
E: haojian.zhuang@gmail.com
|
||||
D: MMP support
|
||||
|
||||
N: Richard Zidlicky
|
||||
E: rz@linux-m68k.org, rdzidlic@geocities.com
|
||||
W: http://www.geocities.com/rdzidlic
|
||||
|
|
|
|||
|
|
@ -1,428 +0,0 @@
|
|||
|
||||
This is a brief list of all the files in ./linux/Documentation and what
|
||||
they contain. If you add a documentation file, please list it here in
|
||||
alphabetical order as well, or risk being hunted down like a rabid dog.
|
||||
Please keep the descriptions small enough to fit on one line.
|
||||
Thanks -- Paul G.
|
||||
|
||||
Following translations are available on the WWW:
|
||||
|
||||
- Japanese, maintained by the JF Project (jf@listserv.linux.or.jp), at
|
||||
http://linuxjf.sourceforge.jp/
|
||||
|
||||
00-INDEX
|
||||
- this file.
|
||||
ABI/
|
||||
- info on kernel <-> userspace ABI and relative interface stability.
|
||||
CodingStyle
|
||||
- nothing here, just a pointer to process/coding-style.rst.
|
||||
DMA-API.txt
|
||||
- DMA API, pci_ API & extensions for non-consistent memory machines.
|
||||
DMA-API-HOWTO.txt
|
||||
- Dynamic DMA mapping Guide
|
||||
DMA-ISA-LPC.txt
|
||||
- How to do DMA with ISA (and LPC) devices.
|
||||
DMA-attributes.txt
|
||||
- listing of the various possible attributes a DMA region can have
|
||||
EDID/
|
||||
- directory with info on customizing EDID for broken gfx/displays.
|
||||
IPMI.txt
|
||||
- info on Linux Intelligent Platform Management Interface (IPMI) Driver.
|
||||
IRQ-affinity.txt
|
||||
- how to select which CPU(s) handle which interrupt events on SMP.
|
||||
IRQ-domain.txt
|
||||
- info on interrupt numbering and setting up IRQ domains.
|
||||
IRQ.txt
|
||||
- description of what an IRQ is.
|
||||
Intel-IOMMU.txt
|
||||
- basic info on the Intel IOMMU virtualization support.
|
||||
Makefile
|
||||
- It's not of interest for those who aren't touching the build system.
|
||||
PCI/
|
||||
- info related to PCI drivers.
|
||||
RCU/
|
||||
- directory with info on RCU (read-copy update).
|
||||
SAK.txt
|
||||
- info on Secure Attention Keys.
|
||||
SM501.txt
|
||||
- Silicon Motion SM501 multimedia companion chip
|
||||
SubmittingPatches
|
||||
- nothing here, just a pointer to process/submitting-patches.rst.
|
||||
accounting/
|
||||
- documentation on accounting and taskstats.
|
||||
acpi/
|
||||
- info on ACPI-specific hooks in the kernel.
|
||||
admin-guide/
|
||||
- info related to Linux users and system admins.
|
||||
aoe/
|
||||
- description of AoE (ATA over Ethernet) along with config examples.
|
||||
arm/
|
||||
- directory with info about Linux on the ARM architecture.
|
||||
arm64/
|
||||
- directory with info about Linux on the 64 bit ARM architecture.
|
||||
auxdisplay/
|
||||
- misc. LCD driver documentation (cfag12864b, ks0108).
|
||||
backlight/
|
||||
- directory with info on controlling backlights in flat panel displays
|
||||
block/
|
||||
- info on the Block I/O (BIO) layer.
|
||||
blockdev/
|
||||
- info on block devices & drivers
|
||||
bt8xxgpio.txt
|
||||
- info on how to modify a bt8xx video card for GPIO usage.
|
||||
btmrvl.txt
|
||||
- info on Marvell Bluetooth driver usage.
|
||||
bus-devices/
|
||||
- directory with info on TI GPMC (General Purpose Memory Controller)
|
||||
bus-virt-phys-mapping.txt
|
||||
- how to access I/O mapped memory from within device drivers.
|
||||
cdrom/
|
||||
- directory with information on the CD-ROM drivers that Linux has.
|
||||
cgroup-v1/
|
||||
- cgroups v1 features, including cpusets and memory controller.
|
||||
cma/
|
||||
- Contiguous Memory Allocator (CMA) debugfs interface.
|
||||
conf.py
|
||||
- It's not of interest for those who aren't touching the build system.
|
||||
connector/
|
||||
- docs on the netlink based userspace<->kernel space communication mod.
|
||||
console/
|
||||
- documentation on Linux console drivers.
|
||||
core-api/
|
||||
- documentation on kernel core components.
|
||||
cpu-freq/
|
||||
- info on CPU frequency and voltage scaling.
|
||||
cpu-hotplug.txt
|
||||
- document describing CPU hotplug support in the Linux kernel.
|
||||
cpu-load.txt
|
||||
- document describing how CPU load statistics are collected.
|
||||
cpuidle/
|
||||
- info on CPU_IDLE, CPU idle state management subsystem.
|
||||
cputopology.txt
|
||||
- documentation on how CPU topology info is exported via sysfs.
|
||||
crc32.txt
|
||||
- brief tutorial on CRC computation
|
||||
crypto/
|
||||
- directory with info on the Crypto API.
|
||||
dcdbas.txt
|
||||
- information on the Dell Systems Management Base Driver.
|
||||
debugging-modules.txt
|
||||
- some notes on debugging modules after Linux 2.6.3.
|
||||
debugging-via-ohci1394.txt
|
||||
- how to use firewire like a hardware debugger memory reader.
|
||||
dell_rbu.txt
|
||||
- document demonstrating the use of the Dell Remote BIOS Update driver.
|
||||
dev-tools/
|
||||
- directory with info on development tools for the kernel.
|
||||
device-mapper/
|
||||
- directory with info on Device Mapper.
|
||||
dmaengine/
|
||||
- the DMA engine and controller API guides.
|
||||
devicetree/
|
||||
- directory with info on device tree files used by OF/PowerPC/ARM
|
||||
digsig.txt
|
||||
- info on the Digital Signature Verification API
|
||||
dma-buf-sharing.txt
|
||||
- the DMA Buffer Sharing API Guide
|
||||
docutils.conf
|
||||
- nothing here. Just a configuration file for docutils.
|
||||
dontdiff
|
||||
- file containing a list of files that should never be diff'ed.
|
||||
driver-api/
|
||||
- the Linux driver implementer's API guide.
|
||||
driver-model/
|
||||
- directory with info about Linux driver model.
|
||||
early-userspace/
|
||||
- info about initramfs, klibc, and userspace early during boot.
|
||||
efi-stub.txt
|
||||
- How to use the EFI boot stub to bypass GRUB or elilo on EFI systems.
|
||||
eisa.txt
|
||||
- info on EISA bus support.
|
||||
extcon/
|
||||
- directory with porting guide for Android kernel switch driver.
|
||||
isa.txt
|
||||
- info on ISA bus support.
|
||||
fault-injection/
|
||||
- dir with docs about the fault injection capabilities infrastructure.
|
||||
fb/
|
||||
- directory with info on the frame buffer graphics abstraction layer.
|
||||
features/
|
||||
- status of feature implementation on different architectures.
|
||||
filesystems/
|
||||
- info on the vfs and the various filesystems that Linux supports.
|
||||
firmware_class/
|
||||
- request_firmware() hotplug interface info.
|
||||
flexible-arrays.txt
|
||||
- how to make use of flexible sized arrays in linux
|
||||
fmc/
|
||||
- information about the FMC bus abstraction
|
||||
fpga/
|
||||
- FPGA Manager Core.
|
||||
futex-requeue-pi.txt
|
||||
- info on requeueing of tasks from a non-PI futex to a PI futex
|
||||
gcc-plugins.txt
|
||||
- GCC plugin infrastructure.
|
||||
gpio/
|
||||
- gpio related documentation
|
||||
gpu/
|
||||
- directory with information on GPU driver developer's guide.
|
||||
hid/
|
||||
- directory with information on human interface devices
|
||||
highuid.txt
|
||||
- notes on the change from 16 bit to 32 bit user/group IDs.
|
||||
hwspinlock.txt
|
||||
- hardware spinlock provides hardware assistance for synchronization
|
||||
timers/
|
||||
- info on the timer related topics
|
||||
hw_random.txt
|
||||
- info on Linux support for random number generator in i8xx chipsets.
|
||||
hwmon/
|
||||
- directory with docs on various hardware monitoring drivers.
|
||||
i2c/
|
||||
- directory with info about the I2C bus/protocol (2 wire, kHz speed).
|
||||
x86/i386/
|
||||
- directory with info about Linux on Intel 32 bit architecture.
|
||||
ia64/
|
||||
- directory with info about Linux on Intel 64 bit architecture.
|
||||
ide/
|
||||
- Information regarding the Enhanced IDE drive.
|
||||
iio/
|
||||
- info on industrial IIO configfs support.
|
||||
index.rst
|
||||
- main index for the documentation at ReST format.
|
||||
infiniband/
|
||||
- directory with documents concerning Linux InfiniBand support.
|
||||
input/
|
||||
- info on Linux input device support.
|
||||
intel_txt.txt
|
||||
- info on Intel Trusted Execution Technology (Intel TXT).
|
||||
io-mapping.txt
|
||||
- description of io_mapping functions in linux/io-mapping.h
|
||||
io_ordering.txt
|
||||
- info on ordering I/O writes to memory-mapped addresses.
|
||||
ioctl/
|
||||
- directory with documents describing various IOCTL calls.
|
||||
iostats.txt
|
||||
- info on I/O statistics Linux kernel provides.
|
||||
irqflags-tracing.txt
|
||||
- how to use the irq-flags tracing feature.
|
||||
isapnp.txt
|
||||
- info on Linux ISA Plug & Play support.
|
||||
isdn/
|
||||
- directory with info on the Linux ISDN support, and supported cards.
|
||||
kbuild/
|
||||
- directory with info about the kernel build process.
|
||||
kdump/
|
||||
- directory with mini HowTo on getting the crash dump code to work.
|
||||
doc-guide/
|
||||
- how to write and format reStructuredText kernel documentation
|
||||
kernel-per-CPU-kthreads.txt
|
||||
- List of all per-CPU kthreads and how they introduce jitter.
|
||||
kobject.txt
|
||||
- info of the kobject infrastructure of the Linux kernel.
|
||||
kprobes.txt
|
||||
- documents the kernel probes debugging feature.
|
||||
kref.txt
|
||||
- docs on adding reference counters (krefs) to kernel objects.
|
||||
laptops/
|
||||
- directory with laptop related info and laptop driver documentation.
|
||||
ldm.txt
|
||||
- a brief description of LDM (Windows Dynamic Disks).
|
||||
leds/
|
||||
- directory with info about LED handling under Linux.
|
||||
livepatch/
|
||||
- info on kernel live patching.
|
||||
locking/
|
||||
- directory with info about kernel locking primitives
|
||||
lockup-watchdogs.txt
|
||||
- info on soft and hard lockup detectors (aka nmi_watchdog).
|
||||
logo.gif
|
||||
- full colour GIF image of Linux logo (penguin - Tux).
|
||||
logo.txt
|
||||
- info on creator of above logo & site to get additional images from.
|
||||
lsm.txt
|
||||
- Linux Security Modules: General Security Hooks for Linux
|
||||
lzo.txt
|
||||
- kernel LZO decompressor input formats
|
||||
m68k/
|
||||
- directory with info about Linux on Motorola 68k architecture.
|
||||
mailbox.txt
|
||||
- How to write drivers for the common mailbox framework (IPC).
|
||||
md/
|
||||
- directory with info about Linux Software RAID
|
||||
media/
|
||||
- info on media drivers: uAPI, kAPI and driver documentation.
|
||||
memory-barriers.txt
|
||||
- info on Linux kernel memory barriers.
|
||||
memory-devices/
|
||||
- directory with info on parts like the Texas Instruments EMIF driver
|
||||
memory-hotplug.txt
|
||||
- Hotpluggable memory support, how to use and current status.
|
||||
men-chameleon-bus.txt
|
||||
- info on MEN chameleon bus.
|
||||
mic/
|
||||
- Intel Many Integrated Core (MIC) architecture device driver.
|
||||
mips/
|
||||
- directory with info about Linux on MIPS architecture.
|
||||
misc-devices/
|
||||
- directory with info about devices using the misc dev subsystem
|
||||
mmc/
|
||||
- directory with info about the MMC subsystem
|
||||
mtd/
|
||||
- directory with info about memory technology devices (flash)
|
||||
namespaces/
|
||||
- directory with various information about namespaces
|
||||
netlabel/
|
||||
- directory with information on the NetLabel subsystem.
|
||||
networking/
|
||||
- directory with info on various aspects of networking with Linux.
|
||||
nfc/
|
||||
- directory relating info about Near Field Communications support.
|
||||
nios2/
|
||||
- Linux on the Nios II architecture.
|
||||
nommu-mmap.txt
|
||||
- documentation about no-mmu memory mapping support.
|
||||
numastat.txt
|
||||
- info on how to read Numa policy hit/miss statistics in sysfs.
|
||||
ntb.txt
|
||||
- info on Non-Transparent Bridge (NTB) drivers.
|
||||
nvdimm/
|
||||
- info on non-volatile devices.
|
||||
nvmem/
|
||||
- info on non volatile memory framework.
|
||||
output/
|
||||
- default directory where html/LaTeX/pdf files will be written.
|
||||
padata.txt
|
||||
- An introduction to the "padata" parallel execution API
|
||||
parisc/
|
||||
- directory with info on using Linux on PA-RISC architecture.
|
||||
parport-lowlevel.txt
|
||||
- description and usage of the low level parallel port functions.
|
||||
pcmcia/
|
||||
- info on the Linux PCMCIA driver.
|
||||
percpu-rw-semaphore.txt
|
||||
- RCU based read-write semaphore optimized for locking for reading
|
||||
perf/
|
||||
- info about the APM X-Gene SoC Performance Monitoring Unit (PMU).
|
||||
phy/
|
||||
- info on Samsung USB 2.0 PHY adaptation layer.
|
||||
phy.txt
|
||||
- Description of the generic PHY framework.
|
||||
pi-futex.txt
|
||||
- documentation on lightweight priority inheritance futexes.
|
||||
pinctrl.txt
|
||||
- info on pinctrl subsystem and the PINMUX/PINCONF and drivers
|
||||
platform/
|
||||
- List of supported hardware by compal and Dell laptop.
|
||||
pnp.txt
|
||||
- Linux Plug and Play documentation.
|
||||
power/
|
||||
- directory with info on Linux PCI power management.
|
||||
powerpc/
|
||||
- directory with info on using Linux with the PowerPC.
|
||||
prctl/
|
||||
- directory with info on the privilege control subsystem
|
||||
preempt-locking.txt
|
||||
- info on locking under a preemptive kernel.
|
||||
process/
|
||||
- how to work with the mainline kernel development process.
|
||||
pps/
|
||||
- directory with information on the pulse-per-second support
|
||||
pti/
|
||||
- directory with info on Intel MID PTI.
|
||||
ptp/
|
||||
- directory with info on support for IEEE 1588 PTP clocks in Linux.
|
||||
pwm.txt
|
||||
- info on the pulse width modulation driver subsystem
|
||||
rapidio/
|
||||
- directory with info on RapidIO packet-based fabric interconnect
|
||||
rbtree.txt
|
||||
- info on what red-black trees are and what they are for.
|
||||
remoteproc.txt
|
||||
- info on how to handle remote processor (e.g. AMP) offloads/usage.
|
||||
rfkill.txt
|
||||
- info on the radio frequency kill switch subsystem/support.
|
||||
robust-futex-ABI.txt
|
||||
- documentation of the robust futex ABI.
|
||||
robust-futexes.txt
|
||||
- a description of what robust futexes are.
|
||||
rpmsg.txt
|
||||
- info on the Remote Processor Messaging (rpmsg) Framework
|
||||
rtc.txt
|
||||
- notes on how to use the Real Time Clock (aka CMOS clock) driver.
|
||||
s390/
|
||||
- directory with info on using Linux on the IBM S390.
|
||||
scheduler/
|
||||
- directory with info on the scheduler.
|
||||
scsi/
|
||||
- directory with info on Linux scsi support.
|
||||
security/
|
||||
- directory that contains security-related info
|
||||
serial/
|
||||
- directory with info on the low level serial API.
|
||||
sgi-ioc4.txt
|
||||
- description of the SGI IOC4 PCI (multi function) device.
|
||||
sh/
|
||||
- directory with info on porting Linux to a new architecture.
|
||||
smsc_ece1099.txt
|
||||
- info on the smsc Keyboard Scan Expansion/GPIO Expansion device.
|
||||
sound/
|
||||
- directory with info on sound card support.
|
||||
spi/
|
||||
- overview of Linux kernel Serial Peripheral Interface (SPI) support.
|
||||
sphinx/
|
||||
- no documentation here, just files required by Sphinx toolchain.
|
||||
sphinx-static/
|
||||
- no documentation here, just files required by Sphinx toolchain.
|
||||
static-keys.txt
|
||||
- info on how static keys allow debug code in hotpaths via patching
|
||||
svga.txt
|
||||
- short guide on selecting video modes at boot via VGA BIOS.
|
||||
sync_file.txt
|
||||
- Sync file API guide.
|
||||
sysctl/
|
||||
- directory with info on the /proc/sys/* files.
|
||||
target/
|
||||
- directory with info on generating TCM v4 fabric .ko modules
|
||||
tee.txt
|
||||
- info on the TEE subsystem and drivers
|
||||
this_cpu_ops.txt
|
||||
- List rationale behind and the way to use this_cpu operations.
|
||||
thermal/
|
||||
- directory with information on managing thermal issues (CPU/temp)
|
||||
trace/
|
||||
- directory with info on tracing technologies within linux
|
||||
translations/
|
||||
- translations of this document from English to another language
|
||||
unaligned-memory-access.txt
|
||||
- info on how to avoid arch breaking unaligned memory access in code.
|
||||
unshare.txt
|
||||
- description of the Linux unshare system call.
|
||||
usb/
|
||||
- directory with info regarding the Universal Serial Bus.
|
||||
vfio.txt
|
||||
- info on Virtual Function I/O used in guest/hypervisor instances.
|
||||
video-output.txt
|
||||
- sysfs class driver interface to enable/disable a video output device.
|
||||
virtual/
|
||||
- directory with information on the various linux virtualizations.
|
||||
vm/
|
||||
- directory with info on the Linux vm code.
|
||||
w1/
|
||||
- directory with documents regarding the 1-wire (w1) subsystem.
|
||||
watchdog/
|
||||
- how to auto-reboot Linux if it has "fallen and can't get up". ;-)
|
||||
wimax/
|
||||
- directory with info about Intel Wireless Wimax Connections
|
||||
core-api/workqueue.rst
|
||||
- information on the Concurrency Managed Workqueue implementation
|
||||
x86/x86_64/
|
||||
- directory with info on Linux support for AMD x86-64 (Hammer) machines.
|
||||
xillybus.txt
|
||||
- Overview and basic ui of xillybus driver
|
||||
xtensa/
|
||||
- directory with documents relating to arch/xtensa port/implementation
|
||||
xz.txt
|
||||
- how to make use of the XZ data compression within linux kernel
|
||||
zorro.txt
|
||||
- info on writing drivers for Zorro bus devices found on Amigas.
|
||||
22
Documentation/ABI/obsolete/sysfs-class-dax
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
What: /sys/class/dax/
|
||||
Date: May, 2016
|
||||
KernelVersion: v4.7
|
||||
Contact: linux-nvdimm@lists.01.org
|
||||
Description: Device DAX is the device-centric analogue of Filesystem
|
||||
DAX (CONFIG_FS_DAX). It allows memory ranges to be
|
||||
allocated and mapped without need of an intervening file
|
||||
system. Device DAX is strict, precise and predictable.
|
||||
Specifically this interface:
|
||||
|
||||
1/ Guarantees fault granularity with respect to a given
|
||||
page size (pte, pmd, or pud) set at configuration time.
|
||||
|
||||
2/ Enforces deterministic behavior by being strict about
|
||||
what fault scenarios are supported.
|
||||
|
||||
The /sys/class/dax/ interface enumerates all the
|
||||
device-dax instances in the system. The ABI is
|
||||
deprecated and will be removed after 2020. It is
|
||||
replaced with the DAX bus interface /sys/bus/dax/ where
|
||||
device-dax instances can be found under
|
||||
/sys/bus/dax/devices/
|
||||
|
|
@ -146,3 +146,36 @@ KernelVersion: 4.16
|
|||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description: Binary file created by uio_hv_generic for ring buffer
|
||||
Users: Userspace drivers
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_in_full
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of guest to host interrupts caused by the inbound ring
|
||||
buffer transitioning from full to not full while a packet is
|
||||
waiting for buffer space to become available
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/intr_out_empty
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of guest to host interrupts caused by the outbound ring
|
||||
buffer transitioning from empty to not empty
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_first
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Number of write operations that were the first to encounter an
|
||||
outbound ring buffer full condition
|
||||
Users: Debugging tools
|
||||
|
||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_full_total
|
||||
Date: February 2019
|
||||
KernelVersion: 5.0
|
||||
Contact: Michael Kelley <mikelley@microsoft.com>
|
||||
Description: Total number of write operations that encountered an outbound
|
||||
ring buffer full condition
|
||||
Users: Debugging tools
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ Description: This file shows ASIC health status. The possible values are:
|
|||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
cpld1_version
|
||||
cpld2_version
|
||||
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
|
|
@ -21,6 +20,40 @@ Description: These files show with which CPLD versions have been burned
|
|||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
fan_dir
|
||||
|
||||
Date: December 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: This file shows the system fans direction:
|
||||
forward direction - relevant bit is set 0;
|
||||
reversed direction - relevant bit is set 1.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
cpld3_version
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files show with which CPLD versions have been burned
|
||||
on LED board.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
jtag_enable
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files enable and disable the access to the JTAG domain.
|
||||
By default access to the JTAG domain is disabled.
|
||||
|
||||
The file is read/write.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/select_iio
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
|
|
@ -76,3 +109,21 @@ Description: These files show the system reset cause, as following: power
|
|||
reset cause.
|
||||
|
||||
The files are read only.
|
||||
|
||||
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
|
||||
reset_comex_pwr_fail
|
||||
reset_from_comex
|
||||
reset_system
|
||||
reset_voltmon_upgrade_fail
|
||||
|
||||
Date: November 2018
|
||||
KernelVersion: 5.0
|
||||
Contact: Vadim Pasternak <vadimp@mellanox.com>
|
||||
Description: These files show the system reset cause, as following: ComEx
|
||||
power fail, reset from ComEx, system platform reset, reset
|
||||
due to voltage monitor devices upgrade failure.
|
||||
Value 1 in file means this is reset cause, 0 - otherwise.
|
||||
Only one bit could be 1 at the same time, representing only
|
||||
the last reset cause.
|
||||
|
||||
The files are read only.
|
||||
|
|
|
|||
|
|
@ -25,38 +25,3 @@ Description:
|
|||
4.2.2.
|
||||
|
||||
The files are read only.
|
||||
|
||||
|
||||
What: /sys/bus/usb/drivers/usbtmc/*/TermChar
|
||||
Date: August 2008
|
||||
Contact: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
Description:
|
||||
This file is the TermChar value to be sent to the USB TMC
|
||||
device as described by the document, "Universal Serial Bus Test
|
||||
and Measurement Class Specification
|
||||
(USBTMC) Revision 1.0" as published by the USB-IF.
|
||||
|
||||
Note that the TermCharEnabled file determines if this value is
|
||||
sent to the device or not.
|
||||
|
||||
|
||||
What: /sys/bus/usb/drivers/usbtmc/*/TermCharEnabled
|
||||
Date: August 2008
|
||||
Contact: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
Description:
|
||||
This file determines if the TermChar is to be sent to the
|
||||
device on every transaction or not. For more details about
|
||||
this, please see the document, "Universal Serial Bus Test and
|
||||
Measurement Class Specification (USBTMC) Revision 1.0" as
|
||||
published by the USB-IF.
|
||||
|
||||
|
||||
What: /sys/bus/usb/drivers/usbtmc/*/auto_abort
|
||||
Date: August 2008
|
||||
Contact: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
||||
Description:
|
||||
This file determines if the transaction of the USB TMC
|
||||
device is to be automatically aborted if there is any error.
|
||||
For more details about this, please see the document,
|
||||
"Universal Serial Bus Test and Measurement Class Specification
|
||||
(USBTMC) Revision 1.0" as published by the USB-IF.
|
||||
|
|
|
|||
41
Documentation/ABI/testing/configfs-stp-policy-p_sys-t
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
What: /config/stp-policy/<device>:p_sys-t.<policy>/<node>/uuid
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Description:
|
||||
UUID source identifier string, RW.
|
||||
Default value is randomly generated at the mkdir <node> time.
|
||||
Data coming from trace sources that use this <node> will be
|
||||
tagged with this UUID in the MIPI SyS-T packet stream, to
|
||||
allow the decoder to discern between different sources
|
||||
within the same master/channel range, and identify the
|
||||
higher level decoders that may be needed for each source.
|
||||
|
||||
What: /config/stp-policy/<device>:p_sys-t.<policy>/<node>/do_len
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Description:
|
||||
Include payload length in the MIPI SyS-T header, boolean.
|
||||
If enabled, the SyS-T protocol encoder will include payload
|
||||
length in each packet's metadata. This is normally redundant
|
||||
if the underlying transport protocol supports marking message
|
||||
boundaries (which STP does), so this is off by default.
|
||||
|
||||
What: /config/stp-policy/<device>:p_sys-t.<policy>/<node>/ts_interval
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Description:
|
||||
Time interval in milliseconds. Include a timestamp in the
|
||||
MIPI SyS-T packet metadata, if this many milliseconds have
|
||||
passed since the previous packet from this source. Zero is
|
||||
the default and stands for "never send the timestamp".
|
||||
|
||||
What: /config/stp-policy/<device>:p_sys-t.<policy>/<node>/clocksync_interval
|
||||
Date: June 2018
|
||||
KernelVersion: 4.19
|
||||
Description:
|
||||
Time interval in milliseconds. Send a CLOCKSYNC packet if
|
||||
this many milliseconds have passed since the previous
|
||||
CLOCKSYNC packet from this source. Zero is the default and
|
||||
stands for "never send the CLOCKSYNC". It makes sense to
|
||||
use this option with sources that generate constant and/or
|
||||
periodic data, like stm_heartbeat.
|
||||
|
|
@ -12,6 +12,10 @@ Date: Dec 2014
|
|||
KernelVersion: 4.0
|
||||
Description: Control descriptors
|
||||
|
||||
All attributes read only:
|
||||
bInterfaceNumber - USB interface number for this
|
||||
streaming interface
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/control/class
|
||||
Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
|
|
@ -109,6 +113,10 @@ Date: Dec 2014
|
|||
KernelVersion: 4.0
|
||||
Description: Streaming descriptors
|
||||
|
||||
All attributes read only:
|
||||
bInterfaceNumber - USB interface number for this
|
||||
streaming interface
|
||||
|
||||
What: /config/usb-gadget/gadget/functions/uvc.name/streaming/class
|
||||
Date: Dec 2014
|
||||
KernelVersion: 4.0
|
||||
|
|
@ -160,6 +168,10 @@ Description: Specific MJPEG format descriptors
|
|||
|
||||
All attributes read only,
|
||||
except bmaControls and bDefaultFrameIndex:
|
||||
bFormatIndex - unique id for this format descriptor;
|
||||
only defined after parent header is
|
||||
linked into the streaming class;
|
||||
read-only
|
||||
bmaControls - this format's data for bmaControls in
|
||||
the streaming header
|
||||
bmInterfaceFlags - specifies interlace information,
|
||||
|
|
@ -177,6 +189,10 @@ Date: Dec 2014
|
|||
KernelVersion: 4.0
|
||||
Description: Specific MJPEG frame descriptors
|
||||
|
||||
bFrameIndex - unique id for this frame descriptor;
|
||||
only defined after parent format is
|
||||
linked into the streaming header;
|
||||
read-only
|
||||
dwFrameInterval - indicates how frame interval can be
|
||||
programmed; a number of values
|
||||
separated by newline can be specified
|
||||
|
|
@ -204,6 +220,10 @@ Date: Dec 2014
|
|||
KernelVersion: 4.0
|
||||
Description: Specific uncompressed format descriptors
|
||||
|
||||
bFormatIndex - unique id for this format descriptor;
|
||||
only defined after parent header is
|
||||
linked into the streaming class;
|
||||
read-only
|
||||
bmaControls - this format's data for bmaControls in
|
||||
the streaming header
|
||||
bmInterfaceFlags - specifies interlace information,
|
||||
|
|
@ -224,6 +244,10 @@ Date: Dec 2014
|
|||
KernelVersion: 4.0
|
||||
Description: Specific uncompressed frame descriptors
|
||||
|
||||
bFrameIndex - unique id for this frame descriptor;
|
||||
only defined after parent format is
|
||||
linked into the streaming header;
|
||||
read-only
|
||||
dwFrameInterval - indicates how frame interval can be
|
||||
programmed; a number of values
|
||||
separated by newline can be specified
|
||||
|
|
|
|||
126
Documentation/ABI/testing/debugfs-driver-habanalabs
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
What: /sys/kernel/debug/habanalabs/hl<n>/addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the device address to be used for read or write through
|
||||
PCI bar. The acceptable value is a string that starts with "0x"
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the currently allocated
|
||||
command buffers
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the currently active
|
||||
command submissions
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/command_submission_jobs
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with detailed information about each JOB (CB) of
|
||||
each active command submission
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/data32
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the root user to read or write directly through the
|
||||
device's PCI bar. Writing to this file generates a write
|
||||
transaction while reading from the file generates a read
|
||||
transaction. This custom interface is needed (instead of using
|
||||
the generic Linux user-space PCI mapping) because the DDR bar
|
||||
is very small compared to the DDR memory and only the driver can
|
||||
move the bar before and after the transaction
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/device
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Enables the root user to set the device to specific state.
|
||||
Valid values are "disable", "enable", "suspend", "resume".
|
||||
User can read this property to see the valid values
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C device address for I2C transaction that is generated
|
||||
by the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_bus
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C bus address for I2C transaction that is generated by
|
||||
the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_data
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Triggers an I2C transaction that is generated by the device's
|
||||
CPU. Writing to this file generates a write transaction while
|
||||
reading from the file generates a read transaction
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_reg
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets I2C register id for I2C transaction that is generated by
|
||||
the device's CPU
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led0
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the first S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led1
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the second S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/led2
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the state of the third S/W led on the device
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the hop values and physical address for a given ASID
|
||||
and virtual address. The user should write the ASID and VA into
|
||||
the file and then read the file to get the result.
|
||||
e.g. to display info about VA 0x1000 for ASID 1 you need to do:
|
||||
echo "1 0x1000" > /sys/kernel/debug/habanalabs/hl0/mmu
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/set_power_state
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Sets the PCI power state. Valid values are "1" for D0 and "2"
|
||||
for D3Hot
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/userptr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about the current user
|
||||
pointers (user virtual addresses) that are pinned and mapped
|
||||
to DMA addresses
|
||||
|
||||
What: /sys/kernel/debug/habanalabs/hl<n>/vm
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays a list with information about all the active virtual
|
||||
address mappings per ASID
|
||||
23
Documentation/ABI/testing/debugfs-wilco-ec
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
What: /sys/kernel/debug/wilco_ec/raw
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Description:
|
||||
Write and read raw mailbox commands to the EC.
|
||||
|
||||
For writing:
|
||||
Bytes 0-1 indicate the message type:
|
||||
00 F0 = Execute Legacy Command
|
||||
00 F2 = Read/Write NVRAM Property
|
||||
Byte 2 provides the command code
|
||||
Bytes 3+ consist of the data passed in the request
|
||||
|
||||
At least three bytes are required, for the msg type and command,
|
||||
with additional bytes optional for additional data.
|
||||
|
||||
Example:
|
||||
// Request EC info type 3 (EC firmware build date)
|
||||
$ echo 00 f0 38 00 03 00 > raw
|
||||
// View the result. The decoded ASCII result "12/21/18" is
|
||||
// included after the raw hex.
|
||||
$ cat raw
|
||||
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...
|
||||
|
|
@ -244,7 +244,7 @@ Description:
|
|||
|
||||
What: /sys/block/<disk>/queue/zoned
|
||||
Date: September 2016
|
||||
Contact: Damien Le Moal <damien.lemoal@hgst.com>
|
||||
Contact: Damien Le Moal <damien.lemoal@wdc.com>
|
||||
Description:
|
||||
zoned indicates if the device is a zoned block device
|
||||
and the zone model of the device if it is indeed zoned.
|
||||
|
|
@ -259,6 +259,14 @@ Description:
|
|||
zone commands, they will be treated as regular block
|
||||
devices and zoned will report "none".
|
||||
|
||||
What: /sys/block/<disk>/queue/nr_zones
|
||||
Date: November 2018
|
||||
Contact: Damien Le Moal <damien.lemoal@wdc.com>
|
||||
Description:
|
||||
nr_zones indicates the total number of zones of a zoned block
|
||||
device ("host-aware" or "host-managed" zone model). For regular
|
||||
block devices, the value is always 0.
|
||||
|
||||
What: /sys/block/<disk>/queue/chunk_sectors
|
||||
Date: September 2016
|
||||
Contact: Hannes Reinecke <hare@suse.com>
|
||||
|
|
@ -268,6 +276,15 @@ Description:
|
|||
indicates the size in 512B sectors of the RAID volume
|
||||
stripe segment. For a zoned block device, either
|
||||
host-aware or host-managed, chunk_sectors indicates the
|
||||
size of 512B sectors of the zones of the device, with
|
||||
size in 512B sectors of the zones of the device, with
|
||||
the eventual exception of the last zone of the device
|
||||
which may be smaller.
|
||||
|
||||
What: /sys/block/<disk>/queue/io_timeout
|
||||
Date: November 2018
|
||||
Contact: Weiping Zhang <zhangweiping@didiglobal.com>
|
||||
Description:
|
||||
io_timeout is the request timeout in milliseconds. If a request
|
||||
does not complete in this time then the block driver timeout
|
||||
handler is invoked. That timeout handler can decide to retry
|
||||
the request, to fail it or to start a device recovery strategy.
|
||||
|
|
|
|||
|
|
@ -98,3 +98,42 @@ Description:
|
|||
The backing_dev file is read-write and set up backing
|
||||
device for zram to write incompressible pages.
|
||||
For using, user should enable CONFIG_ZRAM_WRITEBACK.
|
||||
|
||||
What: /sys/block/zram<id>/idle
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
idle file is write-only and marks zram slot as idle.
|
||||
If system has mounted debugfs, user can see which slots
|
||||
are idle via /sys/kernel/debug/zram/zram<id>/block_state
|
||||
|
||||
What: /sys/block/zram<id>/writeback
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback file is write-only and triggers idle and/or
|
||||
huge page writeback to backing device.
|
||||
|
||||
What: /sys/block/zram<id>/bd_stat
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The bd_stat file is read-only and represents backing device's
|
||||
statistics (bd_count, bd_reads, bd_writes) in a format
|
||||
similar to block layer statistics file format.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit_enable
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit_enable file is read-write and specifies
|
||||
enable of writeback_limit feature. "1" means enable the feature.
|
||||
No limit "0" is the initial state.
|
||||
|
||||
What: /sys/block/zram<id>/writeback_limit
|
||||
Date: November 2018
|
||||
Contact: Minchan Kim <minchan@kernel.org>
|
||||
Description:
|
||||
The writeback_limit file is read-write and specifies the maximum
|
||||
amount of writeback ZRAM can do. The limit could be changed
|
||||
in run time.
|
||||
|
|
|
|||
146
Documentation/ABI/testing/sysfs-bus-i3c
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
What: /sys/bus/i3c/devices/i3c-<bus-id>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
An I3C bus. This directory will contain one sub-directory per
|
||||
I3C device present on the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/current_master
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the master that owns the bus (<bus-id>-<master-pid>) at
|
||||
the time this file is read. Note that bus ownership can change
|
||||
over time, so there's no guarantee that when the read() call
|
||||
returns, the value returned is still valid.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/mode
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
I3C bus mode. Can be "pure", "mixed-fast" or "mixed-slow". See
|
||||
the I3C specification for a detailed description of what each
|
||||
of these modes implies.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/i3c_scl_frequency
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
The frequency (expressed in Hz) of the SCL signal when
|
||||
operating in I3C SDR mode.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/i2c_scl_frequency
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
The frequency (expressed in Hz) of the SCL signal when
|
||||
operating in I2C mode.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/dynamic_address
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Dynamic address assigned to the master controller. This
|
||||
address may change if the bus is re-initialized.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/bcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
BCR stands for Bus Characteristics Register and expresses the
|
||||
device capabilities in terms of speed, maximum read/write
|
||||
length, etc. See the I3C specification for more details.
|
||||
This entry describes the BCR of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/dcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
DCR stands for Device Characteristics Register and expresses the
|
||||
device capabilities in terms of exposed features. See the I3C
|
||||
specification for more details.
|
||||
This entry describes the DCR of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/pid
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
PID stands for Provisional ID and is used to uniquely identify
|
||||
a device on a bus. This PID contains information about the
|
||||
vendor, the part and an instance ID so that several devices of
|
||||
the same type can be connected on the same bus.
|
||||
See the I3C specification for more details.
|
||||
This entry describes the PID of the master controller driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/hdrcap
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the HDR (High Data Rate) capabilities of a device.
|
||||
Returns a list of supported HDR modes, each element is separated
|
||||
by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
|
||||
See the I3C specification for more details about these HDR
|
||||
modes.
|
||||
This entry describes the HDRCAP of the master controller
|
||||
driving the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
An I3C device present on I3C bus identified by <bus-id>. Note
|
||||
that all devices are represented including the master driving
|
||||
the bus.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dynamic_address
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Dynamic address assigned to device <bus-id>-<device-pid>. This
|
||||
address may change if the bus is re-initialized.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/bcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
BCR stands for Bus Characteristics Register and expresses the
|
||||
device capabilities in terms of speed, maximum read/write
|
||||
length, etc. See the I3C specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/dcr
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
DCR stands for Device Characteristics Register and expresses the
|
||||
device capabilities in terms of exposed features. See the I3C
|
||||
specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/pid
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
PID stands for Provisional ID and is used to uniquely identify
|
||||
a device on a bus. This PID contains information about the
|
||||
vendor, the part and an instance ID so that several devices of
|
||||
the same type can be connected on the same bus.
|
||||
See the I3C specification for more details.
|
||||
|
||||
What: /sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>/hdrcap
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
Expose the HDR (High Data Rate) capabilities of a device.
|
||||
Returns a list of supported HDR modes, each element is separated
|
||||
by space. Modes can be "hdr-ddr", "hdr-tsp" and "hdr-tsl".
|
||||
See the I3C specification for more details about these HDR
|
||||
modes.
|
||||
|
||||
What: /sys/bus/i3c/devices/<bus-id>-<device-pid>
|
||||
KernelVersion: 5.0
|
||||
Contact: linux-i3c@vger.kernel.org
|
||||
Description:
|
||||
These directories are just symbolic links to
|
||||
/sys/bus/i3c/devices/i3c-<bus-id>/<bus-id>-<device-pid>.
|
||||
|
|
@ -199,7 +199,7 @@ Description:
|
|||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_positionrelative_x_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_positionrelative_y_raw
|
||||
KernelVersion: 4.18
|
||||
KernelVersion: 4.19
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Relative position in direction x or y on a pad (may be
|
||||
|
|
@ -1554,6 +1554,10 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_raw
|
|||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw
|
||||
KernelVersion: 4.3
|
||||
|
|
@ -1684,4 +1688,19 @@ KernelVersion: 4.18
|
|||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Raw (unscaled) phase difference reading from channel Y
|
||||
that can be processed to radians.
|
||||
that can be processed to radians.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm2p5_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm2p5_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm4_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm4_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm10_input
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm10_input
|
||||
KernelVersion: 4.22
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Mass concentration reading of particulate matter in ug / m3.
|
||||
pmX consists of particles with aerodynamic diameter less than or
|
||||
equal to X micrometers.
|
||||
|
|
|
|||
28
Documentation/ABI/testing/sysfs-bus-iio-sps30
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
||||
Date: December 2018
|
||||
KernelVersion: 4.22
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
||||
to its maximum speed and keeps spinning for about 10 seconds in
|
||||
order to blow out accumulated dust.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Sensor is capable of triggering self cleaning periodically.
|
||||
Period can be changed by writing a new value here. Upon reading
|
||||
the current one is returned. Units are seconds.
|
||||
|
||||
Writing 0 disables periodical self cleaning entirely.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/cleaning_period_available
|
||||
Date: January 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
The range of available values in seconds represented as the
|
||||
minimum value, the step and the maximum value, all enclosed in
|
||||
square brackets.
|
||||
|
|
@ -3,11 +3,13 @@ Date: June 2015
|
|||
KernelVersion: 4.3
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RW) Writes of 1 or 0 enable or disable trace output to this
|
||||
output device. Reads return current status.
|
||||
output device. Reads return current status. Requires that the
|
||||
corresponding output port driver be loaded.
|
||||
|
||||
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/port
|
||||
Date: June 2015
|
||||
KernelVersion: 4.3
|
||||
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||
Description: (RO) Port number, corresponding to this output device on the
|
||||
switch (GTH).
|
||||
switch (GTH) or "unassigned" if the corresponding output
|
||||
port driver is not loaded.
|
||||
|
|
|
|||
|
|
@ -323,3 +323,27 @@ Description:
|
|||
|
||||
This is similar to /sys/bus/pci/drivers_autoprobe, but
|
||||
affects only the VFs associated with a specific PF.
|
||||
|
||||
What: /sys/bus/pci/devices/.../p2pmem/size
|
||||
Date: November 2017
|
||||
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||
Description:
|
||||
If the device has any Peer-to-Peer memory registered, this
|
||||
file contains the total amount of memory that the device
|
||||
provides (in decimal).
|
||||
|
||||
What: /sys/bus/pci/devices/.../p2pmem/available
|
||||
Date: November 2017
|
||||
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||
Description:
|
||||
If the device has any Peer-to-Peer memory registered, this
|
||||
file contains the amount of memory that has not been
|
||||
allocated (in decimal).
|
||||
|
||||
What: /sys/bus/pci/devices/.../p2pmem/published
|
||||
Date: November 2017
|
||||
Contact: Logan Gunthorpe <logang@deltatee.com>
|
||||
Description:
|
||||
If the device has any Peer-to-Peer memory registered, this
|
||||
file contains a '1' if the memory has been published for
|
||||
use outside the driver that owns the device.
|
||||
|
|
|
|||
|
|
@ -21,6 +21,15 @@ Description: Holds a comma separated list of device unique_ids that
|
|||
If a device is authorized automatically during boot its
|
||||
boot attribute is set to 1.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../domainX/iommu_dma_protection
|
||||
Date: Mar 2019
|
||||
KernelVersion: 4.21
|
||||
Contact: thunderbolt-software@lists.01.org
|
||||
Description: This attribute tells whether the system uses IOMMU
|
||||
for DMA protection. Value of 1 means IOMMU is used, 0 means
|
||||
it is not (DMA protection is solely based on Thunderbolt
|
||||
security levels).
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../domainX/security
|
||||
Date: Sep 2017
|
||||
KernelVersion: 4.13
|
||||
|
|
|
|||
|
|
@ -186,9 +186,19 @@ Contact: Lan Tianyu <tianyu.lan@intel.com>
|
|||
Description:
|
||||
Some platforms provide usb port connect types through ACPI.
|
||||
This attribute is to expose this information to user space.
|
||||
The file will read "hotplug", "wired" and "not used" if the
|
||||
The file will read "hotplug", "hardwired" and "not used" if the
|
||||
information is available, and "unknown" otherwise.
|
||||
|
||||
What: /sys/bus/usb/devices/.../(hub interface)/portX/location
|
||||
Date: October 2018
|
||||
Contact: Bjørn Mork <bjorn@mork.no>
|
||||
Description:
|
||||
Some platforms provide usb port physical location through
|
||||
firmware. This is used by the kernel to pair up logical ports
|
||||
mapping to the same physical connector. The attribute exposes the
|
||||
raw location value as a hex integer.
|
||||
|
||||
|
||||
What: /sys/bus/usb/devices/.../(hub interface)/portX/quirks
|
||||
Date: May 2018
|
||||
Contact: Nicolas Boichat <drinkcat@chromium.org>
|
||||
|
|
@ -219,7 +229,14 @@ Description:
|
|||
ports and report them to the kernel. This attribute is to expose
|
||||
the number of over-current situations that occurred on a specific port
|
||||
to user space. This file will contain an unsigned 32 bit value
|
||||
which wraps to 0 after its maximum is reached.
|
||||
which wraps to 0 after its maximum is reached. This file supports
|
||||
poll() for monitoring changes to this value in user space.
|
||||
|
||||
Any time this value changes the corresponding hub device will send a
|
||||
udev event with the following attributes:
|
||||
|
||||
OVER_CURRENT_PORT=/sys/bus/usb/devices/.../(hub interface)/portX
|
||||
OVER_CURRENT_COUNT=[current value of this sysfs attribute]
|
||||
|
||||
What: /sys/bus/usb/devices/.../(hub interface)/portX/usb3_lpm_permit
|
||||
Date: November 2015
|
||||
|
|
|
|||
21
Documentation/ABI/testing/sysfs-bus-vmbus
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
What: /sys/bus/vmbus/devices/.../driver_override
|
||||
Date: August 2019
|
||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||
Description:
|
||||
This file allows the driver for a device to be specified which
|
||||
will override standard static and dynamic ID matching. When
|
||||
specified, only a driver with a name matching the value written
|
||||
to driver_override will have an opportunity to bind to the
|
||||
device. The override is specified by writing a string to the
|
||||
driver_override file (echo uio_hv_generic > driver_override) and
|
||||
may be cleared with an empty string (echo > driver_override).
|
||||
This returns the device to standard matching rules binding.
|
||||
Writing to driver_override does not automatically unbind the
|
||||
device from its current driver or make any attempt to
|
||||
automatically load the specified driver. If no driver with a
|
||||
matching name is currently loaded in the kernel, the device
|
||||
will not bind to any driver. This also allows devices to
|
||||
opt-out of driver binding using a driver_override name such as
|
||||
"none". Only a single driver may be specified in the override,
|
||||
there is no support for parsing delimiters.
|
||||
|
||||
32
Documentation/ABI/testing/sysfs-class-chromeos
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
What: /sys/class/chromeos/<ec-device-name>/flashinfo
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the EC flash information.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/kb_wake_angle
|
||||
Date: March 2018
|
||||
KernelVersion: 4.17
|
||||
Description:
|
||||
Control the keyboard wake lid angle. Values are between
|
||||
0 and 360. This file will also show the keyboard wake lid
|
||||
angle by querying the hardware.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/reboot
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Tell the EC to reboot in various ways. Options are:
|
||||
"cancel": Cancel a pending reboot.
|
||||
"ro": Jump to RO without rebooting.
|
||||
"rw": Jump to RW without rebooting.
|
||||
"cold": Cold reboot.
|
||||
"disable-jump": Disable jump until next reboot.
|
||||
"hibernate": Hibernate the EC.
|
||||
"at-shutdown": Reboot after an AP shutdown.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/version
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the information about the EC software and hardware.
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
What: /sys/class/chromeos/<ec-device-name>/lightbar/brightness
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Writing to this file adjusts the overall brightness of
|
||||
the lightbar, separate from any color intensity. The
|
||||
valid range is 0 (off) to 255 (maximum brightness).
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/interval_msec
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
The lightbar is controlled by an embedded controller (EC),
|
||||
which also manages the keyboard, battery charging, fans,
|
||||
and other system hardware. To prevent unprivileged users
|
||||
from interfering with the other EC functions, the rate at
|
||||
which the lightbar control files can be read or written is
|
||||
limited.
|
||||
|
||||
Reading this file will return the number of milliseconds
|
||||
that must elapse between accessing any of the lightbar
|
||||
functions through this interface. Going faster will simply
|
||||
block until the necessary interval has lapsed. The interval
|
||||
applies uniformly to all accesses of any kind by any user.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/led_rgb
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to control each LED segment. If the
|
||||
lightbar is already running one of the automatic
|
||||
sequences, you probably won’t see anything change because
|
||||
your color setting will be almost immediately replaced.
|
||||
To get useful results, you should stop the lightbar
|
||||
sequence first.
|
||||
|
||||
The values written to this file are sets of four integers,
|
||||
indicating LED, RED, GREEN, BLUE. The LED number is 0 to 3
|
||||
to select a single segment, or 4 to set all four segments
|
||||
to the same value at once. The RED, GREEN, and BLUE
|
||||
numbers should be in the range 0 (off) to 255 (maximum).
|
||||
You can update more than one segment at a time by writing
|
||||
more than one set of four integers.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/program
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to upload and run custom lightbar sequences.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/sequence
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
The Pixel lightbar has a number of built-in sequences
|
||||
that it displays under various conditions, such as at
|
||||
power on, shut down, or while running. Reading from this
|
||||
file displays the current sequence that the lightbar is
|
||||
displaying. Writing to this file allows you to change the
|
||||
sequence.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/userspace_control
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
This allows you to take control of the lightbar. This
|
||||
prevents the kernel from going through its normal
|
||||
sequences.
|
||||
|
||||
What: /sys/class/chromeos/<ec-device-name>/lightbar/version
|
||||
Date: August 2015
|
||||
KernelVersion: 4.2
|
||||
Description:
|
||||
Show the information about the lightbar version.
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
What: /sys/class/chromeos/<ec-device-name>/vbc/vboot_context
|
||||
Date: October 2015
|
||||
KernelVersion: 4.4
|
||||
Description:
|
||||
Read/write the verified boot context data included on a
|
||||
small nvram space on some EC implementations.
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
sysfs interface for the S6E63M0 AMOLED LCD panel driver
|
||||
-------------------------------------------------------
|
||||
|
||||
What: /sys/class/lcd/<lcd>/gamma_mode
|
||||
Date: May, 2010
|
||||
KernelVersion: v2.6.35
|
||||
Contact: dri-devel@lists.freedesktop.org
|
||||
Description:
|
||||
(RW) Read or write the gamma mode. Following three modes are
|
||||
supported:
|
||||
0 - gamma value 2.2,
|
||||
1 - gamma value 1.9 and
|
||||
2 - gamma value 1.7.
|
||||
|
||||
|
||||
What: /sys/class/lcd/<lcd>/gamma_table
|
||||
Date: May, 2010
|
||||
KernelVersion: v2.6.35
|
||||
Contact: dri-devel@lists.freedesktop.org
|
||||
Description:
|
||||
(RO) Displays the size of the gamma table i.e. the number of
|
||||
gamma modes available.
|
||||
|
||||
This is a backlight lcd driver. These interfaces are an extension to the API
|
||||
documented in Documentation/ABI/testing/sysfs-class-lcd and in
|
||||
Documentation/ABI/stable/sysfs-class-backlight (under
|
||||
/sys/class/backlight/<backlight>/).
|
||||
22
Documentation/ABI/testing/sysfs-class-led-driver-sc27xx
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
What: /sys/class/leds/<led>/hw_pattern
|
||||
Date: September 2018
|
||||
KernelVersion: 4.20
|
||||
Description:
|
||||
Specify a hardware pattern for the SC27XX LED. For the SC27XX
|
||||
LED controller, it only supports 4 stages to make a single
|
||||
hardware pattern, which is used to configure the rise time,
|
||||
high time, fall time and low time for the breathing mode.
|
||||
|
||||
For the breathing mode, the SC27XX LED only expects one brightness
|
||||
for the high stage. To be compatible with the hardware pattern
|
||||
format, we should set brightness as 0 for rise stage, fall
|
||||
stage and low stage.
|
||||
|
||||
Min stage duration: 125 ms
|
||||
Max stage duration: 31875 ms
|
||||
|
||||
Since the stage duration step is 125 ms, the duration should be
|
||||
a multiple of 125, like 125ms, 250ms, 375ms, 500ms ... 31875ms.
|
||||
|
||||
Thus the format of the hardware pattern values should be:
|
||||
"0 rise_duration brightness high_duration 0 fall_duration 0 low_duration".
|
||||
37
Documentation/ABI/testing/sysfs-class-led-trigger-pattern
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
What: /sys/class/leds/<led>/pattern
|
||||
Date: September 2018
|
||||
KernelVersion: 4.20
|
||||
Description:
|
||||
Specify a software pattern for the LED, that supports altering
|
||||
the brightness for the specified duration with one software
|
||||
timer. It can do gradual dimming and step change of brightness.
|
||||
|
||||
The pattern is given by a series of tuples, of brightness and
|
||||
duration (ms).
|
||||
|
||||
The exact format is described in:
|
||||
Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt
|
||||
|
||||
What: /sys/class/leds/<led>/hw_pattern
|
||||
Date: September 2018
|
||||
KernelVersion: 4.20
|
||||
Description:
|
||||
Specify a hardware pattern for the LED, for LED hardware that
|
||||
supports autonomously controlling brightness over time, according
|
||||
to some preprogrammed hardware patterns. It deactivates any active
|
||||
software pattern.
|
||||
|
||||
Since different LED hardware can have different semantics of
|
||||
hardware patterns, each driver is expected to provide its own
|
||||
description for the hardware patterns in their ABI documentation
|
||||
file.
|
||||
|
||||
What: /sys/class/leds/<led>/repeat
|
||||
Date: September 2018
|
||||
KernelVersion: 4.20
|
||||
Description:
|
||||
Specify a pattern repeat number. -1 means repeat indefinitely,
|
||||
other negative numbers and number 0 are invalid.
|
||||
|
||||
This file will always return the originally written repeat
|
||||
number.
|
||||
|
|
@ -91,6 +91,24 @@ Description:
|
|||
stacked (e.g: VLAN interfaces) but still have the same MAC
|
||||
address as their parent device.
|
||||
|
||||
What: /sys/class/net/<iface>/dev_port
|
||||
Date: February 2014
|
||||
KernelVersion: 3.15
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
Indicates the port number of this network device, formatted
|
||||
as a decimal value. Some NICs have multiple independent ports
|
||||
on the same PCI bus, device and function. This attribute allows
|
||||
userspace to distinguish the respective interfaces.
|
||||
|
||||
Note: some device drivers started to use 'dev_id' for this
|
||||
purpose since long before 3.15 and have not adopted the new
|
||||
attribute ever since. To query the port number, some tools look
|
||||
exclusively at 'dev_port', while others only consult 'dev_id'.
|
||||
If a network device has multiple client adapter ports as
|
||||
described in the previous paragraph and does not set this
|
||||
attribute to its port number, it's a kernel bug.
|
||||
|
||||
What: /sys/class/net/<iface>/dormant
|
||||
Date: March 2006
|
||||
KernelVersion: 2.6.17
|
||||
|
|
@ -117,7 +135,7 @@ Description:
|
|||
full: full duplex
|
||||
|
||||
Note: This attribute is only valid for interfaces that implement
|
||||
the ethtool get_settings method (mostly Ethernet).
|
||||
the ethtool get_link_ksettings method (mostly Ethernet).
|
||||
|
||||
What: /sys/class/net/<iface>/flags
|
||||
Date: April 2005
|
||||
|
|
@ -224,7 +242,7 @@ Description:
|
|||
an integer representing the link speed in Mbits/sec.
|
||||
|
||||
Note: this attribute is only valid for interfaces that implement
|
||||
the ethtool get_settings method (mostly Ethernet ).
|
||||
the ethtool get_link_ksettings method (mostly Ethernet).
|
||||
|
||||
What: /sys/class/net/<iface>/tx_queue_len
|
||||
Date: April 2005
|
||||
|
|
|
|||
7
Documentation/ABI/testing/sysfs-class-net-dsa
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
What: /sys/class/net/<iface>/dsa/tagging
|
||||
Date: August 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: netdev@vger.kernel.org
|
||||
Description:
|
||||
String indicating the type of tagging protocol used by the
|
||||
DSA slave network device.
|
||||
|
|
@ -49,3 +49,26 @@ Contact: Wim Van Sebroeck <wim@iguana.be>
|
|||
Description:
|
||||
It is a read only file. It is read to know about current
|
||||
value of timeout programmed.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout
|
||||
Date: December 2016
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read only file. It specifies the time in seconds before
|
||||
timeout when the pretimeout interrupt is delivered. Pretimeout
|
||||
is an optional feature.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout_available_governors
|
||||
Date: February 2017
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read only file. It shows the pretimeout governors
|
||||
available for this watchdog.
|
||||
|
||||
What: /sys/class/watchdog/watchdogn/pretimeout_governor
|
||||
Date: February 2017
|
||||
Contact: Wim Van Sebroeck <wim@iguana.be>
|
||||
Description:
|
||||
It is a read/write file. When read, the currently assigned
|
||||
pretimeout governor is returned. When written, it sets
|
||||
the pretimeout governor.
|
||||
|
|
|
|||
10
Documentation/ABI/testing/sysfs-devices-software_node
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
What: /sys/devices/.../software_node/
|
||||
Date: January 2019
|
||||
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
|
||||
Description:
|
||||
This directory contains the details about the device that are
|
||||
assigned in kernel (i.e. software), as opposed to the
|
||||
firmware_node directory which contains the details that are
|
||||
assigned for the device in firmware. The main attributes in the
|
||||
directory will show the properties the device has, and the
|
||||
relationship it has to some of the other devices.
|
||||
|
|
@ -145,6 +145,8 @@ What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name
|
|||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/usage
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/above
|
||||
/sys/devices/system/cpu/cpuX/cpuidle/stateN/below
|
||||
Date: September 2007
|
||||
KernelVersion: v2.6.24
|
||||
Contact: Linux power management list <linux-pm@vger.kernel.org>
|
||||
|
|
@ -166,6 +168,11 @@ Description:
|
|||
|
||||
usage: (RO) Number of times this state was entered (a count).
|
||||
|
||||
above: (RO) Number of times this state was entered, but the
|
||||
observed CPU idle duration was too short for it (a count).
|
||||
|
||||
below: (RO) Number of times this state was entered, but the
|
||||
observed CPU idle duration was too long for it (a count).
|
||||
|
||||
What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
|
||||
Date: February 2008
|
||||
|
|
|
|||
190
Documentation/ABI/testing/sysfs-driver-habanalabs
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
What: /sys/class/habanalabs/hl<n>/armcp_kernel_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Linux kernel running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/armcp_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the application running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/cpld_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's CPLD F/W
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/device_type
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the code name of the device according to its type.
|
||||
The supported values are: "GOYA"
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/eeprom
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: A binary file attribute that contains the contents of the
|
||||
on-board EEPROM
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/fuse_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the device's version from the eFuse
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/hard_reset
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Interface to trigger a hard-reset operation for the device.
|
||||
Hard-reset will reset ALL internal components of the device
|
||||
except for the PCI interface and the internal PLLs
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/hard_reset_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays how many times the device has undergone a hard-reset
|
||||
operation since the driver was loaded
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/high_pll
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency for MME, TPC
|
||||
and IC when the power management profile is set to "auto".
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/ic_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
Interconnect fabric. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device IC clock might be set to a lower value than the
|
||||
maximum. The user should read the ic_clk_curr to see the actual
|
||||
frequency value of the IC
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/ic_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the Interconnect fabric
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/infineon_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's power supply F/W code
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/max_power
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum power consumption of the
|
||||
device in milliwatts.
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/mme_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
MME compute engine. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device MME clock might be set to a lower value than the
|
||||
maximum. The user should read the mme_clk_curr to see the actual
|
||||
frequency value of the MME
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/mme_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the MME compute engine
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/pci_addr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the PCI address of the device. This is needed so the
|
||||
user would be able to open a device based on its PCI address
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/pm_mng_profile
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Power management profile. Values are "auto", "manual". In "auto"
|
||||
mode, the driver will set the maximum clock frequency to a high
|
||||
value when a user-space process opens the device's file (unless
|
||||
it was already opened by another process). The driver will set
|
||||
the max clock frequency to a low value when there are no user
|
||||
processes that are opened on the device's file. In "manual"
|
||||
mode, the user sets the maximum clock frequency by writing to
|
||||
ic_clk, mme_clk and tpc_clk
|
||||
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/preboot_btl_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the device's preboot F/W code
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/soft_reset
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Interface to trigger a soft-reset operation for the device.
|
||||
Soft-reset will reset only the compute and DMA engines of the
|
||||
device
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/soft_reset_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays how many times the device has undergone a soft-reset
|
||||
operation since the driver was loaded
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/status
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Status of the card: "Operational", "Malfunction", "In reset".
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/thermal_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the Device's thermal daemon
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/tpc_clk
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Allows the user to set the maximum clock frequency of the
|
||||
TPC compute engines. Writes to this parameter affect the device
|
||||
only when the power management profile is set to "manual" mode.
|
||||
The device TPC clock might be set to a lower value than the
|
||||
maximum. The user should read the tpc_clk_curr to see the actual
|
||||
frequency value of the TPC
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/tpc_clk_curr
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the current clock frequency of the TPC compute engines
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/uboot_ver
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Version of the u-boot running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/write_open_cnt
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
Contact: oded.gabbay@gmail.com
|
||||
Description: Displays the total number of user processes that are currently
|
||||
opened on the device's file
|
||||
|
|
@ -109,3 +109,10 @@ Description:
|
|||
write operation (since a 4k random write might turn
|
||||
into a much larger write due to the zeroout
|
||||
operation).
|
||||
|
||||
What: /sys/fs/ext4/<disk>/journal_task
|
||||
Date: February 2019
|
||||
Contact: "Theodore Ts'o" <tytso@mit.edu>
|
||||
Description:
|
||||
This file is read-only and shows the pid of journal thread in
|
||||
current pid-namespace or 0 if task is unreachable.
|
||||
|
|
|
|||
|
|
@ -86,12 +86,28 @@ Description:
|
|||
The unit size is one block, now only support configuring in range
|
||||
of [1, 512].
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
|
||||
Date: January 2019
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Set timeout to issue discard commands during umount.
|
||||
Default: 5 secs
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/max_victim_search
|
||||
Date: January 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
Description:
|
||||
Controls the number of trials to find a victim segment.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/migration_granularity
|
||||
Date: October 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Description:
|
||||
Controls migration granularity of garbage collection on large
|
||||
section, it can let GC move partial segment{s} of one section
|
||||
in one GC cycle, thereby dispersing heavy overhead GC into
|
||||
multiple lightweight ones.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/dir_level
|
||||
Date: March 2014
|
||||
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
|
||||
|
|
@ -121,7 +137,22 @@ What: /sys/fs/f2fs/<disk>/idle_interval
|
|||
Date: January 2016
|
||||
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
|
||||
Description:
|
||||
Controls the idle timing.
|
||||
Controls the idle timing for all paths other than
|
||||
discard and gc path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/discard_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for discard path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/gc_idle_interval
|
||||
Date: September 2018
|
||||
Contact: "Chao Yu" <yuchao0@huawei.com>
|
||||
Contact: "Sahitya Tummala" <stummala@codeaurora.org>
|
||||
Description:
|
||||
Controls the idle timing for gc path.
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/iostat_enable
|
||||
Date: August 2017
|
||||
|
|
|
|||
|
|
@ -33,18 +33,6 @@ Description:
|
|||
An attribute which indicates whether the patch is currently in
|
||||
transition.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/signal
|
||||
Date: Nov 2017
|
||||
KernelVersion: 4.15.0
|
||||
Contact: live-patching@vger.kernel.org
|
||||
Description:
|
||||
A writable attribute that allows administrator to affect the
|
||||
course of an existing transition. Writing 1 sends a fake
|
||||
signal to all remaining blocking tasks. The fake signal
|
||||
means that no proper signal is delivered (there is no data in
|
||||
signal pending structures). Tasks are interrupted or woken up,
|
||||
and forced to change their patched state.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/force
|
||||
Date: Nov 2017
|
||||
KernelVersion: 4.15.0
|
||||
|
|
|
|||
35
Documentation/ABI/testing/sysfs-platform-lg-laptop
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
What: /sys/devices/platform/lg-laptop/reader_mode
|
||||
Date: October 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: "Matan Ziv-Av" <matan@svgalib.org>
|
||||
Description:
|
||||
Control reader mode. 1 means on, 0 means off.
|
||||
|
||||
What: /sys/devices/platform/lg-laptop/fn_lock
|
||||
Date: October 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: "Matan Ziv-Av" <matan@svgalib.org>
|
||||
Description:
|
||||
Control FN lock mode. 1 means on, 0 means off.
|
||||
|
||||
What: /sys/devices/platform/lg-laptop/battery_care_limit
|
||||
Date: October 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: "Matan Ziv-Av" <matan@svgalib.org>
|
||||
Description:
|
||||
Maximal battery charge level. Accepted values are 80 or 100.
|
||||
|
||||
What: /sys/devices/platform/lg-laptop/fan_mode
|
||||
Date: October 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: "Matan Ziv-Av" <matan@svgalib.org>
|
||||
Description:
|
||||
Control fan mode. 1 for performance mode, 0 for silent mode.
|
||||
|
||||
What: /sys/devices/platform/lg-laptop/usb_charge
|
||||
Date: October 2018
|
||||
KernelVersion: 4.20
|
||||
Contact: "Matan Ziv-Av" <matan@svgalib.org>
|
||||
Description:
|
||||
Control USB port charging when device is turned off.
|
||||
1 means on, 0 means off.
|
||||
|
|
@ -99,7 +99,7 @@ Description:
|
|||
this file, the suspend image will be as small as possible.
|
||||
|
||||
Reading from this file will display the current image size
|
||||
limit, which is set to 500 MB by default.
|
||||
limit, which is set to around 2/5 of available RAM by default.
|
||||
|
||||
What: /sys/power/pm_trace
|
||||
Date: August 2006
|
||||
|
|
|
|||
|
|
@ -146,114 +146,75 @@ What about block I/O and networking buffers? The block I/O and
|
|||
networking subsystems make sure that the buffers they use are valid
|
||||
for you to DMA from/to.
|
||||
|
||||
DMA addressing limitations
|
||||
DMA addressing capabilities
|
||||
==========================
|
||||
|
||||
Does your device have any DMA addressing limitations? For example, is
|
||||
your device only capable of driving the low order 24-bits of address?
|
||||
If so, you need to inform the kernel of this fact.
|
||||
By default, the kernel assumes that your device can address 32-bits of DMA
|
||||
addressing. For a 64-bit capable device, this needs to be increased, and for
|
||||
a device with limitations, it needs to be decreased.
|
||||
|
||||
By default, the kernel assumes that your device can address the full
|
||||
32-bits. For a 64-bit capable device, this needs to be increased.
|
||||
And for a device with limitations, as discussed in the previous
|
||||
paragraph, it needs to be decreased.
|
||||
Special note about PCI: PCI-X specification requires PCI-X devices to support
|
||||
64-bit addressing (DAC) for all transactions. And at least one platform (SGI
|
||||
SN2) requires 64-bit consistent allocations to operate correctly when the IO
|
||||
bus is in PCI-X mode.
|
||||
|
||||
Special note about PCI: PCI-X specification requires PCI-X devices to
|
||||
support 64-bit addressing (DAC) for all transactions. And at least
|
||||
one platform (SGI SN2) requires 64-bit consistent allocations to
|
||||
operate correctly when the IO bus is in PCI-X mode.
|
||||
For correct operation, you must set the DMA mask to inform the kernel about
|
||||
your device's DMA addressing capabilities.
|
||||
|
||||
For correct operation, you must interrogate the kernel in your device
|
||||
probe routine to see if the DMA controller on the machine can properly
|
||||
support the DMA addressing limitation your device has. It is good
|
||||
style to do this even if your device holds the default setting,
|
||||
because this shows that you did think about these issues wrt. your
|
||||
device.
|
||||
|
||||
The query is performed via a call to dma_set_mask_and_coherent()::
|
||||
This is performed via a call to dma_set_mask_and_coherent()::
|
||||
|
||||
int dma_set_mask_and_coherent(struct device *dev, u64 mask);
|
||||
|
||||
which will query the mask for both streaming and coherent APIs together.
|
||||
If you have some special requirements, then the following two separate
|
||||
queries can be used instead:
|
||||
which will set the mask for both streaming and coherent APIs together. If you
|
||||
have some special requirements, then the following two separate calls can be
|
||||
used instead:
|
||||
|
||||
The query for streaming mappings is performed via a call to
|
||||
The setup for streaming mappings is performed via a call to
|
||||
dma_set_mask()::
|
||||
|
||||
int dma_set_mask(struct device *dev, u64 mask);
|
||||
|
||||
The query for consistent allocations is performed via a call
|
||||
The setup for consistent allocations is performed via a call
|
||||
to dma_set_coherent_mask()::
|
||||
|
||||
int dma_set_coherent_mask(struct device *dev, u64 mask);
|
||||
|
||||
Here, dev is a pointer to the device struct of your device, and mask
|
||||
is a bit mask describing which bits of an address your device
|
||||
supports. It returns zero if your card can perform DMA properly on
|
||||
the machine given the address mask you provided. In general, the
|
||||
device struct of your device is embedded in the bus-specific device
|
||||
struct of your device. For example, &pdev->dev is a pointer to the
|
||||
device struct of a PCI device (pdev is a pointer to the PCI device
|
||||
struct of your device).
|
||||
Here, dev is a pointer to the device struct of your device, and mask is a bit
|
||||
mask describing which bits of an address your device supports. Often the
|
||||
device struct of your device is embedded in the bus-specific device struct of
|
||||
your device. For example, &pdev->dev is a pointer to the device struct of a
|
||||
PCI device (pdev is a pointer to the PCI device struct of your device).
|
||||
|
||||
If it returns non-zero, your device cannot perform DMA properly on
|
||||
this platform, and attempting to do so will result in undefined
|
||||
behavior. You must either use a different mask, or not use DMA.
|
||||
These calls usually return zero to indicate your device can perform DMA
|
||||
properly on the machine given the address mask you provided, but they might
|
||||
return an error if the mask is too small to be supportable on the given
|
||||
system. If it returns non-zero, your device cannot perform DMA properly on
|
||||
this platform, and attempting to do so will result in undefined behavior.
|
||||
You must not use DMA on this device unless the dma_set_mask family of
|
||||
functions has returned success.
|
||||
|
||||
This means that in the failure case, you have three options:
|
||||
This means that in the failure case, you have two options:
|
||||
|
||||
1) Use another DMA mask, if possible (see below).
|
||||
2) Use some non-DMA mode for data transfer, if possible.
|
||||
3) Ignore this device and do not initialize it.
|
||||
1) Use some non-DMA mode for data transfer, if possible.
|
||||
2) Ignore this device and do not initialize it.
|
||||
|
||||
It is recommended that your driver print a kernel KERN_WARNING message
|
||||
when you end up performing either #2 or #3. In this manner, if a user
|
||||
of your driver reports that performance is bad or that the device is not
|
||||
even detected, you can ask them for the kernel messages to find out
|
||||
exactly why.
|
||||
It is recommended that your driver print a kernel KERN_WARNING message when
|
||||
setting the DMA mask fails. In this manner, if a user of your driver reports
|
||||
that performance is bad or that the device is not even detected, you can ask
|
||||
them for the kernel messages to find out exactly why.
|
||||
|
||||
The standard 32-bit addressing device would do something like this::
|
||||
The standard 64-bit addressing device would do something like this::
|
||||
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
Another common scenario is a 64-bit capable device. The approach here
|
||||
is to try for 64-bit addressing, but back down to a 32-bit mask that
|
||||
should not fail. The kernel may fail the 64-bit mask not because the
|
||||
platform is not capable of 64-bit addressing. Rather, it may fail in
|
||||
this case simply because 32-bit addressing is done more efficiently
|
||||
than 64-bit addressing. For example, Sparc64 PCI SAC addressing is
|
||||
more efficient than DAC addressing.
|
||||
If the device only supports 32-bit addressing for descriptors in the
|
||||
coherent allocations, but supports full 64-bits for streaming mappings
|
||||
it would look like this::
|
||||
|
||||
Here is how you would handle a 64-bit capable device which can drive
|
||||
all 64-bits when accessing streaming DMA::
|
||||
|
||||
int using_dac;
|
||||
|
||||
if (!dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
} else if (!dma_set_mask(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
} else {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
||||
If a card is capable of using 64-bit consistent allocations as well,
|
||||
the case would look like this::
|
||||
|
||||
int using_dac, consistent_using_dac;
|
||||
|
||||
if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) {
|
||||
using_dac = 1;
|
||||
consistent_using_dac = 1;
|
||||
} else if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
|
||||
using_dac = 0;
|
||||
consistent_using_dac = 0;
|
||||
} else {
|
||||
if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
|
||||
dev_warn(dev, "mydev: No suitable DMA available\n");
|
||||
goto ignore_this_device;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,15 +58,6 @@ specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
|
|||
implementation may choose to ignore flags that affect the location of
|
||||
the returned memory, like GFP_DMA).
|
||||
|
||||
::
|
||||
|
||||
void *
|
||||
dma_zalloc_coherent(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t flag)
|
||||
|
||||
Wraps dma_alloc_coherent() and also zeroes the returned memory if the
|
||||
allocation attempt succeeded.
|
||||
|
||||
::
|
||||
|
||||
void
|
||||
|
|
@ -204,6 +195,14 @@ Requesting the required mask does not alter the current mask. If you
|
|||
wish to take advantage of it, you should issue a dma_set_mask()
|
||||
call to set the mask to the value returned.
|
||||
|
||||
::
|
||||
|
||||
size_t
|
||||
dma_direct_max_mapping_size(struct device *dev);
|
||||
|
||||
Returns the maximum size of a mapping for the device. The size parameter
|
||||
of the mapping functions like dma_map_single(), dma_map_page() and
|
||||
others should not be larger than the returned value.
|
||||
|
||||
Part Id - Streaming DMA mappings
|
||||
--------------------------------
|
||||
|
|
@ -539,8 +538,8 @@ that simply cannot make consistent memory.
|
|||
dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
|
||||
dma_addr_t dma_handle, unsigned long attrs)
|
||||
|
||||
Free memory allocated by the dma_alloc_attrs(). All parameters common
|
||||
parameters must identical to those otherwise passed to dma_fre_coherent,
|
||||
Free memory allocated by the dma_alloc_attrs(). All common
|
||||
parameters must be identical to those otherwise passed to dma_free_coherent,
|
||||
and the attrs argument must be identical to the attrs passed to
|
||||
dma_alloc_attrs().
|
||||
|
||||
|
|
@ -575,8 +574,7 @@ boundaries when doing this.
|
|||
|
||||
int
|
||||
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
|
||||
dma_addr_t device_addr, size_t size, int
|
||||
flags)
|
||||
dma_addr_t device_addr, size_t size);
|
||||
|
||||
Declare region of memory to be handed out by dma_alloc_coherent() when
|
||||
it's asked for coherent memory for this device.
|
||||
|
|
@ -590,12 +588,6 @@ dma_addr_t in dma_alloc_coherent()).
|
|||
|
||||
size is the size of the area (must be multiples of PAGE_SIZE).
|
||||
|
||||
flags can be ORed together and are:
|
||||
|
||||
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
|
||||
Do not allow dma_alloc_coherent() to fall back to system memory when
|
||||
it's out of memory in the declared region.
|
||||
|
||||
As a simplification for the platforms, only *one* such region of
|
||||
memory may be declared per device.
|
||||
|
||||
|
|
@ -614,23 +606,6 @@ unconditionally having removed all the required structures. It is the
|
|||
driver's job to ensure that no parts of this memory region are
|
||||
currently in use.
|
||||
|
||||
::
|
||||
|
||||
void *
|
||||
dma_mark_declared_memory_occupied(struct device *dev,
|
||||
dma_addr_t device_addr, size_t size)
|
||||
|
||||
This is used to occupy specific regions of the declared space
|
||||
(dma_alloc_coherent() will hand out the first free region it finds).
|
||||
|
||||
device_addr is the *device* address of the region requested.
|
||||
|
||||
size is the size (and should be a page-sized multiple).
|
||||
|
||||
The return value will be either a pointer to the processor virtual
|
||||
address of the memory, or an error (via PTR_ERR()) if any part of the
|
||||
region is occupied.
|
||||
|
||||
Part III - Debug drivers use of the DMA-API
|
||||
-------------------------------------------
|
||||
|
||||
|
|
@ -705,6 +680,9 @@ dma-api/disabled This read-only file contains the character 'Y'
|
|||
happen when it runs out of memory or if it was
|
||||
disabled at boot time
|
||||
|
||||
dma-api/dump This read-only file contains current DMA
|
||||
mappings.
|
||||
|
||||
dma-api/error_count This file is read-only and shows the total
|
||||
numbers of errors found.
|
||||
|
||||
|
|
@ -717,13 +695,16 @@ dma-api/num_errors The number in this file shows how many
|
|||
dma-api/min_free_entries This read-only file can be read to get the
|
||||
minimum number of free dma_debug_entries the
|
||||
allocator has ever seen. If this value goes
|
||||
down to zero the code will disable itself
|
||||
because it is not longer reliable.
|
||||
down to zero the code will attempt to increase
|
||||
nr_total_entries to compensate.
|
||||
|
||||
dma-api/num_free_entries The current number of free dma_debug_entries
|
||||
in the allocator.
|
||||
|
||||
dma-api/driver-filter You can write a name of a driver into this file
|
||||
dma-api/nr_total_entries The total number of dma_debug_entries in the
|
||||
allocator, both free and used.
|
||||
|
||||
dma-api/driver_filter You can write a name of a driver into this file
|
||||
to limit the debug output to requests from that
|
||||
particular driver. Write an empty string to
|
||||
that file to disable the filter and see
|
||||
|
|
@ -742,10 +723,15 @@ driver filter at boot time. The debug code will only print errors for that
|
|||
driver afterwards. This filter can be disabled or changed later using debugfs.
|
||||
|
||||
When the code disables itself at runtime this is most likely because it ran
|
||||
out of dma_debug_entries. These entries are preallocated at boot. The number
|
||||
of preallocated entries is defined per architecture. If it is too low for you
|
||||
boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
|
||||
architectural default.
|
||||
out of dma_debug_entries and was unable to allocate more on-demand. 65536
|
||||
entries are preallocated at boot - if this is too low for you boot with
|
||||
'dma_debug_entries=<your_desired_number>' to overwrite the default. Note
|
||||
that the code allocates entries in batches, so the exact number of
|
||||
preallocated entries may be greater than the actual number requested. The
|
||||
code will print to the kernel log each time it has dynamically allocated
|
||||
as many entries as were initially preallocated. This is to indicate that a
|
||||
larger preallocation size may be appropriate, or if it happens continually
|
||||
that a driver may be leaking mappings.
|
||||
|
||||
::
|
||||
|
||||
|
|
|
|||
|
|
@ -52,8 +52,8 @@ Address translation
|
|||
-------------------
|
||||
|
||||
To translate the virtual address to a bus address, use the normal DMA
|
||||
API. Do _not_ use isa_virt_to_phys() even though it does the same
|
||||
thing. The reason for this is that the function isa_virt_to_phys()
|
||||
API. Do _not_ use isa_virt_to_bus() even though it does the same
|
||||
thing. The reason for this is that the function isa_virt_to_bus()
|
||||
will require a Kconfig dependency to ISA, not just ISA_DMA_API which
|
||||
is really all you need. Remember that even though the DMA controller
|
||||
has its origins in ISA it is used elsewhere.
|
||||
|
|
|
|||
|
|
@ -31,14 +31,13 @@
|
|||
#define YBLANK 38
|
||||
#define XOFFSET 8
|
||||
#define XPULSE 144
|
||||
#define YOFFSET (63+3)
|
||||
#define YPULSE (63+6)
|
||||
#define YOFFSET 3
|
||||
#define YPULSE 6
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux XGA"
|
||||
#define ESTABLISHED_TIMING2_BITS 0x08 /* Bit 3 -> 1024x768 @60 Hz */
|
||||
#define HSYNC_POL 0
|
||||
#define VSYNC_POL 0
|
||||
#define CRC 0x55
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -31,14 +31,13 @@
|
|||
#define YBLANK 42
|
||||
#define XOFFSET 48
|
||||
#define XPULSE 112
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+3)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 3
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux SXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0xa0
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -31,14 +31,13 @@
|
|||
#define YBLANK 50
|
||||
#define XOFFSET 64
|
||||
#define XPULSE 192
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+3)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 3
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux UXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x9d
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -31,14 +31,13 @@
|
|||
#define YBLANK 39
|
||||
#define XOFFSET 104
|
||||
#define XPULSE 176
|
||||
#define YOFFSET (63+3)
|
||||
#define YPULSE (63+6)
|
||||
#define YOFFSET 3
|
||||
#define YPULSE 6
|
||||
#define DPI 96
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux WSXGA"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x26
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -31,14 +31,13 @@
|
|||
#define YBLANK 45
|
||||
#define XOFFSET 88
|
||||
#define XPULSE 44
|
||||
#define YOFFSET (63+4)
|
||||
#define YPULSE (63+5)
|
||||
#define YOFFSET 4
|
||||
#define YPULSE 5
|
||||
#define DPI 96
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux FHD"
|
||||
/* No ESTABLISHED_TIMINGx_BITS */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0x05
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -28,14 +28,13 @@
|
|||
#define YBLANK 28
|
||||
#define XOFFSET 40
|
||||
#define XPULSE 128
|
||||
#define YOFFSET (63+1)
|
||||
#define YPULSE (63+4)
|
||||
#define YOFFSET 1
|
||||
#define YPULSE 4
|
||||
#define DPI 72
|
||||
#define VFREQ 60 /* Hz */
|
||||
#define TIMING_NAME "Linux SVGA"
|
||||
#define ESTABLISHED_TIMING1_BITS 0x01 /* Bit 0: 800x600 @ 60Hz */
|
||||
#define HSYNC_POL 1
|
||||
#define VSYNC_POL 1
|
||||
#define CRC 0xc2
|
||||
|
||||
#include "edid.S"
|
||||
|
|
|
|||
|
|
@ -45,14 +45,5 @@ EDID:
|
|||
|
||||
#define YPIX vdisp
|
||||
#define YBLANK vtotal-vdisp
|
||||
#define YOFFSET (63+(vsyncstart-vdisp))
|
||||
#define YPULSE (63+(vsyncend-vsyncstart))
|
||||
|
||||
The CRC value in the last line
|
||||
#define CRC 0x55
|
||||
also is a bit tricky. After a first version of the binary data set is
|
||||
created, it must be checked with the "edid-decode" utility which will
|
||||
most probably complain about a wrong CRC. Fortunately, the utility also
|
||||
displays the correct CRC which must then be inserted into the source
|
||||
file. After the make procedure is repeated, the EDID data set is ready
|
||||
to be used.
|
||||
#define YOFFSET vsyncstart-vdisp
|
||||
#define YPULSE vsyncend-vsyncstart
|
||||
|
|
|
|||
|
|
@ -15,10 +15,21 @@ clean:
|
|||
%.o: %.S
|
||||
@cc -c $^
|
||||
|
||||
%.bin: %.o
|
||||
%.bin.nocrc: %.o
|
||||
@objcopy -Obinary $^ $@
|
||||
|
||||
%.bin.ihex: %.o
|
||||
%.crc: %.bin.nocrc
|
||||
@list=$$(for i in `seq 1 127`; do head -c$$i $^ | tail -c1 \
|
||||
| hexdump -v -e '/1 "%02X+"'; done); \
|
||||
echo "ibase=16;100-($${list%?})%100" | bc >$@
|
||||
|
||||
%.p: %.crc %.S
|
||||
@cc -c -DCRC="$$(cat $*.crc)" -o $@ $*.S
|
||||
|
||||
%.bin: %.p
|
||||
@objcopy -Obinary $^ $@
|
||||
|
||||
%.bin.ihex: %.p
|
||||
@objcopy -Oihex $^ $@
|
||||
@dos2unix $@ 2>/dev/null
|
||||
|
||||
|
|
|
|||
|
|
@ -47,9 +47,11 @@
|
|||
#define mfgname2id(v1,v2,v3) \
|
||||
((((v1-'@')&0x1f)<<10)+(((v2-'@')&0x1f)<<5)+((v3-'@')&0x1f))
|
||||
#define swap16(v1) ((v1>>8)+((v1&0xff)<<8))
|
||||
#define lsbs2(v1,v2) (((v1&0x0f)<<4)+(v2&0x0f))
|
||||
#define msbs2(v1,v2) ((((v1>>8)&0x0f)<<4)+((v2>>8)&0x0f))
|
||||
#define msbs4(v1,v2,v3,v4) \
|
||||
(((v1&0x03)>>2)+((v2&0x03)>>4)+((v3&0x03)>>6)+((v4&0x03)>>8))
|
||||
((((v1>>8)&0x03)<<6)+(((v2>>8)&0x03)<<4)+\
|
||||
(((v3>>4)&0x03)<<2)+((v4>>4)&0x03))
|
||||
#define pixdpi2mm(pix,dpi) ((pix*25)/dpi)
|
||||
#define xsize pixdpi2mm(XPIX,DPI)
|
||||
#define ysize pixdpi2mm(YPIX,DPI)
|
||||
|
|
@ -200,9 +202,9 @@ y_msbs: .byte msbs2(YPIX,YBLANK)
|
|||
x_snc_off_lsb: .byte XOFFSET&0xff
|
||||
/* Horizontal sync pulse width pixels 8 lsbits (0-1023) */
|
||||
x_snc_pls_lsb: .byte XPULSE&0xff
|
||||
/* Bits 7-4 Vertical sync offset lines 4 lsbits -63)
|
||||
Bits 3-0 Vertical sync pulse width lines 4 lsbits -63) */
|
||||
y_snc_lsb: .byte ((YOFFSET-63)<<4)+(YPULSE-63)
|
||||
/* Bits 7-4 Vertical sync offset lines 4 lsbits (0-63)
|
||||
Bits 3-0 Vertical sync pulse width lines 4 lsbits (0-63) */
|
||||
y_snc_lsb: .byte lsbs2(YOFFSET, YPULSE)
|
||||
/* Bits 7-6 Horizontal sync offset pixels 2 msbits
|
||||
Bits 5-4 Horizontal sync pulse width pixels 2 msbits
|
||||
Bits 3-2 Vertical sync offset lines 2 msbits
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
subdir-y :=
|
||||
subdir-y := devicetree/bindings/
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXBUILD = sphinx-build
|
||||
|
|
|
|||
|
|
@ -1,26 +0,0 @@
|
|||
00-INDEX
|
||||
- this file
|
||||
acpi-info.txt
|
||||
- info on how PCI host bridges are represented in ACPI
|
||||
MSI-HOWTO.txt
|
||||
- the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ.
|
||||
PCIEBUS-HOWTO.txt
|
||||
- a guide describing the PCI Express Port Bus driver
|
||||
pci-error-recovery.txt
|
||||
- info on PCI error recovery
|
||||
pci-iov-howto.txt
|
||||
- the PCI Express I/O Virtualization HOWTO
|
||||
pci.txt
|
||||
- info on the PCI subsystem for device driver authors
|
||||
pcieaer-howto.txt
|
||||
- the PCI Express Advanced Error Reporting Driver Guide HOWTO
|
||||
endpoint/pci-endpoint.txt
|
||||
- guide to add endpoint controller driver and endpoint function driver.
|
||||
endpoint/pci-endpoint-cfs.txt
|
||||
- guide to use configfs to configure the PCI endpoint function.
|
||||
endpoint/pci-test-function.txt
|
||||
- specification of *PCI test* function device.
|
||||
endpoint/pci-test-howto.txt
|
||||
- userguide for PCI endpoint test function.
|
||||
endpoint/function/binding/
|
||||
- binding documentation for PCI endpoint function
|
||||
|
|
@ -99,17 +99,20 @@ Note that the devices listed here correspond to the value populated in 1.4 above
|
|||
2.2 Using Endpoint Test function Device
|
||||
|
||||
pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
|
||||
tests. Before pcitest.sh can be used pcitest.c should be compiled using the
|
||||
following commands.
|
||||
tests. To compile this tool the following commands should be used:
|
||||
|
||||
cd <kernel-dir>
|
||||
make headers_install ARCH=arm
|
||||
arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest
|
||||
cp pcitest <rootfs>/usr/sbin/
|
||||
cp tools/pci/pcitest.sh <rootfs>
|
||||
# cd <kernel-dir>
|
||||
# make -C tools/pci
|
||||
|
||||
or if you desire to compile and install in your system:
|
||||
|
||||
# cd <kernel-dir>
|
||||
# make -C tools/pci install
|
||||
|
||||
The tool and script will be located in <rootfs>/usr/bin/
|
||||
|
||||
2.2.1 pcitest.sh Output
|
||||
# ./pcitest.sh
|
||||
# pcitest.sh
|
||||
BAR tests
|
||||
|
||||
BAR0: OKAY
|
||||
|
|
|
|||
|
|
@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI error
|
|||
event will be platform-dependent, but will follow the general
|
||||
sequence described below.
|
||||
|
||||
STEP 0: Error Event: ERR_NONFATAL
|
||||
STEP 0: Error Event
|
||||
-------------------
|
||||
A PCI bus error is detected by the PCI hardware. On powerpc, the slot
|
||||
is isolated, in that all I/O is blocked: all reads return 0xffffffff,
|
||||
|
|
@ -228,7 +228,13 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
|
|||
If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
|
||||
proceeds to STEP 4 (Slot Reset).
|
||||
|
||||
STEP 3: Slot Reset
|
||||
STEP 3: Link Reset
|
||||
------------------
|
||||
The platform resets the link. This is a PCI-Express specific step
|
||||
and is done whenever a fatal error has been detected that can be
|
||||
"solved" by resetting the link.
|
||||
|
||||
STEP 4: Slot Reset
|
||||
------------------
|
||||
|
||||
In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
|
||||
|
|
@ -314,7 +320,7 @@ Failure).
|
|||
>>> However, it probably should.
|
||||
|
||||
|
||||
STEP 4: Resume Operations
|
||||
STEP 5: Resume Operations
|
||||
-------------------------
|
||||
The platform will call the resume() callback on all affected device
|
||||
drivers if all drivers on the segment have returned
|
||||
|
|
@ -326,7 +332,7 @@ a result code.
|
|||
At this point, if a new error happens, the platform will restart
|
||||
a new error recovery sequence.
|
||||
|
||||
STEP 5: Permanent Failure
|
||||
STEP 6: Permanent Failure
|
||||
-------------------------
|
||||
A "permanent failure" has occurred, and the platform cannot recover
|
||||
the device. The platform will call error_detected() with a
|
||||
|
|
@ -349,27 +355,6 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
|
|||
for additional detail on real-life experience of the causes of
|
||||
software errors.
|
||||
|
||||
STEP 0: Error Event: ERR_FATAL
|
||||
-------------------
|
||||
PCI bus error is detected by the PCI hardware. On powerpc, the slot is
|
||||
isolated, in that all I/O is blocked: all reads return 0xffffffff, all
|
||||
writes are ignored.
|
||||
|
||||
STEP 1: Remove devices
|
||||
--------------------
|
||||
Platform removes the devices depending on the error agent, it could be
|
||||
this port for all subordinates or upstream component (likely downstream
|
||||
port)
|
||||
|
||||
STEP 2: Reset link
|
||||
--------------------
|
||||
The platform resets the link. This is a PCI-Express specific step and is
|
||||
done whenever a fatal error has been detected that can be "solved" by
|
||||
resetting the link.
|
||||
|
||||
STEP 3: Re-enumerate the devices
|
||||
--------------------
|
||||
Initiates the re-enumeration.
|
||||
|
||||
Conclusion; General Remarks
|
||||
---------------------------
|
||||
|
|
|
|||
|
|
@ -1,34 +0,0 @@
|
|||
00-INDEX
|
||||
- This file
|
||||
arrayRCU.txt
|
||||
- Using RCU to Protect Read-Mostly Arrays
|
||||
checklist.txt
|
||||
- Review Checklist for RCU Patches
|
||||
listRCU.txt
|
||||
- Using RCU to Protect Read-Mostly Linked Lists
|
||||
lockdep.txt
|
||||
- RCU and lockdep checking
|
||||
lockdep-splat.txt
|
||||
- RCU Lockdep splats explained.
|
||||
NMI-RCU.txt
|
||||
- Using RCU to Protect Dynamic NMI Handlers
|
||||
rcu_dereference.txt
|
||||
- Proper care and feeding of return values from rcu_dereference()
|
||||
rcubarrier.txt
|
||||
- RCU and Unloadable Modules
|
||||
rculist_nulls.txt
|
||||
- RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
|
||||
rcuref.txt
|
||||
- Reference-count design for elements of lists/arrays protected by RCU
|
||||
rcu.txt
|
||||
- RCU Concepts
|
||||
RTFP.txt
|
||||
- List of RCU papers (bibliography) going back to 1980.
|
||||
stallwarn.txt
|
||||
- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
|
||||
torture.txt
|
||||
- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
|
||||
UP.txt
|
||||
- RCU on Uniprocessor Systems
|
||||
whatisRCU.txt
|
||||
- What is RCU?
|
||||
|
|
@ -1,499 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:26:09 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="5.7in"
|
||||
height="6.6in"
|
||||
viewBox="-44 -44 6838 7888"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreeClassicRCUBH.fig">
|
||||
<metadata
|
||||
id="metadata110">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs108">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3868"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
<marker
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow2Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3886"
|
||||
style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
transform="scale(0.6) rotate(180) translate(0,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="878"
|
||||
inkscape:window-height="1148"
|
||||
id="namedview106"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.3547758"
|
||||
inkscape:cx="256.5"
|
||||
inkscape:cy="297"
|
||||
inkscape:window-x="45"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect10" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect12" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect14" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle16" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle18" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle20" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle22" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle24" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle26" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle28" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle30" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle32" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3450 2350,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline34" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3450 3948,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline38" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect42" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5400 2250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline44" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect48" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect50" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect52" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect54" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect56" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="1650"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect58" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text60">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="1950"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text62">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2250"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text64">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text66">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text68">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text70">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text72">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text74">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text76">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text78">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text80">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text82">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text84">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text86">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text88">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text90">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text92">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5400 5250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline94" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,6600 4050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline98" />
|
||||
<!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,6600 1050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline102" />
|
||||
<!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 13 KiB |
|
|
@ -1,695 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:20:02 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="5.7in"
|
||||
height="8.6in"
|
||||
viewBox="-44 -44 6838 10288"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreeClassicRCUBHdyntick.fig">
|
||||
<metadata
|
||||
id="metadata166">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs164">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3924"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
<marker
|
||||
inkscape:stockid="Arrow2Lend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow2Lend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3936"
|
||||
style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
transform="scale(1.1) rotate(180) translate(1,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="845"
|
||||
inkscape:window-height="988"
|
||||
id="namedview162"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.0452196"
|
||||
inkscape:cx="256.5"
|
||||
inkscape:cy="387.00003"
|
||||
inkscape:window-x="356"
|
||||
inkscape:window-y="61"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect10" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8100 5688,5912 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline12" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790-->
|
||||
<polyline
|
||||
points="5714 6068 5704 5822 5598 6044 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline14" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9300 4486,7262 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline16" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140-->
|
||||
<polyline
|
||||
points="4514 7418 4506 7172 4396 7394 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline18" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1040,9300 1476,7262 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline20" />
|
||||
<!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140-->
|
||||
<polyline
|
||||
points="1504 7418 1496 7172 1386 7394 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline22" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2240,8100 2676,6062 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline24" />
|
||||
<!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940-->
|
||||
<polyline
|
||||
points="2704 6218 2696 5972 2586 6194 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline26" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect28" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect30" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3450 2350,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline32" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3450 3948,2590 "
|
||||
style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline36" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,6600 4050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline40" />
|
||||
<!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,6600 1050,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline44" />
|
||||
<!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5400 2250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline48" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,8100 2250,6364 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline52" />
|
||||
<!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,9300 1050,7564 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline56" />
|
||||
<!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9300 4050,7564 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline60" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8100 5250,6364 "
|
||||
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline64" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240-->
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle68" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle70" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="3900"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle72" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle74" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle76" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle78" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle80" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle82" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5100"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle84" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect86" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect88" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3450"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect90" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect92" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="6600"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect94" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="1650"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect96" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="9300"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect98" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1350"
|
||||
y="8100"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect100" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3000"
|
||||
y="9300"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect102" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4350"
|
||||
y="8100"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect104" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect106" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text108">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="1950"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text110">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2250"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text112">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text114">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text116">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text118">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="3750"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text120">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text122">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text124">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text126">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text128">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="5700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text130">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6000"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text132">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="6900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text134">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text136">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text138">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="9600"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text140">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="9900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text142">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="9600"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text144">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="9900"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text146">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text148">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text150">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text152">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text154">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text156">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5400 5250,4414 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline158" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290-->
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 19 KiB |
|
|
@ -1,741 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Creator: fig2dev Version 3.2 Patchlevel 5e -->
|
||||
|
||||
<!-- CreationDate: Wed Dec 9 17:32:59 2015 -->
|
||||
|
||||
<!-- Magnification: 2.000 -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="6.1in"
|
||||
height="8.9in"
|
||||
viewBox="-44 -44 7288 10738"
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="BigTreePreemptRCUBHdyntick.fig">
|
||||
<metadata
|
||||
id="metadata182">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<defs
|
||||
id="defs180">
|
||||
<marker
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
orient="auto"
|
||||
refY="0.0"
|
||||
refX="0.0"
|
||||
id="Arrow1Mend"
|
||||
style="overflow:visible;">
|
||||
<path
|
||||
id="path3940"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;"
|
||||
transform="scale(0.4) rotate(180) translate(10,0)" />
|
||||
</marker>
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1"
|
||||
objecttolerance="10"
|
||||
gridtolerance="10"
|
||||
guidetolerance="10"
|
||||
inkscape:pageopacity="0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:window-width="874"
|
||||
inkscape:window-height="1148"
|
||||
id="namedview178"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.2097379"
|
||||
inkscape:cx="274.5"
|
||||
inkscape:cy="400.49997"
|
||||
inkscape:window-x="946"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4" />
|
||||
<g
|
||||
style="stroke-width:.025in; fill:none"
|
||||
id="g4">
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="900"
|
||||
y="0"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect6" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1200"
|
||||
y="600"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect8" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="5400"
|
||||
y="4950"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect10" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="450"
|
||||
y="450"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect12" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="1050"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect14" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4950"
|
||||
y="5400"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect16" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8550 5688,6362 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline18" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240-->
|
||||
<polyline
|
||||
points="5714 6518 5704 6272 5598 6494 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline20" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 4486,7712 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline22" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590-->
|
||||
<polyline
|
||||
points="4514 7868 4506 7622 4396 7844 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline24" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1040,9750 1476,7712 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline26" />
|
||||
<!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590-->
|
||||
<polyline
|
||||
points="1504 7868 1496 7622 1386 7844 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline28" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2240,8550 2676,6512 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline30" />
|
||||
<!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390-->
|
||||
<polyline
|
||||
points="2704 6668 2696 6422 2586 6644 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline32" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 5682,6360 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline34" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 5736 6246-->
|
||||
<polyline
|
||||
points="5672 6518 5722 6276 5562 6466 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline36" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1010,9750 2642,6360 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline38" />
|
||||
<!-- Arrowhead on XXXpoint 1010 9750 - 2696 6246-->
|
||||
<polyline
|
||||
points="2632 6518 2682 6276 2522 6466 "
|
||||
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; "
|
||||
id="polyline40" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="900"
|
||||
width="6300"
|
||||
height="7350"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; "
|
||||
id="rect42" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="1500"
|
||||
width="5700"
|
||||
height="3750"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; "
|
||||
id="rect44" />
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1350,3900 2350,3040 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline46" />
|
||||
<!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4950,3900 3948,3040 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline50" />
|
||||
<!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,7050 4050,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline54" />
|
||||
<!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,7050 1050,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline58" />
|
||||
<!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,5850 2250,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline62" />
|
||||
<!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="2250,8550 2250,6814 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline66" />
|
||||
<!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="1050,9750 1050,8014 "
|
||||
style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline70" />
|
||||
<!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="4050,9750 4050,8014 "
|
||||
style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline74" />
|
||||
<!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890-->
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,8550 5250,6814 "
|
||||
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; "
|
||||
id="polyline78" />
|
||||
<!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690-->
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="2850"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle82" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3150"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle84" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="3450"
|
||||
cy="4350"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle86" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1350"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle88" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1650"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle90" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="1950"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle92" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4350"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle94" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4650"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle96" />
|
||||
<!-- Circle -->
|
||||
<circle
|
||||
cx="4950"
|
||||
cy="5550"
|
||||
r="76"
|
||||
style="fill:#000000;stroke:#000000;stroke-width:14;"
|
||||
id="circle98" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="750"
|
||||
y="3900"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect100" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="300"
|
||||
y="7050"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect102" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3750"
|
||||
y="3900"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect104" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4500"
|
||||
y="5850"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect106" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3300"
|
||||
y="7050"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect108" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="2250"
|
||||
y="2100"
|
||||
width="1800"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; "
|
||||
id="rect110" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="0"
|
||||
y="9750"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect112" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1350"
|
||||
y="8550"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect114" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="3000"
|
||||
y="9750"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect116" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="4350"
|
||||
y="8550"
|
||||
width="2100"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; "
|
||||
id="rect118" />
|
||||
<!-- Line: box -->
|
||||
<rect
|
||||
x="1500"
|
||||
y="5850"
|
||||
width="1500"
|
||||
height="900"
|
||||
rx="0"
|
||||
style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; "
|
||||
id="rect120" />
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6450"
|
||||
y="750"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text122">rcu_bh</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2400"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text124">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="3150"
|
||||
y="2700"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text126">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text128">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1650"
|
||||
y="4500"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text130">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4500"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text132">rcu_node</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4650"
|
||||
y="4200"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text134">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text136">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2250"
|
||||
y="6450"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text138">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text140">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="7650"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text142">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text144">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5250"
|
||||
y="6450"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text146">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text148">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="7650"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text150">rcu_data</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="450"
|
||||
y="1800"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="start"
|
||||
id="text152">struct rcu_state</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="10050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text154">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="1050"
|
||||
y="10350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text156">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="10050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text158">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="4050"
|
||||
y="10350"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text160">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="8850"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text162">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="2400"
|
||||
y="9150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text164">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="8850"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text166">struct</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="5400"
|
||||
y="9150"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="192"
|
||||
text-anchor="middle"
|
||||
id="text168">rcu_dynticks</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6900"
|
||||
y="300"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text170">rcu_preempt</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="6000"
|
||||
y="1200"
|
||||
fill="#000000"
|
||||
font-family="Helvetica"
|
||||
font-style="normal"
|
||||
font-weight="normal"
|
||||
font-size="192"
|
||||
text-anchor="end"
|
||||
id="text172">rcu_sched</text>
|
||||
<!-- Line -->
|
||||
<polyline
|
||||
points="5250,5850 5250,4864 "
|
||||
style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)"
|
||||
id="polyline174" />
|
||||
<!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740-->
|
||||
</g>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 20 KiB |
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 22 KiB |
|
|
@ -23,8 +23,6 @@ to each other.
|
|||
The <tt>rcu_segcblist</tt> Structure</a>
|
||||
<li> <a href="#The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a>
|
||||
<li> <a href="#The rcu_dynticks Structure">
|
||||
The <tt>rcu_dynticks</tt> Structure</a>
|
||||
<li> <a href="#The rcu_head Structure">
|
||||
The <tt>rcu_head</tt> Structure</a>
|
||||
<li> <a href="#RCU-Specific Fields in the task_struct Structure">
|
||||
|
|
@ -127,9 +125,11 @@ CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows:
|
|||
</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system
|
||||
accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for
|
||||
32-bit systems.
|
||||
On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be
|
||||
as small as 2 if you wish, which would permit only 16 CPUs, which
|
||||
is useful for testing.
|
||||
On the other hand, you can set both <tt>CONFIG_RCU_FANOUT</tt> and
|
||||
<tt>CONFIG_RCU_FANOUT_LEAF</tt> to be as small as 2, which would result
|
||||
in a 16-CPU test using a 4-level tree.
|
||||
This can be useful for testing large-system capabilities on small test
|
||||
machines.
|
||||
|
||||
</p><p>This multi-level combining tree allows us to get most of the
|
||||
performance and scalability
|
||||
|
|
@ -154,44 +154,9 @@ on that root <tt>rcu_node</tt> structure remains acceptably low.
|
|||
keeping lock contention under control at all tree levels regardless
|
||||
of the level of loading on the system.
|
||||
|
||||
</p><p>The Linux kernel actually supports multiple flavors of RCU
|
||||
running concurrently, so RCU builds separate data structures for each
|
||||
flavor.
|
||||
For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides
|
||||
rcu_sched and rcu_bh, as shown below:
|
||||
|
||||
</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%">
|
||||
|
||||
</p><p>Energy efficiency is increasingly important, and for that
|
||||
reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which
|
||||
turns off the scheduling-clock interrupts on idle CPUs, which in
|
||||
turn allows those CPUs to attain deeper sleep states and to consume
|
||||
less energy.
|
||||
CPUs whose scheduling-clock interrupts have been turned off are
|
||||
said to be in <i>dyntick-idle mode</i>.
|
||||
RCU must handle dyntick-idle CPUs specially
|
||||
because RCU would otherwise wake up each CPU on every grace period,
|
||||
which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>.
|
||||
RCU uses the <tt>rcu_dynticks</tt> structure to track
|
||||
which CPUs are in dyntick idle mode, as shown below:
|
||||
|
||||
</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%">
|
||||
|
||||
</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode
|
||||
for all flavors of RCU.
|
||||
Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per
|
||||
CPU, and all of a given CPU's <tt>rcu_data</tt> structures share
|
||||
that <tt>rcu_dynticks</tt>, as shown in the figure.
|
||||
|
||||
</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
|
||||
rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
|
||||
|
||||
</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
|
||||
|
||||
</p><p>RCU updaters wait for normal grace periods by registering
|
||||
RCU callbacks, either directly via <tt>call_rcu()</tt> and
|
||||
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>,
|
||||
there being a separate interface per flavor of RCU)
|
||||
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
||||
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
||||
which are queued on <tt>rcu_data</tt> structures while they are
|
||||
|
|
@ -214,9 +179,6 @@ its own synchronization:
|
|||
<li> Each <tt>rcu_node</tt> structure has a spinlock.
|
||||
<li> The fields in <tt>rcu_data</tt> are private to the corresponding
|
||||
CPU, although a few can be read and written by other CPUs.
|
||||
<li> Similarly, the fields in <tt>rcu_dynticks</tt> are private
|
||||
to the corresponding CPU, although a few can be read by
|
||||
other CPUs.
|
||||
</ol>
|
||||
|
||||
<p>It is important to note that different data structures can have
|
||||
|
|
@ -272,11 +234,6 @@ follows:
|
|||
access to this information from the corresponding CPU.
|
||||
Finally, this structure records past dyntick-idle state
|
||||
for the corresponding CPU and also tracks statistics.
|
||||
<li> <tt>rcu_dynticks</tt>:
|
||||
This per-CPU structure tracks the current dyntick-idle
|
||||
state for the corresponding CPU.
|
||||
Unlike the other three structures, the <tt>rcu_dynticks</tt>
|
||||
structure is not replicated per RCU flavor.
|
||||
<li> <tt>rcu_head</tt>:
|
||||
This structure represents RCU callbacks, and is the
|
||||
only structure allocated and managed by RCU users.
|
||||
|
|
@ -287,14 +244,14 @@ follows:
|
|||
<p>If all you wanted from this article was a general notion of how
|
||||
RCU's data structures are related, you are done.
|
||||
Otherwise, each of the following sections gives more details on
|
||||
the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>,
|
||||
and <tt>rcu_dynticks</tt> data structures.
|
||||
the <tt>rcu_state</tt>, <tt>rcu_node</tt> and <tt>rcu_data</tt> data
|
||||
structures.
|
||||
|
||||
<h3><a name="The rcu_state Structure">
|
||||
The <tt>rcu_state</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_state</tt> structure is the base structure that
|
||||
represents a flavor of RCU.
|
||||
represents the state of RCU in the system.
|
||||
This structure forms the interconnection between the
|
||||
<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
|
||||
tracks grace periods, contains the lock used to
|
||||
|
|
@ -389,7 +346,7 @@ sequence number.
|
|||
The bottom two bits are the state of the current grace period,
|
||||
which can be zero for not yet started or one for in progress.
|
||||
In other words, if the bottom two bits of <tt>->gp_seq</tt> are
|
||||
zero, the corresponding flavor of RCU is idle.
|
||||
zero, then RCU is idle.
|
||||
Any other value in the bottom two bits indicates that something is broken.
|
||||
This field is protected by the root <tt>rcu_node</tt> structure's
|
||||
<tt>->lock</tt> field.
|
||||
|
|
@ -419,10 +376,10 @@ as follows:
|
|||
grace period in jiffies.
|
||||
It is protected by the root <tt>rcu_node</tt>'s <tt>->lock</tt>.
|
||||
|
||||
<p>The <tt>->name</tt> field points to the name of the RCU flavor
|
||||
(for example, “rcu_sched”), and is constant.
|
||||
The <tt>->abbr</tt> field contains a one-character abbreviation,
|
||||
for example, “s” for RCU-sched.
|
||||
<p>The <tt>->name</tt> and <tt>->abbr</tt> fields distinguish
|
||||
between preemptible RCU (“rcu_preempt” and “p”)
|
||||
and non-preemptible RCU (“rcu_sched” and “s”).
|
||||
These fields are used for diagnostic and tracing purposes.
|
||||
|
||||
<h3><a name="The rcu_node Structure">
|
||||
The <tt>rcu_node</tt> Structure</a></h3>
|
||||
|
|
@ -971,25 +928,31 @@ this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt>
|
|||
pointer.
|
||||
The reason for this is that all the ready-to-invoke callbacks
|
||||
(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
|
||||
all at once at callback-invocation time.
|
||||
all at once at callback-invocation time (<tt>rcu_do_batch</tt>), due
|
||||
to which <tt>->head</tt> may be set to NULL if there are no not-done
|
||||
callbacks remaining in the <tt>rcu_segcblist</tt>.
|
||||
If callback invocation must be postponed, for example, because a
|
||||
high-priority process just woke up on this CPU, then the remaining
|
||||
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
|
||||
Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||
are adjusted after the corresponding callbacks have been invoked, and so
|
||||
again it is the <tt>->len</tt> count that accurately reflects whether
|
||||
or not there are callbacks associated with this <tt>rcu_segcblist</tt>
|
||||
structure.
|
||||
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment and
|
||||
<tt>->head</tt> once again points to the start of the segment.
|
||||
In short, the head field can briefly be <tt>NULL</tt> even though the
|
||||
CPU has callbacks present the entire time.
|
||||
Therefore, it is not appropriate to test the <tt>->head</tt> pointer
|
||||
for <tt>NULL</tt>.
|
||||
|
||||
<p>In contrast, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||
are adjusted only after the corresponding callbacks have been invoked.
|
||||
This means that the <tt>->len</tt> count is zero only if
|
||||
the <tt>rcu_segcblist</tt> structure really is devoid of callbacks.
|
||||
Of course, off-CPU sampling of the <tt>->len</tt> count requires
|
||||
the use of appropriate synchronization, for example, memory barriers.
|
||||
careful use of appropriate synchronization, for example, memory barriers.
|
||||
This synchronization can be a bit subtle, particularly in the case
|
||||
of <tt>rcu_barrier()</tt>.
|
||||
|
||||
<h3><a name="The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_data</tt> maintains the per-CPU state for the
|
||||
corresponding flavor of RCU.
|
||||
<p>The <tt>rcu_data</tt> structure maintains the per-CPU state for the RCU subsystem.
|
||||
The fields in this structure may be accessed only from the corresponding
|
||||
CPU (and from tracing) unless otherwise stated.
|
||||
This structure is the
|
||||
|
|
@ -1015,30 +978,19 @@ as follows:
|
|||
|
||||
<pre>
|
||||
1 int cpu;
|
||||
2 struct rcu_state *rsp;
|
||||
3 struct rcu_node *mynode;
|
||||
4 struct rcu_dynticks *dynticks;
|
||||
5 unsigned long grpmask;
|
||||
6 bool beenonline;
|
||||
2 struct rcu_node *mynode;
|
||||
3 unsigned long grpmask;
|
||||
4 bool beenonline;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->cpu</tt> field contains the number of the
|
||||
corresponding CPU, the <tt>->rsp</tt> pointer references
|
||||
the corresponding <tt>rcu_state</tt> structure (and is most frequently
|
||||
used to locate the name of the corresponding flavor of RCU for tracing),
|
||||
and the <tt>->mynode</tt> field references the corresponding
|
||||
<tt>rcu_node</tt> structure.
|
||||
corresponding CPU and the <tt>->mynode</tt> field references the
|
||||
corresponding <tt>rcu_node</tt> structure.
|
||||
The <tt>->mynode</tt> is used to propagate quiescent states
|
||||
up the combining tree.
|
||||
<p>The <tt>->dynticks</tt> pointer references the
|
||||
<tt>rcu_dynticks</tt> structure corresponding to this
|
||||
CPU.
|
||||
Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
|
||||
structure is shared among all flavors of RCU.
|
||||
These first four fields are constant and therefore require not
|
||||
synchronization.
|
||||
These two fields are constant and therefore do not require synchronization.
|
||||
|
||||
</p><p>The <tt>->grpmask</tt> field indicates the bit in
|
||||
<p>The <tt>->grpmask</tt> field indicates the bit in
|
||||
the <tt>->mynode->qsmask</tt> corresponding to this
|
||||
<tt>rcu_data</tt> structure, and is also used when propagating
|
||||
quiescent states.
|
||||
|
|
@ -1057,12 +1009,12 @@ as follows:
|
|||
3 bool cpu_no_qs;
|
||||
4 bool core_needs_qs;
|
||||
5 bool gpwrap;
|
||||
6 unsigned long rcu_qs_ctr_snap;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->gp_seq</tt> and <tt>->gp_seq_needed</tt>
|
||||
fields are the counterparts of the fields of the same name
|
||||
in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
|
||||
<p>The <tt>->gp_seq</tt> field is the counterpart of the field of the same
|
||||
name in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures. The
|
||||
<tt>->gp_seq_needed</tt> field is the counterpart of the field of the same
|
||||
name in the <tt>rcu_node</tt> structure.
|
||||
They may each lag up to one behind their <tt>rcu_node</tt>
|
||||
counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
|
||||
<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
|
||||
|
|
@ -1103,10 +1055,6 @@ CPU has remained idle for so long that the
|
|||
<tt>gp_seq</tt> counter is in danger of overflow, which
|
||||
will cause the CPU to disregard the values of its counters on
|
||||
its next exit from idle.
|
||||
Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect
|
||||
cases where a given operation has resulted in a quiescent state
|
||||
for all flavors of RCU, for example, <tt>cond_resched()</tt>
|
||||
when RCU has indicated a need for quiescent states.
|
||||
|
||||
<h5>RCU Callback Handling</h5>
|
||||
|
||||
|
|
@ -1179,26 +1127,22 @@ Finally, the <tt>->dynticks_fqs</tt> field is used to
|
|||
count the number of times this CPU is determined to be in
|
||||
dyntick-idle state, and is used for tracing and debugging purposes.
|
||||
|
||||
<h3><a name="The rcu_dynticks Structure">
|
||||
The <tt>rcu_dynticks</tt> Structure</a></h3>
|
||||
|
||||
<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state
|
||||
for the corresponding CPU.
|
||||
Unlike the other structures, <tt>rcu_dynticks</tt> is not
|
||||
replicated over the different flavors of RCU.
|
||||
The fields in this structure may be accessed only from the corresponding
|
||||
CPU (and from tracing) unless otherwise stated.
|
||||
Its fields are as follows:
|
||||
<p>
|
||||
This portion of the rcu_data structure is declared as follows:
|
||||
|
||||
<pre>
|
||||
1 long dynticks_nesting;
|
||||
2 long dynticks_nmi_nesting;
|
||||
3 atomic_t dynticks;
|
||||
4 bool rcu_need_heavy_qs;
|
||||
5 unsigned long rcu_qs_ctr;
|
||||
6 bool rcu_urgent_qs;
|
||||
5 bool rcu_urgent_qs;
|
||||
</pre>
|
||||
|
||||
<p>These fields in the rcu_data structure maintain the per-CPU dyntick-idle
|
||||
state for the corresponding CPU.
|
||||
The fields may be accessed only from the corresponding CPU (and from tracing)
|
||||
unless otherwise stated.
|
||||
|
||||
<p>The <tt>->dynticks_nesting</tt> field counts the
|
||||
nesting depth of process execution, so that in normal circumstances
|
||||
this counter has value zero or one.
|
||||
|
|
@ -1227,9 +1171,11 @@ to overflow the counter, this approach corrects the
|
|||
CPU enters the idle loop from process context.
|
||||
|
||||
</p><p>The <tt>->dynticks</tt> field counts the corresponding
|
||||
CPU's transitions to and from dyntick-idle mode, so that this counter
|
||||
has an even value when the CPU is in dyntick-idle mode and an odd
|
||||
value otherwise.
|
||||
CPU's transitions to and from either dyntick-idle or user mode, so
|
||||
that this counter has an even value when the CPU is in dyntick-idle
|
||||
mode or user mode and an odd value otherwise. The transitions to/from
|
||||
user mode need to be counted for user mode adaptive-ticks support
|
||||
(see timers/NO_HZ.txt).
|
||||
|
||||
</p><p>The <tt>->rcu_need_heavy_qs</tt> field is used
|
||||
to record the fact that the RCU core code would really like to
|
||||
|
|
@ -1238,19 +1184,12 @@ it is willing to call for heavy-weight dyntick-counter operations.
|
|||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which provide a momentary idle sojourn in response.
|
||||
|
||||
</p><p>The <tt>->rcu_qs_ctr</tt> field is used to record
|
||||
quiescent states from <tt>cond_resched()</tt>.
|
||||
Because <tt>cond_resched()</tt> can execute quite frequently, this
|
||||
must be quite lightweight, as in a non-atomic increment of this
|
||||
per-CPU field.
|
||||
|
||||
</p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record
|
||||
the fact that the RCU core code would really like to see a quiescent
|
||||
state from the corresponding CPU, with the various other fields indicating
|
||||
just how badly RCU wants this quiescent state.
|
||||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt>
|
||||
in response.
|
||||
the fact that the RCU core code would really like to see a quiescent state from
|
||||
the corresponding CPU, with the various other fields indicating just how badly
|
||||
RCU wants this quiescent state.
|
||||
This flag is checked by RCU's context-switch path
|
||||
(<tt>rcu_note_context_switch()</tt>) and the <tt>cond_resched()</tt> code.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
|
|
@ -1372,8 +1311,7 @@ that is, if the CPU is currently idle.
|
|||
Accessor Functions</a></h3>
|
||||
|
||||
<p>The following listing shows the
|
||||
<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first</tt>,
|
||||
<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and
|
||||
<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first()</tt>, and
|
||||
<tt>rcu_for_each_leaf_node()</tt> function and macros:
|
||||
|
||||
<pre>
|
||||
|
|
@ -1386,13 +1324,9 @@ Accessor Functions</a></h3>
|
|||
7 for ((rnp) = &(rsp)->node[0]; \
|
||||
8 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
|
||||
9
|
||||
10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
|
||||
11 for ((rnp) = &(rsp)->node[0]; \
|
||||
12 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
|
||||
13
|
||||
14 #define rcu_for_each_leaf_node(rsp, rnp) \
|
||||
15 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
|
||||
16 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
|
||||
10 #define rcu_for_each_leaf_node(rsp, rnp) \
|
||||
11 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
|
||||
12 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
|
||||
</pre>
|
||||
|
||||
<p>The <tt>rcu_get_root()</tt> simply returns a pointer to the
|
||||
|
|
@ -1405,10 +1339,7 @@ macro takes advantage of the layout of the <tt>rcu_node</tt>
|
|||
structures in the <tt>rcu_state</tt> structure's
|
||||
<tt>->node[]</tt> array, performing a breadth-first traversal by
|
||||
simply traversing the array in order.
|
||||
The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates
|
||||
similarly, but traverses only the first part of the array, thus excluding
|
||||
the leaf <tt>rcu_node</tt> structures.
|
||||
Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only
|
||||
Similarly, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only
|
||||
the last part of the array, thus traversing only the leaf
|
||||
<tt>rcu_node</tt> structures.
|
||||
|
||||
|
|
@ -1416,15 +1347,14 @@ the last part of the array, thus traversing only the leaf
|
|||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and
|
||||
What does
|
||||
<tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree
|
||||
contains only a single node?
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
In the single-node case,
|
||||
<tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op
|
||||
and <tt>rcu_for_each_leaf_node()</tt> traverses the single node.
|
||||
<tt>rcu_for_each_leaf_node()</tt> traverses the single node.
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
|
@ -1432,11 +1362,11 @@ the last part of the array, thus traversing only the leaf
|
|||
<h3><a name="Summary">
|
||||
Summary</a></h3>
|
||||
|
||||
So each flavor of RCU is represented by an <tt>rcu_state</tt> structure,
|
||||
So the state of RCU is represented by an <tt>rcu_state</tt> structure,
|
||||
which contains a combining tree of <tt>rcu_node</tt> and
|
||||
<tt>rcu_data</tt> structures.
|
||||
Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle
|
||||
state is tracked by an <tt>rcu_dynticks</tt> structure.
|
||||
state is tracked by dynticks-related fields in the <tt>rcu_data</tt> structure.
|
||||
|
||||
If you made it this far, you are well prepared to read the code
|
||||
walkthroughs in the other articles in this series.
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 20 KiB |
|
|
@ -328,13 +328,13 @@
|
|||
inkscape:window-height="1148"
|
||||
id="namedview90"
|
||||
showgrid="true"
|
||||
inkscape:zoom="0.80021373"
|
||||
inkscape:cx="462.49289"
|
||||
inkscape:cy="473.6718"
|
||||
inkscape:zoom="0.69092787"
|
||||
inkscape:cx="476.34085"
|
||||
inkscape:cy="712.80957"
|
||||
inkscape:window-x="770"
|
||||
inkscape:window-y="24"
|
||||
inkscape:window-maximized="0"
|
||||
inkscape:current-layer="g4114-9-3-9"
|
||||
inkscape:current-layer="g4"
|
||||
inkscape:snap-grids="false"
|
||||
fit-margin-top="5"
|
||||
fit-margin-right="5"
|
||||
|
|
@ -813,14 +813,18 @@
|
|||
<text
|
||||
sodipodi:linespacing="125%"
|
||||
id="text4110-5-7-6-2-4-0"
|
||||
y="841.88086"
|
||||
y="670.74316"
|
||||
x="1460.1007"
|
||||
style="font-size:267.24359131px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
|
||||
xml:space="preserve"><tspan
|
||||
y="841.88086"
|
||||
y="670.74316"
|
||||
x="1460.1007"
|
||||
sodipodi:role="line"
|
||||
id="tspan4925-1-2-4-5">reched_cpu()</tspan></text>
|
||||
id="tspan4925-1-2-4-5">Request</tspan><tspan
|
||||
y="1004.7976"
|
||||
x="1460.1007"
|
||||
sodipodi:role="line"
|
||||
id="tspan3100">context switch</tspan></text>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
|
|
@ -12,10 +12,9 @@ high efficiency and minimal disturbance, expedited grace periods accept
|
|||
lower efficiency and significant disturbance to attain shorter latencies.
|
||||
|
||||
<p>
|
||||
There are three flavors of RCU (RCU-bh, RCU-preempt, and RCU-sched),
|
||||
but only two flavors of expedited grace periods because the RCU-bh
|
||||
expedited grace period maps onto the RCU-sched expedited grace period.
|
||||
Each of the remaining two implementations is covered in its own section.
|
||||
There are two flavors of RCU (RCU-preempt and RCU-sched), with an earlier
|
||||
third RCU-bh flavor having been implemented in terms of the other two.
|
||||
Each of the two implementations is covered in its own section.
|
||||
|
||||
<ol>
|
||||
<li> <a href="#Expedited Grace Period Design">
|
||||
|
|
@ -73,10 +72,10 @@ will ignore it because idle and offline CPUs are already residing
|
|||
in quiescent states.
|
||||
Otherwise, the expedited grace period will use
|
||||
<tt>smp_call_function_single()</tt> to send the CPU an IPI, which
|
||||
is handled by <tt>sync_rcu_exp_handler()</tt>.
|
||||
is handled by <tt>rcu_exp_handler()</tt>.
|
||||
|
||||
<p>
|
||||
However, because this is preemptible RCU, <tt>sync_rcu_exp_handler()</tt>
|
||||
However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt>
|
||||
can check to see if the CPU is currently running in an RCU read-side
|
||||
critical section.
|
||||
If not, the handler can immediately report a quiescent state.
|
||||
|
|
@ -146,24 +145,23 @@ expedited grace period is shown in the following diagram:
|
|||
<p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%">
|
||||
|
||||
<p>
|
||||
As with RCU-preempt's <tt>synchronize_rcu_expedited()</tt>,
|
||||
As with RCU-preempt, RCU-sched's
|
||||
<tt>synchronize_sched_expedited()</tt> ignores offline and
|
||||
idle CPUs, again because they are in remotely detectable
|
||||
quiescent states.
|
||||
However, the <tt>synchronize_rcu_expedited()</tt> handler
|
||||
is <tt>sync_sched_exp_handler()</tt>, and because the
|
||||
However, because the
|
||||
<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
|
||||
leave no trace of their invocation, in general it is not possible to tell
|
||||
whether or not the current CPU is in an RCU read-side critical section.
|
||||
The best that <tt>sync_sched_exp_handler()</tt> can do is to check
|
||||
The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check
|
||||
for idle, on the off-chance that the CPU went idle while the IPI
|
||||
was in flight.
|
||||
If the CPU is idle, then tt>sync_sched_exp_handler()</tt> reports
|
||||
If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports
|
||||
the quiescent state.
|
||||
|
||||
<p>
|
||||
Otherwise, the handler invokes <tt>resched_cpu()</tt>, which forces
|
||||
a future context switch.
|
||||
<p> Otherwise, the handler forces a future context switch by setting the
|
||||
NEED_RESCHED flag in the current task's thread flags and in the CPU preempt
|
||||
counter.
|
||||
At the time of the context switch, the CPU reports the quiescent state.
|
||||
Should the CPU go offline first, it will report the quiescent state
|
||||
at that time.
|
||||
|
|
@ -299,19 +297,18 @@ Instead, the task pushing the grace period forward will include the
|
|||
idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>.
|
||||
|
||||
<p>
|
||||
For RCU-sched, there is an additional check for idle in the IPI
|
||||
handler, <tt>sync_sched_exp_handler()</tt>.
|
||||
For RCU-sched, there is an additional check:
|
||||
If the IPI has interrupted the idle loop, then
|
||||
<tt>sync_sched_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
|
||||
<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt>
|
||||
to report the corresponding quiescent state.
|
||||
|
||||
<p>
|
||||
For RCU-preempt, there is no specific check for idle in the
|
||||
IPI handler (<tt>sync_rcu_exp_handler()</tt>), but because
|
||||
IPI handler (<tt>rcu_exp_handler()</tt>), but because
|
||||
RCU read-side critical sections are not permitted within the
|
||||
idle loop, if <tt>sync_rcu_exp_handler()</tt> sees that the CPU is within
|
||||
idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within
|
||||
an RCU read-side critical section, the CPU cannot possibly be idle.
|
||||
Otherwise, <tt>sync_rcu_exp_handler()</tt> invokes
|
||||
Otherwise, <tt>rcu_exp_handler()</tt> invokes
|
||||
<tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent
|
||||
state, regardless of whether or not that quiescent state was due to
|
||||
the CPU being idle.
|
||||
|
|
@ -626,6 +623,8 @@ checks, but only during the mid-boot dead zone.
|
|||
<p>
|
||||
With this refinement, synchronous grace periods can now be used from
|
||||
task context pretty much any time during the life of the kernel.
|
||||
That is, aside from some points in the suspend, hibernate, or shutdown
|
||||
code path.
|
||||
|
||||
<h3><a name="Summary">
|
||||
Summary</a></h3>
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ The key point is that the lock-acquisition functions, including
|
|||
<tt>smp_mb__after_unlock_lock()</tt> immediately after successful
|
||||
acquisition of the lock.
|
||||
|
||||
<p>Therefore, for any given <tt>rcu_node</tt> struction, any access
|
||||
<p>Therefore, for any given <tt>rcu_node</tt> structure, any access
|
||||
happening before one of the above lock-release functions will be seen
|
||||
by all CPUs as happening before any access happening after a later
|
||||
one of the above lock-acquisition functions.
|
||||
|
|
@ -485,13 +485,13 @@ section that the grace period must wait on.
|
|||
noted by <tt>rcu_note_context_switch()</tt> on the left.
|
||||
On the other hand, if the CPU takes a scheduler-clock interrupt
|
||||
while executing in usermode, a quiescent state will be noted by
|
||||
<tt>rcu_check_callbacks()</tt> on the right.
|
||||
<tt>rcu_sched_clock_irq()</tt> on the right.
|
||||
Either way, the passage through a quiescent state will be noted
|
||||
in a per-CPU variable.
|
||||
|
||||
<p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on
|
||||
this CPU (for example, after the next scheduler-clock
|
||||
interrupt), <tt>__rcu_process_callbacks()</tt> will invoke
|
||||
interrupt), <tt>rcu_core()</tt> will invoke
|
||||
<tt>rcu_check_quiescent_state()</tt>, which will notice the
|
||||
recorded quiescent state, and invoke
|
||||
<tt>rcu_report_qs_rdp()</tt>.
|
||||
|
|
@ -651,7 +651,7 @@ to end.
|
|||
These callbacks are identified by <tt>rcu_advance_cbs()</tt>,
|
||||
which is usually invoked by <tt>__note_gp_changes()</tt>.
|
||||
As shown in the diagram below, this invocation can be triggered by
|
||||
the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on
|
||||
the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on
|
||||
the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on
|
||||
the right, but only for kernels built with
|
||||
<tt>CONFIG_RCU_FAST_NO_HZ=y</tt>).
|
||||
|
|
|
|||
|
|
@ -349,7 +349,7 @@
|
|||
font-weight="bold"
|
||||
font-size="192"
|
||||
id="text202-7-5"
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
|
||||
<rect
|
||||
x="7069.6187"
|
||||
y="5087.4678"
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
|
|
@ -3902,7 +3902,7 @@
|
|||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_callbacks()</text>
|
||||
xml:space="preserve">rcu_sched_clock_irq()</text>
|
||||
</g>
|
||||
<g
|
||||
transform="translate(-850.30204,55463.106)"
|
||||
|
|
@ -3924,7 +3924,7 @@
|
|||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_process_callbacks()</text>
|
||||
xml:space="preserve">rcu_core()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0"
|
||||
|
|
@ -3933,7 +3933,7 @@
|
|||
font-style="normal"
|
||||
y="-4165.7954"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_quiescent_state())</text>
|
||||
xml:space="preserve">rcu_check_quiescent_state()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0-9"
|
||||
|
|
@ -4968,7 +4968,7 @@
|
|||
font-weight="bold"
|
||||
font-size="192"
|
||||
id="text202-7-5-19"
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text>
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text>
|
||||
<rect
|
||||
x="5314.2671"
|
||||
y="82817.688"
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 209 KiB After Width: | Height: | Size: 209 KiB |
|
|
@ -775,7 +775,7 @@
|
|||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_callbacks()</text>
|
||||
xml:space="preserve">rcu_sched_clock_irq()</text>
|
||||
</g>
|
||||
<g
|
||||
transform="translate(399.7744,828.86448)"
|
||||
|
|
@ -797,7 +797,7 @@
|
|||
font-style="normal"
|
||||
y="-4418.6582"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_process_callbacks()</text>
|
||||
xml:space="preserve">rcu_core()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0"
|
||||
|
|
@ -806,7 +806,7 @@
|
|||
font-style="normal"
|
||||
y="-4165.7954"
|
||||
x="3745.7725"
|
||||
xml:space="preserve">rcu_check_quiescent_state())</text>
|
||||
xml:space="preserve">rcu_check_quiescent_state()</text>
|
||||
<text
|
||||
style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier"
|
||||
id="text202-7-5-3-27-0-9"
|
||||
|
|
|
|||
|
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 43 KiB |
|
|
@ -900,8 +900,6 @@ Except where otherwise noted, these non-guarantees were premeditated.
|
|||
Grace Periods Don't Partition Read-Side Critical Sections</a>
|
||||
<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
|
||||
Read-Side Critical Sections Don't Partition Grace Periods</a>
|
||||
<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
|
||||
Disabling Preemption Does Not Block Grace Periods</a>
|
||||
</ol>
|
||||
|
||||
<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
|
||||
|
|
@ -1259,56 +1257,6 @@ of RCU grace periods.
|
|||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
||||
<h3><a name="Disabling Preemption Does Not Block Grace Periods">
|
||||
Disabling Preemption Does Not Block Grace Periods</a></h3>
|
||||
|
||||
<p>
|
||||
There was a time when disabling preemption on any given CPU would block
|
||||
subsequent grace periods.
|
||||
However, this was an accident of implementation and is not a requirement.
|
||||
And in the current Linux-kernel implementation, disabling preemption
|
||||
on a given CPU in fact does not block grace periods, as Oleg Nesterov
|
||||
<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
|
||||
|
||||
<p>
|
||||
If you need a preempt-disable region to block grace periods, you need to add
|
||||
<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
|
||||
as follows:
|
||||
|
||||
<blockquote>
|
||||
<pre>
|
||||
1 preempt_disable();
|
||||
2 rcu_read_lock();
|
||||
3 do_something();
|
||||
4 rcu_read_unlock();
|
||||
5 preempt_enable();
|
||||
6
|
||||
7 /* Spinlocks implicitly disable preemption. */
|
||||
8 spin_lock(&mylock);
|
||||
9 rcu_read_lock();
|
||||
10 do_something();
|
||||
11 rcu_read_unlock();
|
||||
12 spin_unlock(&mylock);
|
||||
</pre>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
In theory, you could enter the RCU read-side critical section first,
|
||||
but it is more efficient to keep the entire RCU read-side critical
|
||||
section contained in the preempt-disable region as shown above.
|
||||
Of course, RCU read-side critical sections that extend outside of
|
||||
preempt-disable regions will work correctly, but such critical sections
|
||||
can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
|
||||
more work.
|
||||
And no, this is <i>not</i> an invitation to enclose all of your RCU
|
||||
read-side critical sections within preempt-disable regions, because
|
||||
doing so would degrade real-time response.
|
||||
|
||||
<p>
|
||||
This non-requirement appeared with preemptible RCU.
|
||||
If you need a grace period that waits on non-preemptible code regions, use
|
||||
<a href="#Sched Flavor">RCU-sched</a>.
|
||||
|
||||
<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
|
||||
|
||||
<p>
|
||||
|
|
@ -1383,6 +1331,7 @@ Classes of quality-of-implementation requirements are as follows:
|
|||
<ol>
|
||||
<li> <a href="#Specialization">Specialization</a>
|
||||
<li> <a href="#Performance and Scalability">Performance and Scalability</a>
|
||||
<li> <a href="#Forward Progress">Forward Progress</a>
|
||||
<li> <a href="#Composability">Composability</a>
|
||||
<li> <a href="#Corner Cases">Corner Cases</a>
|
||||
</ol>
|
||||
|
|
@ -1647,7 +1596,7 @@ used in place of <tt>synchronize_rcu()</tt> as follows:
|
|||
16 struct foo *p;
|
||||
17
|
||||
18 spin_lock(&gp_lock);
|
||||
19 p = rcu_dereference(gp);
|
||||
19 p = rcu_access_pointer(gp);
|
||||
20 if (!p) {
|
||||
21 spin_unlock(&gp_lock);
|
||||
22 return false;
|
||||
|
|
@ -1824,6 +1773,106 @@ so it is too early to tell whether they will stand the test of time.
|
|||
RCU thus provides a range of tools to allow updaters to strike the
|
||||
required tradeoff between latency, flexibility and CPU overhead.
|
||||
|
||||
<h3><a name="Forward Progress">Forward Progress</a></h3>
|
||||
|
||||
<p>
|
||||
In theory, delaying grace-period completion and callback invocation
|
||||
is harmless.
|
||||
In practice, not only are memory sizes finite but also callbacks sometimes
|
||||
do wakeups, and sufficiently deferred wakeups can be difficult
|
||||
to distinguish from system hangs.
|
||||
Therefore, RCU must provide a number of mechanisms to promote forward
|
||||
progress.
|
||||
|
||||
<p>
|
||||
These mechanisms are not foolproof, nor can they be.
|
||||
For one simple example, an infinite loop in an RCU read-side critical
|
||||
section must by definition prevent later grace periods from ever completing.
|
||||
For a more involved example, consider a 64-CPU system built with
|
||||
<tt>CONFIG_RCU_NOCB_CPU=y</tt> and booted with <tt>rcu_nocbs=1-63</tt>,
|
||||
where CPUs 1 through 63 spin in tight loops that invoke
|
||||
<tt>call_rcu()</tt>.
|
||||
Even if these tight loops also contain calls to <tt>cond_resched()</tt>
|
||||
(thus allowing grace periods to complete), CPU 0 simply will
|
||||
not be able to invoke callbacks as fast as the other 63 CPUs can
|
||||
register them, at least not until the system runs out of memory.
|
||||
In both of these examples, the Spiderman principle applies: With great
|
||||
power comes great responsibility.
|
||||
However, short of this level of abuse, RCU is required to
|
||||
ensure timely completion of grace periods and timely invocation of
|
||||
callbacks.
|
||||
|
||||
<p>
|
||||
RCU takes the following steps to encourage timely completion of
|
||||
grace periods:
|
||||
|
||||
<ol>
|
||||
<li> If a grace period fails to complete within 100 milliseconds,
|
||||
RCU causes future invocations of <tt>cond_resched()</tt> on
|
||||
the holdout CPUs to provide an RCU quiescent state.
|
||||
RCU also causes those CPUs' <tt>need_resched()</tt> invocations
|
||||
to return <tt>true</tt>, but only after the corresponding CPU's
|
||||
next scheduling-clock interrupt.
|
||||
<li> CPUs mentioned in the <tt>nohz_full</tt> kernel boot parameter
|
||||
can run indefinitely in the kernel without scheduling-clock
|
||||
interrupts, which defeats the above <tt>need_resched()</tt>
|
||||
stratagem.
|
||||
RCU will therefore invoke <tt>resched_cpu()</tt> on any
|
||||
<tt>nohz_full</tt> CPUs still holding out after
|
||||
109 milliseconds.
|
||||
<li> In kernels built with <tt>CONFIG_RCU_BOOST=y</tt>, if a given
|
||||
task that has been preempted within an RCU read-side critical
|
||||
section is holding out for more than 500 milliseconds,
|
||||
RCU will resort to priority boosting.
|
||||
<li> If a CPU is still holding out 10 seconds into the grace
|
||||
period, RCU will invoke <tt>resched_cpu()</tt> on it regardless
|
||||
of its <tt>nohz_full</tt> state.
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
The above values are defaults for systems running with <tt>HZ=1000</tt>.
|
||||
They will vary as the value of <tt>HZ</tt> varies, and can also be
|
||||
changed using the relevant Kconfig options and kernel boot parameters.
|
||||
RCU currently does not do much sanity checking of these
|
||||
parameters, so please use caution when changing them.
|
||||
Note that these forward-progress measures are provided only for RCU,
|
||||
not for
|
||||
<a href="#Sleepable RCU">SRCU</a> or
|
||||
<a href="#Tasks RCU">Tasks RCU</a>.
|
||||
|
||||
<p>
|
||||
RCU takes the following steps in <tt>call_rcu()</tt> to encourage timely
|
||||
invocation of callbacks when any given non-<tt>rcu_nocbs</tt> CPU has
|
||||
10,000 callbacks, or has 10,000 more callbacks than it had the last time
|
||||
encouragement was provided:
|
||||
|
||||
<ol>
|
||||
<li> Starts a grace period, if one is not already in progress.
|
||||
<li> Forces immediate checking for quiescent states, rather than
|
||||
waiting for three milliseconds to have elapsed since the
|
||||
beginning of the grace period.
|
||||
<li> Immediately tags the CPU's callbacks with their grace period
|
||||
completion numbers, rather than waiting for the <tt>RCU_SOFTIRQ</tt>
|
||||
handler to get around to it.
|
||||
<li> Lifts callback-execution batch limits, which speeds up callback
|
||||
invocation at the expense of degrading realtime response.
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
Again, these are default values when running at <tt>HZ=1000</tt>,
|
||||
and can be overridden.
|
||||
Again, these forward-progress measures are provided only for RCU,
|
||||
not for
|
||||
<a href="#Sleepable RCU">SRCU</a> or
|
||||
<a href="#Tasks RCU">Tasks RCU</a>.
|
||||
Even for RCU, callback-invocation forward progress for <tt>rcu_nocbs</tt>
|
||||
CPUs is much less well-developed, in part because workloads benefiting
|
||||
from <tt>rcu_nocbs</tt> CPUs tend to invoke <tt>call_rcu()</tt>
|
||||
relatively infrequently.
|
||||
If workloads emerge that need both <tt>rcu_nocbs</tt> CPUs and high
|
||||
<tt>call_rcu()</tt> invocation rates, then additional forward-progress
|
||||
work will be required.
|
||||
|
||||
<h3><a name="Composability">Composability</a></h3>
|
||||
|
||||
<p>
|
||||
|
|
@ -2165,14 +2214,9 @@ however, this is not a panacea because there would be severe restrictions
|
|||
on what operations those callbacks could invoke.
|
||||
|
||||
<p>
|
||||
Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
|
||||
<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
|
||||
(<a href="#Bottom-Half Flavor">discussed below</a>),
|
||||
<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
|
||||
Perhaps surprisingly, <tt>synchronize_rcu()</tt> and
|
||||
<tt>synchronize_rcu_expedited()</tt>,
|
||||
<tt>synchronize_rcu_bh_expedited()</tt>, and
|
||||
<tt>synchronize_sched_expedited()</tt>
|
||||
will all operate normally
|
||||
will operate normally
|
||||
during very early boot, the reason being that there is only one CPU
|
||||
and preemption is disabled.
|
||||
This means that the call <tt>synchronize_rcu()</tt> (or friends)
|
||||
|
|
@ -2269,12 +2313,23 @@ Thankfully, RCU update-side primitives, including
|
|||
The name notwithstanding, some Linux-kernel architectures
|
||||
can have nested NMIs, which RCU must handle correctly.
|
||||
Andy Lutomirski
|
||||
<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
|
||||
<a href="https://lkml.kernel.org/r/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
|
||||
with this requirement;
|
||||
he also kindly surprised me with
|
||||
<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
|
||||
<a href="https://lkml.kernel.org/r/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
|
||||
that meets this requirement.
|
||||
|
||||
<p>
|
||||
Furthermore, NMI handlers can be interrupted by what appear to RCU
|
||||
to be normal interrupts.
|
||||
One way that this can happen is for code that directly invokes
|
||||
<tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt> to be called
|
||||
from an NMI handler.
|
||||
This astonishing fact of life prompted the current code structure,
|
||||
which has <tt>rcu_irq_enter()</tt> invoking <tt>rcu_nmi_enter()</tt>
|
||||
and <tt>rcu_irq_exit()</tt> invoking <tt>rcu_nmi_exit()</tt>.
|
||||
And yes, I also learned of this requirement the hard way.
|
||||
|
||||
<h3><a name="Loadable Modules">Loadable Modules</a></h3>
|
||||
|
||||
<p>
|
||||
|
|
@ -2290,7 +2345,7 @@ via <tt>del_timer_sync()</tt> or similar.
|
|||
<p>
|
||||
Unfortunately, there is no way to cancel an RCU callback;
|
||||
once you invoke <tt>call_rcu()</tt>, the callback function is
|
||||
going to eventually be invoked, unless the system goes down first.
|
||||
eventually going to be invoked, unless the system goes down first.
|
||||
Because it is normally considered socially irresponsible to crash the system
|
||||
in response to a module unload request, we need some other way
|
||||
to deal with in-flight RCU callbacks.
|
||||
|
|
@ -2394,30 +2449,9 @@ when invoked from a CPU-hotplug notifier.
|
|||
<p>
|
||||
RCU depends on the scheduler, and the scheduler uses RCU to
|
||||
protect some of its data structures.
|
||||
This means the scheduler is forbidden from acquiring
|
||||
the runqueue locks and the priority-inheritance locks
|
||||
in the middle of an outermost RCU read-side critical section unless either
|
||||
(1) it releases them before exiting that same
|
||||
RCU read-side critical section, or
|
||||
(2) interrupts are disabled across
|
||||
that entire RCU read-side critical section.
|
||||
This same prohibition also applies (recursively!) to any lock that is acquired
|
||||
while holding any lock to which this prohibition applies.
|
||||
Adhering to this rule prevents preemptible RCU from invoking
|
||||
<tt>rcu_read_unlock_special()</tt> while either runqueue or
|
||||
priority-inheritance locks are held, thus avoiding deadlock.
|
||||
|
||||
<p>
|
||||
Prior to v4.4, it was only necessary to disable preemption across
|
||||
RCU read-side critical sections that acquired scheduler locks.
|
||||
In v4.4, expedited grace periods started using IPIs, and these
|
||||
IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
|
||||
Therefore, this expedited-grace-period change required disabling of
|
||||
interrupts, not just preemption.
|
||||
|
||||
<p>
|
||||
For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
|
||||
implementation must be written carefully to avoid similar deadlocks.
|
||||
The preemptible-RCU <tt>rcu_read_unlock()</tt>
|
||||
implementation must therefore be written carefully to avoid deadlocks
|
||||
involving the scheduler's runqueue and priority-inheritance locks.
|
||||
In particular, <tt>rcu_read_unlock()</tt> must tolerate an
|
||||
interrupt where the interrupt handler invokes both
|
||||
<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
|
||||
|
|
@ -2426,7 +2460,7 @@ negative nesting levels to avoid destructive recursion via
|
|||
interrupt handler's use of RCU.
|
||||
|
||||
<p>
|
||||
This pair of mutual scheduler-RCU requirements came as a
|
||||
This scheduler-RCU requirement came as a
|
||||
<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
|
||||
|
||||
<p>
|
||||
|
|
@ -2437,9 +2471,42 @@ when running context-switch-heavy workloads when built with
|
|||
<tt>CONFIG_NO_HZ_FULL=y</tt>
|
||||
<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
|
||||
RCU has made good progress towards meeting this requirement, even
|
||||
for context-switch-have <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
|
||||
for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
|
||||
but there is room for further improvement.
|
||||
|
||||
<p>
|
||||
It is forbidden to hold any of the scheduler's runqueue or priority-inheritance
|
||||
spinlocks across an <tt>rcu_read_unlock()</tt> unless interrupts have been
|
||||
disabled across the entire RCU read-side critical section, that is,
|
||||
up to and including the matching <tt>rcu_read_lock()</tt>.
|
||||
Violating this restriction can result in deadlocks involving these
|
||||
scheduler spinlocks.
|
||||
There was hope that this restriction might be lifted when interrupt-disabled
|
||||
calls to <tt>rcu_read_unlock()</tt> started deferring the reporting of
|
||||
the resulting RCU-preempt quiescent state until the end of the corresponding
|
||||
interrupts-disabled region.
|
||||
Unfortunately, timely reporting of the corresponding quiescent state
|
||||
to expedited grace periods requires a call to <tt>raise_softirq()</tt>,
|
||||
which can acquire these scheduler spinlocks.
|
||||
In addition, real-time systems using RCU priority boosting
|
||||
need this restriction to remain in effect because deferred
|
||||
quiescent-state reporting would also defer deboosting, which in turn
|
||||
would degrade real-time latencies.
|
||||
|
||||
<p>
|
||||
In theory, if a given RCU read-side critical section could be
|
||||
guaranteed to be less than one second in duration, holding a scheduler
|
||||
spinlock across that critical section's <tt>rcu_read_unlock()</tt>
|
||||
would require only that preemption be disabled across the entire
|
||||
RCU read-side critical section, not interrupts.
|
||||
Unfortunately, given the possibility of vCPU preemption, long-running
|
||||
interrupts, and so on, it is not possible in practice to guarantee
|
||||
that a given RCU read-side critical section will complete in less than
|
||||
one second.
|
||||
Therefore, as noted above, if scheduler spinlocks are held across
|
||||
a given call to <tt>rcu_read_unlock()</tt>, interrupts must be
|
||||
disabled across the entire RCU read-side critical section.
|
||||
|
||||
<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
|
||||
|
||||
<p>
|
||||
|
|
@ -2850,15 +2917,22 @@ The other four flavors are listed below, with requirements for each
|
|||
described in a separate section.
|
||||
|
||||
<ol>
|
||||
<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
|
||||
<li> <a href="#Sched Flavor">Sched Flavor</a>
|
||||
<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor (Historical)</a>
|
||||
<li> <a href="#Sched Flavor">Sched Flavor (Historical)</a>
|
||||
<li> <a href="#Sleepable RCU">Sleepable RCU</a>
|
||||
<li> <a href="#Tasks RCU">Tasks RCU</a>
|
||||
<li> <a href="#Waiting for Multiple Grace Periods">
|
||||
Waiting for Multiple Grace Periods</a>
|
||||
</ol>
|
||||
|
||||
<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
|
||||
<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor (Historical)</a></h3>
|
||||
|
||||
<p>
|
||||
The RCU-bh flavor of RCU has since been expressed in terms of
|
||||
the other RCU flavors as part of a consolidation of the three
|
||||
flavors into a single flavor.
|
||||
The read-side API remains, and continues to disable softirq and to
|
||||
be accounted for by lockdep.
|
||||
Much of the material in this section is therefore strictly historical
|
||||
in nature.
|
||||
|
||||
<p>
|
||||
The softirq-disable (AKA “bottom-half”,
|
||||
|
|
@ -2918,8 +2992,20 @@ includes
|
|||
<tt>call_rcu_bh()</tt>,
|
||||
<tt>rcu_barrier_bh()</tt>, and
|
||||
<tt>rcu_read_lock_bh_held()</tt>.
|
||||
However, the update-side APIs are now simple wrappers for other RCU
|
||||
flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt
|
||||
otherwise.
|
||||
|
||||
<h3><a name="Sched Flavor">Sched Flavor</a></h3>
|
||||
<h3><a name="Sched Flavor">Sched Flavor (Historical)</a></h3>
|
||||
|
||||
<p>
|
||||
The RCU-sched flavor of RCU has since been expressed in terms of
|
||||
the other RCU flavors as part of a consolidation of the three
|
||||
flavors into a single flavor.
|
||||
The read-side API remains, and continues to disable preemption and to
|
||||
be accounted for by lockdep.
|
||||
Much of the material in this section is therefore strictly historical
|
||||
in nature.
|
||||
|
||||
<p>
|
||||
Before preemptible RCU, waiting for an RCU grace period had the
|
||||
|
|
@ -3013,7 +3099,7 @@ If you block forever in one of a given domain's SRCU read-side critical
|
|||
sections, then that domain's grace periods will also be blocked forever.
|
||||
Of course, one good way to block forever is to deadlock, which can
|
||||
happen if any operation in a given domain's SRCU read-side critical
|
||||
section can block waiting, either directly or indirectly, for that domain's
|
||||
section can wait, either directly or indirectly, for that domain's
|
||||
grace period to elapse.
|
||||
For example, this results in a self-deadlock:
|
||||
|
||||
|
|
@ -3053,12 +3139,18 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
|
|||
guarantees a full memory barrier.
|
||||
|
||||
<p>
|
||||
Also unlike other RCU flavors, SRCU's callbacks-wait function
|
||||
<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
|
||||
though this is not necessarily a good idea.
|
||||
The reason that this is possible is that SRCU is insensitive
|
||||
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
|
||||
need not exclude CPU-hotplug operations.
|
||||
Also unlike other RCU flavors, <tt>synchronize_srcu()</tt> may <b>not</b>
|
||||
be invoked from CPU-hotplug notifiers, due to the fact that SRCU grace
|
||||
periods make use of timers and the possibility of timers being temporarily
|
||||
“stranded” on the outgoing CPU.
|
||||
This stranding of timers means that timers posted to the outgoing CPU
|
||||
will not fire until late in the CPU-hotplug process.
|
||||
The problem is that if a notifier is waiting on an SRCU grace period,
|
||||
that grace period is waiting on a timer, and that timer is stranded on the
|
||||
outgoing CPU, then the notifier will never be awakened, in other words,
|
||||
deadlock has occurred.
|
||||
This same situation of course also prohibits <tt>srcu_barrier()</tt>
|
||||
from being invoked from CPU-hotplug notifiers.
|
||||
|
||||
<p>
|
||||
SRCU also differs from other RCU flavors in that SRCU's expedited and
|
||||
|
|
@ -3139,94 +3231,14 @@ The tasks-RCU API is quite compact, consisting only of
|
|||
<tt>call_rcu_tasks()</tt>,
|
||||
<tt>synchronize_rcu_tasks()</tt>, and
|
||||
<tt>rcu_barrier_tasks()</tt>.
|
||||
|
||||
<h3><a name="Waiting for Multiple Grace Periods">
|
||||
Waiting for Multiple Grace Periods</a></h3>
|
||||
|
||||
<p>
|
||||
Perhaps you have an RCU protected data structure that is accessed from
|
||||
RCU read-side critical sections, from softirq handlers, and from
|
||||
hardware interrupt handlers.
|
||||
That is three flavors of RCU, the normal flavor, the bottom-half flavor,
|
||||
and the sched flavor.
|
||||
How to wait for a compound grace period?
|
||||
|
||||
<p>
|
||||
The best approach is usually to “just say no!” and
|
||||
insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>
|
||||
around each RCU read-side critical section, regardless of what
|
||||
environment it happens to be in.
|
||||
But suppose that some of the RCU read-side critical sections are
|
||||
on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt>
|
||||
is not a viable option, so that <tt>rcu_read_lock()</tt> and
|
||||
<tt>rcu_read_unlock()</tt> are not free.
|
||||
What then?
|
||||
|
||||
<p>
|
||||
You <i>could</i> wait on all three grace periods in succession, as follows:
|
||||
|
||||
<blockquote>
|
||||
<pre>
|
||||
1 synchronize_rcu();
|
||||
2 synchronize_rcu_bh();
|
||||
3 synchronize_sched();
|
||||
</pre>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
This works, but triples the update-side latency penalty.
|
||||
In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt>
|
||||
may be used to wait on all three flavors of grace period concurrently:
|
||||
|
||||
<blockquote>
|
||||
<pre>
|
||||
1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched);
|
||||
</pre>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
But what if it is necessary to also wait on SRCU?
|
||||
This can be done as follows:
|
||||
|
||||
<blockquote>
|
||||
<pre>
|
||||
1 static void call_my_srcu(struct rcu_head *head,
|
||||
2 void (*func)(struct rcu_head *head))
|
||||
3 {
|
||||
4 call_srcu(&my_srcu, head, func);
|
||||
5 }
|
||||
6
|
||||
7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu);
|
||||
</pre>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
If you needed to wait on multiple different flavors of SRCU
|
||||
(but why???), you would need to create a wrapper function resembling
|
||||
<tt>call_my_srcu()</tt> for each SRCU flavor.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
But what if I need to wait for multiple RCU flavors, but I also need
|
||||
the grace periods to be expedited?
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
If you are using expedited grace periods, there should be less penalty
|
||||
for waiting on them in succession.
|
||||
But if that is nevertheless a problem, you can use workqueues
|
||||
or multiple kthreads to wait on the various expedited grace
|
||||
periods concurrently.
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
Again, it is usually better to adjust the RCU read-side critical sections
|
||||
to use a single flavor of RCU, but when this is not feasible, you can use
|
||||
<tt>synchronize_rcu_mult()</tt>.
|
||||
In <tt>CONFIG_PREEMPT=n</tt> kernels, trampolines cannot be preempted,
|
||||
so these APIs map to
|
||||
<tt>call_rcu()</tt>,
|
||||
<tt>synchronize_rcu()</tt>, and
|
||||
<tt>rcu_barrier()</tt>, respectively.
|
||||
In <tt>CONFIG_PREEMPT=y</tt> kernels, trampolines can be preempted,
|
||||
and these three APIs are therefore implemented by separate functions
|
||||
that check for voluntary context switches.
|
||||
|
||||
<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
|
||||
|
||||
|
|
@ -3237,12 +3249,6 @@ If this becomes a serious problem, it will be necessary to rework the
|
|||
grace-period state machine so as to avoid the need for the additional
|
||||
latency.
|
||||
|
||||
<p>
|
||||
Expedited grace periods scan the CPUs, so their latency and overhead
|
||||
increases with increasing numbers of CPUs.
|
||||
If this becomes a serious problem on large systems, it will be necessary
|
||||
to do some redesign to avoid this scalability problem.
|
||||
|
||||
<p>
|
||||
RCU disables CPU hotplug in a few places, perhaps most notably in the
|
||||
<tt>rcu_barrier()</tt> operations.
|
||||
|
|
@ -3287,11 +3293,6 @@ Please note that arrangements that require RCU to remap CPU numbers will
|
|||
require extremely good demonstration of need and full exploration of
|
||||
alternatives.
|
||||
|
||||
<p>
|
||||
There is an embarrassingly large number of flavors of RCU, and this
|
||||
number has been increasing over time.
|
||||
Perhaps it will be possible to combine some at some future date.
|
||||
|
||||
<p>
|
||||
RCU's various kthreads are reasonably recent additions.
|
||||
It is quite likely that adjustments will be required to more gracefully
|
||||
|
|
@ -3303,6 +3304,11 @@ For example, RCU callback overhead might be charged back to the
|
|||
originating <tt>call_rcu()</tt> instance, though probably not
|
||||
in production kernels.
|
||||
|
||||
<p>
|
||||
Additional work may be required to provide reasonable forward-progress
|
||||
guarantees under heavy load for grace periods and for callback
|
||||
invocation.
|
||||
|
||||
<h2><a name="Summary">Summary</a></h2>
|
||||
|
||||
<p>
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
||||
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
||||
is less readable.
|
||||
is less readable and prevents lockdep from detecting locking issues.
|
||||
|
||||
Letting RCU-protected pointers "leak" out of an RCU read-side
|
||||
critical section is every bit as bad as letting them leak out
|
||||
|
|
@ -285,11 +285,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||
here is that superuser already has lots of ways to crash
|
||||
the machine.
|
||||
|
||||
d. Use call_rcu_bh() rather than call_rcu(), in order to take
|
||||
advantage of call_rcu_bh()'s faster grace periods. (This
|
||||
is only a partial solution, though.)
|
||||
|
||||
e. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
number of updates per grace period.
|
||||
|
||||
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
|
||||
|
|
@ -324,37 +320,14 @@ over a rather long period of time, but improvements are always welcome!
|
|||
will break Alpha, cause aggressive compilers to generate bad code,
|
||||
and confuse people trying to read your code.
|
||||
|
||||
11. Note that synchronize_rcu() -only- guarantees to wait until
|
||||
all currently executing rcu_read_lock()-protected RCU read-side
|
||||
critical sections complete. It does -not- necessarily guarantee
|
||||
that all currently running interrupts, NMIs, preempt_disable()
|
||||
code, or idle loops will complete. Therefore, if your
|
||||
read-side critical sections are protected by something other
|
||||
than rcu_read_lock(), do -not- use synchronize_rcu().
|
||||
|
||||
Similarly, disabling preemption is not an acceptable substitute
|
||||
for rcu_read_lock(). Code that attempts to use preemption
|
||||
disabling where it should be using rcu_read_lock() will break
|
||||
in CONFIG_PREEMPT=y kernel builds.
|
||||
|
||||
If you want to wait for interrupt handlers, NMI handlers, and
|
||||
code under the influence of preempt_disable(), you instead
|
||||
need to use synchronize_irq() or synchronize_sched().
|
||||
|
||||
This same limitation also applies to synchronize_rcu_bh()
|
||||
and synchronize_srcu(), as well as to the asynchronous and
|
||||
expedited forms of the three primitives, namely call_rcu(),
|
||||
call_rcu_bh(), call_srcu(), synchronize_rcu_expedited(),
|
||||
synchronize_rcu_bh_expedited(), and synchronize_srcu_expedited().
|
||||
|
||||
12. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
||||
acquisition of that lock will result in deadlock as soon as
|
||||
the RCU softirq handler happens to run your RCU callback while
|
||||
interrupting that acquisition's critical section.
|
||||
|
||||
13. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
12. RCU callbacks can be and are executed in parallel. In many cases,
|
||||
the callback code is simply a wrapper around kfree(), so that this
|
||||
is not an issue (or, more accurately, to the extent that it is
|
||||
an issue, the memory-allocator locking handles it). However,
|
||||
|
|
@ -370,7 +343,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||
not the case, a self-spawning RCU callback would prevent the
|
||||
victim CPU from ever going offline.)
|
||||
|
||||
14. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
13. Unlike other forms of RCU, it -is- permissible to block in an
|
||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
|
||||
Please note that if you don't need to sleep in read-side critical
|
||||
|
|
@ -414,7 +387,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
||||
SRCU just as they do to other forms of RCU.
|
||||
|
||||
15. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||
is to wait until all pre-existing readers have finished before
|
||||
carrying out some otherwise-destructive operation. It is
|
||||
therefore critically important to -first- remove any path
|
||||
|
|
@ -426,13 +399,13 @@ over a rather long period of time, but improvements are always welcome!
|
|||
is the caller's responsibility to guarantee that any subsequent
|
||||
readers will execute safely.
|
||||
|
||||
16. The various RCU read-side primitives do -not- necessarily contain
|
||||
15. The various RCU read-side primitives do -not- necessarily contain
|
||||
memory barriers. You should therefore plan for the CPU
|
||||
and the compiler to freely reorder code into and out of RCU
|
||||
read-side critical sections. It is the responsibility of the
|
||||
RCU update-side primitives to deal with this.
|
||||
|
||||
17. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||
__rcu sparse checks to validate your RCU code. These can help
|
||||
find problems as follows:
|
||||
|
||||
|
|
@ -455,7 +428,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||
These debugging aids can help you find problems that are
|
||||
otherwise extremely difficult to spot.
|
||||
|
||||
18. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
17. If you register a callback using call_rcu(), call_rcu_bh(),
|
||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
||||
within a loadable module, then it is necessary to wait for
|
||||
all pending callbacks to be invoked after the last invocation
|
||||
|
|
@ -469,8 +442,8 @@ over a rather long period of time, but improvements are always welcome!
|
|||
You instead need to use one of the barrier functions:
|
||||
|
||||
o call_rcu() -> rcu_barrier()
|
||||
o call_rcu_bh() -> rcu_barrier_bh()
|
||||
o call_rcu_sched() -> rcu_barrier_sched()
|
||||
o call_rcu_bh() -> rcu_barrier()
|
||||
o call_rcu_sched() -> rcu_barrier()
|
||||
o call_srcu() -> srcu_barrier()
|
||||
|
||||
However, these barrier functions are absolutely -not- guaranteed
|
||||
|
|
|
|||
|
|
@ -14,9 +14,9 @@ being the real world and all that.
|
|||
So let's look at an example RCU lockdep splat from 3.0-rc5, one that
|
||||
has long since been fixed:
|
||||
|
||||
===============================
|
||||
[ INFO: suspicious RCU usage. ]
|
||||
-------------------------------
|
||||
=============================
|
||||
WARNING: suspicious RCU usage
|
||||
-----------------------------
|
||||
block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
|
||||
|
||||
other info that might help us debug this:
|
||||
|
|
@ -24,11 +24,11 @@ other info that might help us debug this:
|
|||
|
||||
rcu_scheduler_active = 1, debug_locks = 0
|
||||
3 locks held by scsi_scan_6/1552:
|
||||
#0: (&shost->scan_mutex){+.+.+.}, at: [<ffffffff8145efca>]
|
||||
#0: (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
|
||||
scsi_scan_host_selected+0x5a/0x150
|
||||
#1: (&eq->sysfs_lock){+.+...}, at: [<ffffffff812a5032>]
|
||||
#1: (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
|
||||
elevator_exit+0x22/0x60
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-...}, at: [<ffffffff812b6233>]
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
|
||||
cfq_exit_queue+0x43/0x190
|
||||
|
||||
stack backtrace:
|
||||
|
|
|
|||
|
|
@ -87,7 +87,3 @@ o Where can I find more information on RCU?
|
|||
|
||||
See the RTFP.txt file in this directory.
|
||||
Or point your browser at http://www.rdrop.com/users/paulmck/RCU/.
|
||||
|
||||
o What are all these files in this directory?
|
||||
|
||||
See 00-INDEX for the list.
|
||||
|
|
|
|||
|
|
@ -16,12 +16,9 @@ o A CPU looping in an RCU read-side critical section.
|
|||
|
||||
o A CPU looping with interrupts disabled.
|
||||
|
||||
o A CPU looping with preemption disabled. This condition can
|
||||
result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
|
||||
stalls.
|
||||
o A CPU looping with preemption disabled.
|
||||
|
||||
o A CPU looping with bottom halves disabled. This condition can
|
||||
result in RCU-sched and RCU-bh stalls.
|
||||
o A CPU looping with bottom halves disabled.
|
||||
|
||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
|
||||
without invoking schedule(). If the looping in the kernel is
|
||||
|
|
@ -87,9 +84,9 @@ o A hardware failure. This is quite unlikely, but has occurred
|
|||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
leading to the realization that the CPU had failed.
|
||||
|
||||
The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
|
||||
warning. Note that SRCU does -not- have CPU stall warnings. Please note
|
||||
that RCU only detects CPU stalls when there is a grace period in progress.
|
||||
The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warnings.
|
||||
Note that SRCU does -not- have CPU stall warnings. Please note that
|
||||
RCU only detects CPU stalls when there is a grace period in progress.
|
||||
No grace period, no CPU stall warnings.
|
||||
|
||||
To diagnose the cause of the stall, inspect the stack traces.
|
||||
|
|
@ -179,9 +176,8 @@ causing stalls, and that the stall was affecting RCU-sched. This message
|
|||
will normally be followed by stack dumps for each CPU. Please note that
|
||||
PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, and that
|
||||
the tasks will be indicated by PID, for example, "P3421". It is even
|
||||
possible for a rcu_preempt_state stall to be caused by both CPUs -and-
|
||||
tasks, in which case the offending CPUs and tasks will all be called
|
||||
out in the list.
|
||||
possible for an rcu_state stall to be caused by both CPUs -and- tasks,
|
||||
in which case the offending CPUs and tasks will all be called out in the list.
|
||||
|
||||
CPU 2's "(3 GPs behind)" indicates that this CPU has not interacted with
|
||||
the RCU core for the past three grace periods. In contrast, CPU 16's "(0
|
||||
|
|
@ -209,7 +205,7 @@ handlers are no longer able to execute on this CPU. This can happen if
|
|||
the stalled CPU is spinning with interrupts disabled, or, in -rt
|
||||
kernels, if a high-priority process is starving RCU's softirq handler.
|
||||
|
||||
The "fps=" shows the number of force-quiescent-state idle/offline
|
||||
The "fqs=" shows the number of force-quiescent-state idle/offline
|
||||
detection passes that the grace-period kthread has made across this
|
||||
CPU since the last time that this CPU noted the beginning of a grace
|
||||
period.
|
||||
|
|
@ -223,17 +219,18 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
|||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||
for each CPU:
|
||||
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D
|
||||
|
||||
The "last_accelerate:" prints the low-order 16 bits (in hex) of the
|
||||
jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
|
||||
from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
|
||||
rcu_prepare_for_idle(). The "nonlazy_posted:" prints the number
|
||||
of non-lazy callbacks posted since the last call to rcu_needs_cpu().
|
||||
Finally, an "L" indicates that there are currently no non-lazy callbacks
|
||||
("." is printed otherwise, as shown above) and "D" indicates that
|
||||
dyntick-idle processing is enabled ("." is printed otherwise, for example,
|
||||
if disabled via the "nohz=" kernel boot parameter).
|
||||
rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback
|
||||
status, so that an "l" indicates that all callbacks were lazy at the start
|
||||
of the last idle period and an "L" indicates that there are currently
|
||||
no non-lazy callbacks (in both cases, "." is printed otherwise, as
|
||||
shown above) and "D" indicates that dyntick-idle processing is enabled
|
||||
("." is printed otherwise, for example, if disabled via the "nohz="
|
||||
kernel boot parameter).
|
||||
|
||||
If the grace period ends just as the stall warning starts printing,
|
||||
there will be a spurious stall-warning message, which will include
|
||||
|
|
|
|||
|
|
@ -10,173 +10,8 @@ status messages via printk(), which can be examined via the dmesg
|
|||
command (perhaps grepping for "torture"). The test is started
|
||||
when the module is loaded, and stops when the module is unloaded.
|
||||
|
||||
|
||||
MODULE PARAMETERS
|
||||
|
||||
This module has the following parameters:
|
||||
|
||||
fqs_duration Duration (in microseconds) of artificially induced bursts
|
||||
of force_quiescent_state() invocations. In RCU
|
||||
implementations having force_quiescent_state(), these
|
||||
bursts help force races between forcing a given grace
|
||||
period and that grace period ending on its own.
|
||||
|
||||
fqs_holdoff Holdoff time (in microseconds) between consecutive calls
|
||||
to force_quiescent_state() within a burst.
|
||||
|
||||
fqs_stutter Wait time (in seconds) between consecutive bursts
|
||||
of calls to force_quiescent_state().
|
||||
|
||||
gp_normal Make the fake writers use normal synchronous grace-period
|
||||
primitives.
|
||||
|
||||
gp_exp Make the fake writers use expedited synchronous grace-period
|
||||
primitives. If both gp_normal and gp_exp are set, or
|
||||
if neither gp_normal nor gp_exp are set, then randomly
|
||||
choose the primitive so that about 50% are normal and
|
||||
50% expedited. By default, neither are set, which
|
||||
gives best overall test coverage.
|
||||
|
||||
irqreader Says to invoke RCU readers from irq level. This is currently
|
||||
done via timers. Defaults to "1" for variants of RCU that
|
||||
permit this. (Or, more accurately, variants of RCU that do
|
||||
-not- permit this know to ignore this variable.)
|
||||
|
||||
n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted,
|
||||
in which case n_barrier_cbs specifies the number of
|
||||
RCU callbacks (and corresponding kthreads) to use for
|
||||
this testing. The value cannot be negative. If you
|
||||
specify this to be non-zero when torture_type indicates a
|
||||
synchronous RCU implementation (one for which a member of
|
||||
the synchronize_rcu() rather than the call_rcu() family is
|
||||
used -- see the documentation for torture_type below), an
|
||||
error will be reported and no testing will be carried out.
|
||||
|
||||
nfakewriters This is the number of RCU fake writer threads to run. Fake
|
||||
writer threads repeatedly use the synchronous "wait for
|
||||
current readers" function of the interface selected by
|
||||
torture_type, with a delay between calls to allow for various
|
||||
different numbers of writers running in parallel.
|
||||
nfakewriters defaults to 4, which provides enough parallelism
|
||||
to trigger special cases caused by multiple writers, such as
|
||||
the synchronize_srcu() early return optimization.
|
||||
|
||||
nreaders This is the number of RCU reading threads supported.
|
||||
The default is twice the number of CPUs. Why twice?
|
||||
To properly exercise RCU implementations with preemptible
|
||||
read-side critical sections.
|
||||
|
||||
onoff_interval
|
||||
The number of seconds between each attempt to execute a
|
||||
randomly selected CPU-hotplug operation. Defaults to
|
||||
zero, which disables CPU hotplugging. In HOTPLUG_CPU=n
|
||||
kernels, rcutorture will silently refuse to do any
|
||||
CPU-hotplug operations regardless of what value is
|
||||
specified for onoff_interval.
|
||||
|
||||
onoff_holdoff The number of seconds to wait until starting CPU-hotplug
|
||||
operations. This would normally only be used when
|
||||
rcutorture was built into the kernel and started
|
||||
automatically at boot time, in which case it is useful
|
||||
in order to avoid confusing boot-time code with CPUs
|
||||
coming and going.
|
||||
|
||||
shuffle_interval
|
||||
The number of seconds to keep the test threads affinitied
|
||||
to a particular subset of the CPUs, defaults to 3 seconds.
|
||||
Used in conjunction with test_no_idle_hz.
|
||||
|
||||
shutdown_secs The number of seconds to run the test before terminating
|
||||
the test and powering off the system. The default is
|
||||
zero, which disables test termination and system shutdown.
|
||||
This capability is useful for automated testing.
|
||||
|
||||
stall_cpu The number of seconds that a CPU should be stalled while
|
||||
within both an rcu_read_lock() and a preempt_disable().
|
||||
This stall happens only once per rcutorture run.
|
||||
If you need multiple stalls, use modprobe and rmmod to
|
||||
repeatedly run rcutorture. The default for stall_cpu
|
||||
is zero, which prevents rcutorture from stalling a CPU.
|
||||
|
||||
Note that attempts to rmmod rcutorture while the stall
|
||||
is ongoing will hang, so be careful what value you
|
||||
choose for this module parameter! In addition, too-large
|
||||
values for stall_cpu might well induce failures and
|
||||
warnings in other parts of the kernel. You have been
|
||||
warned!
|
||||
|
||||
stall_cpu_holdoff
|
||||
The number of seconds to wait after rcutorture starts
|
||||
before stalling a CPU. Defaults to 10 seconds.
|
||||
|
||||
stat_interval The number of seconds between output of torture
|
||||
statistics (via printk()). Regardless of the interval,
|
||||
statistics are printed when the module is unloaded.
|
||||
Setting the interval to zero causes the statistics to
|
||||
be printed -only- when the module is unloaded, and this
|
||||
is the default.
|
||||
|
||||
stutter The length of time to run the test before pausing for this
|
||||
same period of time. Defaults to "stutter=5", so as
|
||||
to run and pause for (roughly) five-second intervals.
|
||||
Specifying "stutter=0" causes the test to run continuously
|
||||
without pausing, which is the old default behavior.
|
||||
|
||||
test_boost Whether or not to test the ability of RCU to do priority
|
||||
boosting. Defaults to "test_boost=1", which performs
|
||||
RCU priority-inversion testing only if the selected
|
||||
RCU implementation supports priority boosting. Specifying
|
||||
"test_boost=0" never performs RCU priority-inversion
|
||||
testing. Specifying "test_boost=2" performs RCU
|
||||
priority-inversion testing even if the selected RCU
|
||||
implementation does not support RCU priority boosting,
|
||||
which can be used to test rcutorture's ability to
|
||||
carry out RCU priority-inversion testing.
|
||||
|
||||
test_boost_interval
|
||||
The number of seconds in an RCU priority-inversion test
|
||||
cycle. Defaults to "test_boost_interval=7". It is
|
||||
usually wise for this value to be relatively prime to
|
||||
the value selected for "stutter".
|
||||
|
||||
test_boost_duration
|
||||
The number of seconds to do RCU priority-inversion testing
|
||||
within any given "test_boost_interval". Defaults to
|
||||
"test_boost_duration=4".
|
||||
|
||||
test_no_idle_hz Whether or not to test the ability of RCU to operate in
|
||||
a kernel that disables the scheduling-clock interrupt to
|
||||
idle CPUs. Boolean parameter, "1" to test, "0" otherwise.
|
||||
Defaults to omitting this test.
|
||||
|
||||
torture_type The type of RCU to test, with string values as follows:
|
||||
|
||||
"rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(),
|
||||
along with expedited, synchronous, and polling
|
||||
variants.
|
||||
|
||||
"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
|
||||
call_rcu_bh(), along with expedited and synchronous
|
||||
variants.
|
||||
|
||||
"rcu_busted": This tests an intentionally incorrect version
|
||||
of RCU in order to help test rcutorture itself.
|
||||
|
||||
"srcu": srcu_read_lock(), srcu_read_unlock() and
|
||||
call_srcu(), along with expedited and
|
||||
synchronous variants.
|
||||
|
||||
"sched": preempt_disable(), preempt_enable(), and
|
||||
call_rcu_sched(), along with expedited,
|
||||
synchronous, and polling variants.
|
||||
|
||||
"tasks": voluntary context switch and call_rcu_tasks(),
|
||||
along with expedited and synchronous variants.
|
||||
|
||||
Defaults to "rcu".
|
||||
|
||||
verbose Enable debug printk()s. Default is disabled.
|
||||
|
||||
Module parameters are prefixed by "rcutorture." in
|
||||
Documentation/admin-guide/kernel-parameters.txt.
|
||||
|
||||
OUTPUT
|
||||
|
||||
|
|
|
|||
|
|
@ -266,7 +266,7 @@ rcu_dereference()
|
|||
unnecessary overhead on Alpha CPUs.
|
||||
|
||||
Note that the value returned by rcu_dereference() is valid
|
||||
only within the enclosing RCU read-side critical section.
|
||||
only within the enclosing RCU read-side critical section [1].
|
||||
For example, the following is -not- legal:
|
||||
|
||||
rcu_read_lock();
|
||||
|
|
@ -292,6 +292,19 @@ rcu_dereference()
|
|||
typically used indirectly, via the _rcu list-manipulation
|
||||
primitives, such as list_for_each_entry_rcu().
|
||||
|
||||
[1] The variant rcu_dereference_protected() can be used outside
|
||||
of an RCU read-side critical section as long as the usage is
|
||||
protected by locks acquired by the update-side code. This variant
|
||||
avoids the lockdep warning that would happen when using (for
|
||||
example) rcu_dereference() without rcu_read_lock() protection.
|
||||
Using rcu_dereference_protected() also has the advantage
|
||||
of permitting compiler optimizations that rcu_dereference()
|
||||
must prohibit. The rcu_dereference_protected() variant takes
|
||||
a lockdep expression to indicate which locks must be acquired
|
||||
by the caller. If the indicated protection is not provided,
|
||||
a lockdep splat is emitted. See RCU/Design/Requirements/Requirements.html
|
||||
and the API's code comments for more details and example usage.
|
||||
|
||||
The following diagram shows how each API communicates among the
|
||||
reader, updater, and reclaimer.
|
||||
|
||||
|
|
@ -322,28 +335,27 @@ to their callers and (2) call_rcu() callbacks may be invoked. Efficient
|
|||
implementations of the RCU infrastructure make heavy use of batching in
|
||||
order to amortize their overhead over many uses of the corresponding APIs.
|
||||
|
||||
There are no fewer than three RCU mechanisms in the Linux kernel; the
|
||||
diagram above shows the first one, which is by far the most commonly used.
|
||||
The rcu_dereference() and rcu_assign_pointer() primitives are used for
|
||||
all three mechanisms, but different defer and protect primitives are
|
||||
used as follows:
|
||||
There are at least three flavors of RCU usage in the Linux kernel. The diagram
|
||||
above shows the most common one. On the updater side, the rcu_assign_pointer(),
|
||||
synchronize_rcu() and call_rcu() primitives used are the same for all three
|
||||
flavors. However for protection (on the reader side), the primitives used vary
|
||||
depending on the flavor:
|
||||
|
||||
Defer Protect
|
||||
a. rcu_read_lock() / rcu_read_unlock()
|
||||
rcu_dereference()
|
||||
|
||||
a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
|
||||
call_rcu() rcu_dereference()
|
||||
b. rcu_read_lock_bh() / rcu_read_unlock_bh()
|
||||
local_bh_disable() / local_bh_enable()
|
||||
rcu_dereference_bh()
|
||||
|
||||
b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
|
||||
call_rcu_bh() rcu_dereference_bh()
|
||||
c. rcu_read_lock_sched() / rcu_read_unlock_sched()
|
||||
preempt_disable() / preempt_enable()
|
||||
local_irq_save() / local_irq_restore()
|
||||
hardirq enter / hardirq exit
|
||||
NMI enter / NMI exit
|
||||
rcu_dereference_sched()
|
||||
|
||||
c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
|
||||
call_rcu_sched() preempt_disable() / preempt_enable()
|
||||
local_irq_save() / local_irq_restore()
|
||||
hardirq enter / hardirq exit
|
||||
NMI enter / NMI exit
|
||||
rcu_dereference_sched()
|
||||
|
||||
These three mechanisms are used as follows:
|
||||
These three flavors are used as follows:
|
||||
|
||||
a. RCU applied to normal data structures.
|
||||
|
||||
|
|
@ -548,7 +560,7 @@ presents two such "toy" implementations of RCU, one that is implemented
|
|||
in terms of familiar locking primitives, and another that more closely
|
||||
resembles "classic" RCU. Both are way too simple for real-world use,
|
||||
lacking both functionality and performance. However, they are useful
|
||||
in getting a feel for how RCU works. See kernel/rcupdate.c for a
|
||||
in getting a feel for how RCU works. See kernel/rcu/update.c for a
|
||||
production-quality implementation, and see:
|
||||
|
||||
http://www.rdrop.com/users/paulmck/RCU
|
||||
|
|
@ -867,18 +879,20 @@ RCU: Critical sections Grace period Barrier
|
|||
|
||||
bh: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
|
||||
rcu_read_unlock_bh synchronize_rcu_bh
|
||||
rcu_dereference_bh synchronize_rcu_bh_expedited
|
||||
rcu_read_lock_bh call_rcu rcu_barrier
|
||||
rcu_read_unlock_bh synchronize_rcu
|
||||
[local_bh_disable] synchronize_rcu_expedited
|
||||
[and friends]
|
||||
rcu_dereference_bh
|
||||
rcu_dereference_bh_check
|
||||
rcu_dereference_bh_protected
|
||||
rcu_read_lock_bh_held
|
||||
|
||||
sched: Critical sections Grace period Barrier
|
||||
|
||||
rcu_read_lock_sched synchronize_sched rcu_barrier_sched
|
||||
rcu_read_unlock_sched call_rcu_sched
|
||||
[preempt_disable] synchronize_sched_expedited
|
||||
rcu_read_lock_sched call_rcu rcu_barrier
|
||||
rcu_read_unlock_sched synchronize_rcu
|
||||
[preempt_disable] synchronize_rcu_expedited
|
||||
[and friends]
|
||||
rcu_read_lock_sched_notrace
|
||||
rcu_read_unlock_sched_notrace
|
||||
|
|
@ -890,8 +904,8 @@ sched: Critical sections Grace period Barrier
|
|||
|
||||
SRCU: Critical sections Grace period Barrier
|
||||
|
||||
srcu_read_lock synchronize_srcu srcu_barrier
|
||||
srcu_read_unlock call_srcu
|
||||
srcu_read_lock call_srcu srcu_barrier
|
||||
srcu_read_unlock synchronize_srcu
|
||||
srcu_dereference synchronize_srcu_expedited
|
||||
srcu_dereference_check
|
||||
srcu_read_lock_held
|
||||
|
|
@ -934,7 +948,8 @@ c. Do you need to treat NMI handlers, hardirq handlers,
|
|||
d. Do you need RCU grace periods to complete even in the face
|
||||
of softirq monopolization of one or more of the CPUs? For
|
||||
example, is your code subject to network-based denial-of-service
|
||||
attacks? If so, you need RCU-bh.
|
||||
attacks? If so, you should disable softirq across your readers,
|
||||
for example, by using rcu_read_lock_bh().
|
||||
|
||||
e. Is your workload too update-intensive for normal use of
|
||||
RCU, but inappropriate for other synchronization mechanisms?
|
||||
|
|
@ -1033,7 +1048,7 @@ Answer: Just as PREEMPT_RT permits preemption of spinlock
|
|||
spinlocks blocking while in RCU read-side critical
|
||||
sections.
|
||||
|
||||
Why the apparent inconsistency? Because it is it
|
||||
Why the apparent inconsistency? Because it is
|
||||
possible to use priority boosting to keep the RCU
|
||||
grace periods short if need be (for example, if running
|
||||
short of memory). In contrast, if blocking waiting
|
||||
|
|
|
|||
73
Documentation/accounting/psi.txt
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
================================
|
||||
PSI - Pressure Stall Information
|
||||
================================
|
||||
|
||||
:Date: April, 2018
|
||||
:Author: Johannes Weiner <hannes@cmpxchg.org>
|
||||
|
||||
When CPU, memory or IO devices are contended, workloads experience
|
||||
latency spikes, throughput losses, and run the risk of OOM kills.
|
||||
|
||||
Without an accurate measure of such contention, users are forced to
|
||||
either play it safe and under-utilize their hardware resources, or
|
||||
roll the dice and frequently suffer the disruptions resulting from
|
||||
excessive overcommit.
|
||||
|
||||
The psi feature identifies and quantifies the disruptions caused by
|
||||
such resource crunches and the time impact it has on complex workloads
|
||||
or even entire systems.
|
||||
|
||||
Having an accurate measure of productivity losses caused by resource
|
||||
scarcity aids users in sizing workloads to hardware--or provisioning
|
||||
hardware according to workload demand.
|
||||
|
||||
As psi aggregates this information in realtime, systems can be managed
|
||||
dynamically using techniques such as load shedding, migrating jobs to
|
||||
other systems or data centers, or strategically pausing or killing low
|
||||
priority or restartable batch jobs.
|
||||
|
||||
This allows maximizing hardware utilization without sacrificing
|
||||
workload health or risking major disruptions such as OOM kills.
|
||||
|
||||
Pressure interface
|
||||
==================
|
||||
|
||||
Pressure information for each resource is exported through the
|
||||
respective file in /proc/pressure/ -- cpu, memory, and io.
|
||||
|
||||
The format for CPU is as such:
|
||||
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
|
||||
and for memory and IO:
|
||||
|
||||
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||
|
||||
The "some" line indicates the share of time in which at least some
|
||||
tasks are stalled on a given resource.
|
||||
|
||||
The "full" line indicates the share of time in which all non-idle
|
||||
tasks are stalled on a given resource simultaneously. In this state
|
||||
actual CPU cycles are going to waste, and a workload that spends
|
||||
extended time in this state is considered to be thrashing. This has
|
||||
severe impact on performance, and it's useful to distinguish this
|
||||
situation from a state where some tasks are stalled but the CPU is
|
||||
still doing productive work. As such, time spent in this subset of the
|
||||
stall state is tracked separately and exported in the "full" averages.
|
||||
|
||||
The ratios (in %) are tracked as recent trends over ten, sixty, and
|
||||
three hundred second windows, which gives insight into short term events
|
||||
as well as medium and long term trends. The total absolute stall time
|
||||
(in us) is tracked and exported as well, to allow detection of latency
|
||||
spikes which wouldn't necessarily make a dent in the time averages,
|
||||
or to average trends over custom time frames.
|
||||
|
||||
Cgroup2 interface
|
||||
=================
|
||||
|
||||
In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
|
||||
mounted, pressure stall information is also tracked for tasks grouped
|
||||
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
||||
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
||||
the same as the /proc/pressure/ files.
|
||||
|
|
@ -23,7 +23,7 @@ kernel.
|
|||
|
||||
The resultant userspace tool binary is then located at:
|
||||
|
||||
tools/acpi/power/acpi/acpidbg/acpidbg
|
||||
tools/power/acpi/acpidbg
|
||||
|
||||
It can be installed to system directories by running "make install" (as a
|
||||
sufficiently privileged user).
|
||||
|
|
@ -35,7 +35,7 @@ kernel.
|
|||
|
||||
# mount -t debugfs none /sys/kernel/debug
|
||||
# modprobe acpi_dbg
|
||||
# tools/acpi/power/acpi/acpidbg/acpidbg
|
||||
# tools/power/acpi/acpidbg
|
||||
|
||||
That spawns the interactive AML debugger environment where you can execute
|
||||
debugger commands.
|
||||
|
|
|
|||
|
|
@ -14,6 +14,10 @@ upgrade the ACPI execution environment that is defined by the ACPI tables
|
|||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||
modified, more recent version one, or installing brand new ACPI tables.
|
||||
|
||||
When building initrd with kernel in a single image, option
|
||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||
feature to work.
|
||||
|
||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
|
||||
drivers/acpi/tables.c.
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ If you want to use SELinux, chances are you will want
|
|||
to use the distro-provided policies, or install the
|
||||
latest reference policy release from
|
||||
|
||||
http://oss.tresys.com/projects/refpolicy
|
||||
https://github.com/SELinuxProject/refpolicy
|
||||
|
||||
However, if you want to install a dummy policy for
|
||||
testing, you can do so using ``mdp`` provided under
|
||||
|
|
|
|||
107
Documentation/admin-guide/LSM/SafeSetID.rst
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
=========
|
||||
SafeSetID
|
||||
=========
|
||||
SafeSetID is an LSM module that gates the setid family of syscalls to restrict
|
||||
UID/GID transitions from a given UID/GID to only those approved by a
|
||||
system-wide whitelist. These restrictions also prohibit the given UIDs/GIDs
|
||||
from obtaining auxiliary privileges associated with CAP_SET{U/G}ID, such as
|
||||
allowing a user to set up user namespace UID mappings.
|
||||
|
||||
|
||||
Background
|
||||
==========
|
||||
In absence of file capabilities, processes spawned on a Linux system that need
|
||||
to switch to a different user must be spawned with CAP_SETUID privileges.
|
||||
CAP_SETUID is granted to programs running as root or those running as a non-root
|
||||
user that have been explicitly given the CAP_SETUID runtime capability. It is
|
||||
often preferable to use Linux runtime capabilities rather than file
|
||||
capabilities, since using file capabilities to run a program with elevated
|
||||
privileges opens up possible security holes since any user with access to the
|
||||
file can exec() that program to gain the elevated privileges.
|
||||
|
||||
While it is possible to implement a tree of processes by giving full
|
||||
CAP_SET{U/G}ID capabilities, this is often at odds with the goals of running a
|
||||
tree of processes under non-root user(s) in the first place. Specifically,
|
||||
since CAP_SETUID allows changing to any user on the system, including the root
|
||||
user, it is an overpowered capability for what is needed in this scenario,
|
||||
especially since programs often only call setuid() to drop privileges to a
|
||||
lesser-privileged user -- not elevate privileges. Unfortunately, there is no
|
||||
generally feasible way in Linux to restrict the potential UIDs that a user can
|
||||
switch to through setuid() beyond allowing a switch to any user on the system.
|
||||
This SafeSetID LSM seeks to provide a solution for restricting setid
|
||||
capabilities in such a way.
|
||||
|
||||
The main use case for this LSM is to allow a non-root program to transition to
|
||||
other untrusted uids without full blown CAP_SETUID capabilities. The non-root
|
||||
program would still need CAP_SETUID to do any kind of transition, but the
|
||||
additional restrictions imposed by this LSM would mean it is a "safer" version
|
||||
of CAP_SETUID since the non-root program cannot take advantage of CAP_SETUID to
|
||||
do any unapproved actions (e.g. setuid to uid 0 or create/enter new user
|
||||
namespace). The higher level goal is to allow for uid-based sandboxing of system
|
||||
services without having to give out CAP_SETUID all over the place just so that
|
||||
non-root programs can drop to even-lesser-privileged uids. This is especially
|
||||
relevant when one non-root daemon on the system should be allowed to spawn other
|
||||
processes as different uids, but it's undesirable to give the daemon a
|
||||
basically-root-equivalent CAP_SETUID.
|
||||
|
||||
|
||||
Other Approaches Considered
|
||||
===========================
|
||||
|
||||
Solve this problem in userspace
|
||||
-------------------------------
|
||||
For candidate applications that would like to have restricted setid capabilities
|
||||
as implemented in this LSM, an alternative option would be to simply take away
|
||||
setid capabilities from the application completely and refactor the process
|
||||
spawning semantics in the application (e.g. by using a privileged helper program
|
||||
to do process spawning and UID/GID transitions). Unfortunately, there are a
|
||||
number of semantics around process spawning that would be affected by this, such
|
||||
as fork() calls where the program doesn't immediately call exec() after the
|
||||
fork(), parent processes specifying custom environment variables or command line
|
||||
args for spawned child processes, or inheritance of file handles across a
|
||||
fork()/exec(). Because of this, a solution that uses a privileged helper in
|
||||
userspace would likely be less appealing to incorporate into existing projects
|
||||
that rely on certain process-spawning semantics in Linux.
|
||||
|
||||
Use user namespaces
|
||||
-------------------
|
||||
Another possible approach would be to run a given process tree in its own user
|
||||
namespace and give programs in the tree setid capabilities. In this way,
|
||||
programs in the tree could change to any desired UID/GID in the context of their
|
||||
own user namespace, and only approved UIDs/GIDs could be mapped back to the
|
||||
initial system user namespace, effectively preventing privilege escalation.
|
||||
Unfortunately, it is not generally feasible to use user namespaces in isolation,
|
||||
without pairing them with other namespace types, which is not always an option.
|
||||
Linux checks for capabilities based off of the user namespace that "owns" some
|
||||
entity. For example, Linux has the notion that network namespaces are owned by
|
||||
the user namespace in which they were created. A consequence of this is that
|
||||
capability checks for access to a given network namespace are done by checking
|
||||
whether a task has the given capability in the context of the user namespace
|
||||
that owns the network namespace -- not necessarily the user namespace under
|
||||
which the given task runs. Therefore spawning a process in a new user namespace
|
||||
effectively prevents it from accessing the network namespace owned by the
|
||||
initial namespace. This is a deal-breaker for any application that expects to
|
||||
retain the CAP_NET_ADMIN capability for the purpose of adjusting network
|
||||
configurations. Using user namespaces in isolation causes problems regarding
|
||||
other system interactions, including use of pid namespaces and device creation.
|
||||
|
||||
Use an existing LSM
|
||||
-------------------
|
||||
None of the other in-tree LSMs have the capability to gate setid transitions, or
|
||||
even employ the security_task_fix_setuid hook at all. SELinux says of that hook:
|
||||
"Since setuid only affects the current process, and since the SELinux controls
|
||||
are not based on the Linux identity attributes, SELinux does not need to control
|
||||
this operation."
|
||||
|
||||
|
||||
Directions for use
|
||||
==================
|
||||
This LSM hooks the setid syscalls to make sure transitions are allowed if an
|
||||
applicable restriction policy is in place. Policies are configured through
|
||||
securityfs by writing to the safesetid/add_whitelist_policy and
|
||||
safesetid/flush_whitelist_policies files at the location where securityfs is
|
||||
mounted. The format for adding a policy is '<UID>:<UID>', using literal
|
||||
numbers, such as '123:456'. To flush the policies, any write to the file is
|
||||
sufficient. Again, configuring a policy for a UID will prevent that UID from
|
||||
obtaining auxiliary setid privileges, such as allowing a user to set up user
|
||||
namespace UID mappings.
|
||||
|
|
@ -818,6 +818,10 @@ Smack supports some mount options:
|
|||
specifies a label to which all labels set on the
|
||||
filesystem must have read access. Not yet enforced.
|
||||
|
||||
smackfstransmute=label:
|
||||
behaves exactly like smackfsroot except that it also
|
||||
sets the transmute flag on the root of the mount
|
||||
|
||||
These mount options apply to all file system types.
|
||||
|
||||
Smack auditing
|
||||
|
|
|
|||
|
|
@ -64,8 +64,8 @@ The sysctl settings (writable only with ``CAP_SYS_PTRACE``) are:
|
|||
Using ``PTRACE_TRACEME`` is unchanged.
|
||||
|
||||
2 - admin-only attach:
|
||||
only processes with ``CAP_SYS_PTRACE`` may use ptrace
|
||||
with ``PTRACE_ATTACH``, or through children calling ``PTRACE_TRACEME``.
|
||||
only processes with ``CAP_SYS_PTRACE`` may use ptrace, either with
|
||||
``PTRACE_ATTACH`` or through children calling ``PTRACE_TRACEME``.
|
||||
|
||||
3 - no attach:
|
||||
no processes may use ptrace with ``PTRACE_ATTACH`` nor via
|
||||
|
|
|
|||
|
|
@ -17,9 +17,8 @@ MAC extensions, other extensions can be built using the LSM to provide
|
|||
specific changes to system operation when these tweaks are not available
|
||||
in the core functionality of Linux itself.
|
||||
|
||||
Without a specific LSM built into the kernel, the default LSM will be the
|
||||
Linux capabilities system. Most LSMs choose to extend the capabilities
|
||||
system, building their checks on top of the defined capability hooks.
|
||||
The Linux capabilities modules will always be included. This may be
|
||||
followed by any number of "minor" modules and at most one "major" module.
|
||||
For more details on capabilities, see ``capabilities(7)`` in the Linux
|
||||
man-pages project.
|
||||
|
||||
|
|
@ -30,6 +29,14 @@ order in which checks are made. The capability module will always
|
|||
be first, followed by any "minor" modules (e.g. Yama) and then
|
||||
the one "major" module (e.g. SELinux) if there is one configured.
|
||||
|
||||
Process attributes associated with "major" security modules should
|
||||
be accessed and maintained using the special files in ``/proc/.../attr``.
|
||||
A security module may maintain a module specific subdirectory there,
|
||||
named after the module. ``/proc/.../attr/smack`` is provided by the Smack
|
||||
security module and contains all its special files. The files directly
|
||||
in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
|
||||
subdirectories.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
|
|
@ -39,3 +46,4 @@ the one "major" module (e.g. SELinux) if there is one configured.
|
|||
Smack
|
||||
tomoyo
|
||||
Yama
|
||||
SafeSetID
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
.. _readme:
|
||||
|
||||
Linux kernel release 4.x <http://kernel.org/>
|
||||
Linux kernel release 5.x <http://kernel.org/>
|
||||
=============================================
|
||||
|
||||
These are the release notes for Linux version 4. Read them carefully,
|
||||
These are the release notes for Linux version 5. Read them carefully,
|
||||
as they tell you what this is all about, explain how to install the
|
||||
kernel, and what to do if something goes wrong.
|
||||
|
||||
|
|
@ -51,8 +51,7 @@ Documentation
|
|||
|
||||
- There are various README files in the Documentation/ subdirectory:
|
||||
these typically contain kernel-specific installation notes for some
|
||||
drivers for example. See Documentation/00-INDEX for a list of what
|
||||
is contained in each file. Please read the
|
||||
drivers for example. Please read the
|
||||
:ref:`Documentation/process/changes.rst <changes>` file, as it
|
||||
contains information about the problems, which may result by upgrading
|
||||
your kernel.
|
||||
|
|
@ -64,7 +63,7 @@ Installing the kernel source
|
|||
directory where you have permissions (e.g. your home directory) and
|
||||
unpack it::
|
||||
|
||||
xz -cd linux-4.X.tar.xz | tar xvf -
|
||||
xz -cd linux-5.x.tar.xz | tar xvf -
|
||||
|
||||
Replace "x" with the version number of the latest kernel.
|
||||
|
||||
|
|
@ -73,26 +72,26 @@ Installing the kernel source
|
|||
files. They should match the library, and not get messed up by
|
||||
whatever the kernel-du-jour happens to be.
|
||||
|
||||
- You can also upgrade between 4.x releases by patching. Patches are
|
||||
- You can also upgrade between 5.x releases by patching. Patches are
|
||||
distributed in the xz format. To install by patching, get all the
|
||||
newer patch files, enter the top level directory of the kernel source
|
||||
(linux-4.X) and execute::
|
||||
(linux-5.x) and execute::
|
||||
|
||||
xz -cd ../patch-4.x.xz | patch -p1
|
||||
xz -cd ../patch-5.x.xz | patch -p1
|
||||
|
||||
Replace "x" for all versions bigger than the version "X" of your current
|
||||
Replace "x" for all versions bigger than the version "x" of your current
|
||||
source tree, **in_order**, and you should be ok. You may want to remove
|
||||
the backup files (some-file-name~ or some-file-name.orig), and make sure
|
||||
that there are no failed patches (some-file-name# or some-file-name.rej).
|
||||
If there are, either you or I have made a mistake.
|
||||
|
||||
Unlike patches for the 4.x kernels, patches for the 4.x.y kernels
|
||||
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
|
||||
(also known as the -stable kernels) are not incremental but instead apply
|
||||
directly to the base 4.x kernel. For example, if your base kernel is 4.0
|
||||
and you want to apply the 4.0.3 patch, you must not first apply the 4.0.1
|
||||
and 4.0.2 patches. Similarly, if you are running kernel version 4.0.2 and
|
||||
want to jump to 4.0.3, you must first reverse the 4.0.2 patch (that is,
|
||||
patch -R) **before** applying the 4.0.3 patch. You can read more on this in
|
||||
directly to the base 5.x kernel. For example, if your base kernel is 5.0
|
||||
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
|
||||
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
|
||||
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
|
||||
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
|
||||
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
||||
|
||||
Alternatively, the script patch-kernel can be used to automate this
|
||||
|
|
@ -115,7 +114,7 @@ Installing the kernel source
|
|||
Software requirements
|
||||
---------------------
|
||||
|
||||
Compiling and running the 4.x kernels requires up-to-date
|
||||
Compiling and running the 5.x kernels requires up-to-date
|
||||
versions of various software packages. Consult
|
||||
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
||||
required and how to get updates for these packages. Beware that using
|
||||
|
|
@ -133,12 +132,12 @@ Build directory for the kernel
|
|||
place for the output files (including .config).
|
||||
Example::
|
||||
|
||||
kernel source code: /usr/src/linux-4.X
|
||||
kernel source code: /usr/src/linux-5.x
|
||||
build directory: /home/name/build/kernel
|
||||
|
||||
To configure and build the kernel, use::
|
||||
|
||||
cd /usr/src/linux-4.X
|
||||
cd /usr/src/linux-5.x
|
||||
make O=/home/name/build/kernel menuconfig
|
||||
make O=/home/name/build/kernel
|
||||
sudo make O=/home/name/build/kernel modules_install install
|
||||
|
|
@ -252,7 +251,7 @@ Configuring the kernel
|
|||
Compiling the kernel
|
||||
--------------------
|
||||
|
||||
- Make sure you have at least gcc 3.2 available.
|
||||
- Make sure you have at least gcc 4.6 available.
|
||||
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
|
||||
|
||||
Please note that you can still run a.out user programs with this kernel.
|
||||
|
|
|
|||
|
|
@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
|
|||
5-3-3-2. IO Latency Interface Files
|
||||
5-4. PID
|
||||
5-4-1. PID Interface Files
|
||||
5-5. Device
|
||||
5-6. RDMA
|
||||
5-6-1. RDMA Interface Files
|
||||
5-7. Misc
|
||||
5-7-1. perf_event
|
||||
5-5. Cpuset
|
||||
5-5-1. Cpuset Interface Files
|
||||
5-6. Device
|
||||
5-7. RDMA
|
||||
5-7-1. RDMA Interface Files
|
||||
5-8. Misc
|
||||
5-8-1. perf_event
|
||||
5-N. Non-normative information
|
||||
5-N-1. CPU controller root cgroup process behaviour
|
||||
5-N-2. IO controller root cgroup process behaviour
|
||||
|
|
@ -966,6 +968,12 @@ All time durations are in microseconds.
|
|||
$PERIOD duration. "max" for $MAX indicates no limit. If only
|
||||
one number is written, $MAX is updated.
|
||||
|
||||
cpu.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for CPU. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
Memory
|
||||
------
|
||||
|
|
@ -1127,6 +1135,10 @@ PAGE_SIZE multiple when read back.
|
|||
disk readahead. For now OOM in memory cgroup kills
|
||||
tasks iff shortage has happened inside page fault.
|
||||
|
||||
This event is not raised if the OOM killer is not
|
||||
considered as an option, e.g. for failed high-order
|
||||
allocations.
|
||||
|
||||
oom_kill
|
||||
The number of processes belonging to this cgroup
|
||||
killed by any kind of OOM killer.
|
||||
|
|
@ -1177,6 +1189,10 @@ PAGE_SIZE multiple when read back.
|
|||
Amount of cached filesystem data that was modified and
|
||||
is currently being written back to disk
|
||||
|
||||
anon_thp
|
||||
Amount of memory used in anonymous mappings backed by
|
||||
transparent hugepages
|
||||
|
||||
inactive_anon, active_anon, inactive_file, active_file, unevictable
|
||||
Amount of memory, swap-backed and filesystem-backed,
|
||||
on the internal memory management lists used by the
|
||||
|
|
@ -1236,6 +1252,18 @@ PAGE_SIZE multiple when read back.
|
|||
|
||||
Amount of reclaimed lazyfree pages
|
||||
|
||||
thp_fault_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to satisfy
|
||||
a page fault, including COW faults. This counter is not present
|
||||
when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
|
||||
thp_collapse_alloc
|
||||
|
||||
Number of transparent hugepages which were allocated to allow
|
||||
collapsing an existing range of pages. This counter is not
|
||||
present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
|
||||
|
||||
memory.swap.current
|
||||
A read-only single value file which exists on non-root
|
||||
cgroups.
|
||||
|
|
@ -1271,6 +1299,12 @@ PAGE_SIZE multiple when read back.
|
|||
higher than the limit for an extended period of time. This
|
||||
reduces the impact on the workload and memory management.
|
||||
|
||||
memory.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for memory. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
Usage Guidelines
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
|
@ -1408,6 +1442,12 @@ IO Interface Files
|
|||
|
||||
8:16 rbps=2097152 wbps=max riops=max wiops=max
|
||||
|
||||
io.pressure
|
||||
A read-only nested-key file which exists on non-root cgroups.
|
||||
|
||||
Shows pressure stall information for IO. See
|
||||
Documentation/accounting/psi.txt for details.
|
||||
|
||||
|
||||
Writeback
|
||||
~~~~~~~~~
|
||||
|
|
@ -1479,7 +1519,7 @@ protected workload.
|
|||
|
||||
The limits are only applied at the peer level in the hierarchy. This means that
|
||||
in the diagram below, only groups A, B, and C will influence each other, and
|
||||
groups D and F will influence each other. Group G will influence nobody.
|
||||
groups D and F will influence each other. Group G will influence nobody::
|
||||
|
||||
[root]
|
||||
/ | \
|
||||
|
|
@ -1588,6 +1628,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
|
|||
of a new process would cause a cgroup policy to be violated.
|
||||
|
||||
|
||||
Cpuset
|
||||
------
|
||||
|
||||
The "cpuset" controller provides a mechanism for constraining
|
||||
the CPU and memory node placement of tasks to only the resources
|
||||
specified in the cpuset interface files in a task's current cgroup.
|
||||
This is especially valuable on large NUMA systems where placing jobs
|
||||
on properly sized subsets of the systems with careful processor and
|
||||
memory placement to reduce cross-node memory access and contention
|
||||
can improve overall system performance.
|
||||
|
||||
The "cpuset" controller is hierarchical. That means the controller
|
||||
cannot use CPUs or memory nodes not allowed in its parent.
|
||||
|
||||
|
||||
Cpuset Interface Files
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
cpuset.cpus
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested CPUs to be used by tasks within this
|
||||
cgroup. The actual list of CPUs to be granted, however, is
|
||||
subjected to constraints imposed by its parent and can differ
|
||||
from the requested CPUs.
|
||||
|
||||
The CPU numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
|
||||
# cat cpuset.cpus
|
||||
0-4,6,8-10
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.cpus" or all the available CPUs if none is found.
|
||||
|
||||
The value of "cpuset.cpus" stays constant until the next update
|
||||
and won't be affected by any CPU hotplug events.
|
||||
|
||||
cpuset.cpus.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined CPUs that are actually granted to this
|
||||
cgroup by its parent. These CPUs are allowed to be used by
|
||||
tasks within the current cgroup.
|
||||
|
||||
If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
|
||||
all the CPUs from the parent cgroup that can be available to
|
||||
be used by this cgroup. Otherwise, it should be a subset of
|
||||
"cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
|
||||
can be granted. In this case, it will be treated just like an
|
||||
empty "cpuset.cpus".
|
||||
|
||||
Its value will be affected by CPU hotplug events.
|
||||
|
||||
cpuset.mems
|
||||
A read-write multiple values file which exists on non-root
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the requested memory nodes to be used by tasks within
|
||||
this cgroup. The actual list of memory nodes granted, however,
|
||||
is subjected to constraints imposed by its parent and can differ
|
||||
from the requested memory nodes.
|
||||
|
||||
The memory node numbers are comma-separated numbers or ranges.
|
||||
For example:
|
||||
|
||||
# cat cpuset.mems
|
||||
0-1,3
|
||||
|
||||
An empty value indicates that the cgroup is using the same
|
||||
setting as the nearest cgroup ancestor with a non-empty
|
||||
"cpuset.mems" or all the available memory nodes if none
|
||||
is found.
|
||||
|
||||
The value of "cpuset.mems" stays constant until the next update
|
||||
and won't be affected by any memory nodes hotplug events.
|
||||
|
||||
cpuset.mems.effective
|
||||
A read-only multiple values file which exists on all
|
||||
cpuset-enabled cgroups.
|
||||
|
||||
It lists the onlined memory nodes that are actually granted to
|
||||
this cgroup by its parent. These memory nodes are allowed to
|
||||
be used by tasks within the current cgroup.
|
||||
|
||||
If "cpuset.mems" is empty, it shows all the memory nodes from the
|
||||
parent cgroup that will be available to be used by this cgroup.
|
||||
Otherwise, it should be a subset of "cpuset.mems" unless none of
|
||||
the memory nodes listed in "cpuset.mems" can be granted. In this
|
||||
case, it will be treated just like an empty "cpuset.mems".
|
||||
|
||||
Its value will be affected by memory nodes hotplug events.
|
||||
|
||||
cpuset.cpus.partition
|
||||
A read-write single value file which exists on non-root
|
||||
cpuset-enabled cgroups. This flag is owned by the parent cgroup
|
||||
and is not delegatable.
|
||||
|
||||
It accepts only the following input values when written to.
|
||||
|
||||
"root" - a partition root
|
||||
"member" - a non-root member of a partition
|
||||
|
||||
When set to be a partition root, the current cgroup is the
|
||||
root of a new partition or scheduling domain that comprises
|
||||
itself and all its descendants except those that are separate
|
||||
partition roots themselves and their descendants. The root
|
||||
cgroup is always a partition root.
|
||||
|
||||
There are constraints on where a partition root can be set.
|
||||
It can only be set in a cgroup if all the following conditions
|
||||
are true.
|
||||
|
||||
1) The "cpuset.cpus" is not empty and the list of CPUs are
|
||||
exclusive, i.e. they are not shared by any of its siblings.
|
||||
2) The parent cgroup is a partition root.
|
||||
3) The "cpuset.cpus" is also a proper subset of the parent's
|
||||
"cpuset.cpus.effective".
|
||||
4) There are no child cgroups with cpuset enabled. This is for
|
||||
eliminating corner cases that have to be handled if such a
|
||||
condition is allowed.
|
||||
|
||||
Setting it to partition root will take the CPUs away from the
|
||||
effective CPUs of the parent cgroup. Once it is set, this
|
||||
file cannot be reverted back to "member" if there are any child
|
||||
cgroups with cpuset enabled.
|
||||
|
||||
A parent partition cannot distribute all its CPUs to its
|
||||
child partitions. There must be at least one cpu left in the
|
||||
parent partition.
|
||||
|
||||
Once becoming a partition root, changes to "cpuset.cpus" are
|
||||
generally allowed as long as the first condition above is true,
|
||||
the change will not take away all the CPUs from the parent
|
||||
partition and the new "cpuset.cpus" value is a superset of its
|
||||
children's "cpuset.cpus" values.
|
||||
|
||||
Sometimes, external factors like changes to ancestors'
|
||||
"cpuset.cpus" or cpu hotplug can cause the state of the partition
|
||||
root to change. On read, the "cpuset.cpus.partition" file
|
||||
can show the following values.
|
||||
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"root invalid" Invalid partition root
|
||||
|
||||
It is a partition root if the first 2 partition root conditions
|
||||
above are true and at least one CPU from "cpuset.cpus" is
|
||||
granted by the parent cgroup.
|
||||
|
||||
A partition root can become invalid if none of CPUs requested
|
||||
in "cpuset.cpus" can be granted by the parent cgroup or the
|
||||
parent cgroup is no longer a partition root itself. In this
|
||||
case, it is not a real partition even though the restriction
|
||||
of the first partition root condition above will still apply.
|
||||
The cpu affinity of all the tasks in the cgroup will then be
|
||||
associated with CPUs in the nearest ancestor partition.
|
||||
|
||||
An invalid partition root can be transitioned back to a
|
||||
real partition root if at least one of the requested CPUs
|
||||
can now be granted by its parent. In this case, the cpu
|
||||
affinity of all the tasks in the formerly invalid partition
|
||||
will be associated to the CPUs of the newly formed partition.
|
||||
Changing the partition state of an invalid partition root to
|
||||
"member" is always allowed even if child cpusets are present.
|
||||
|
||||
|
||||
Device controller
|
||||
-----------------
|
||||
|
||||
|
|
@ -1857,8 +2067,10 @@ following two functions.
|
|||
|
||||
wbc_init_bio(@wbc, @bio)
|
||||
Should be called for each bio carrying writeback data and
|
||||
associates the bio with the inode's owner cgroup. Can be
|
||||
called anytime between bio allocation and submission.
|
||||
associates the bio with the inode's owner cgroup and the
|
||||
corresponding request queue. This must be called after
|
||||
a queue (device) has been associated with the bio and
|
||||
before submission.
|
||||
|
||||
wbc_account_io(@wbc, @page, @bytes)
|
||||
Should be called for each data segment being written out.
|
||||
|
|
@ -1877,7 +2089,7 @@ the configuration, the bio may be executed at a lower priority and if
|
|||
the writeback session is holding shared resources, e.g. a journal
|
||||
entry, may lead to priority inversion. There is no one easy solution
|
||||
for the problem. Filesystems can try to work around specific problem
|
||||
cases by skipping wbc_init_bio() or using bio_associate_blkcg()
|
||||
cases by skipping wbc_init_bio() and using bio_associate_blkg()
|
||||
directly.
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
.. _admin_devices:
|
||||
|
||||
Linux allocated devices (4.x+ version)
|
||||
======================================
|
||||
|
|
|
|||
|
|
@ -110,8 +110,8 @@ If your query set is big, you can batch them too::
|
|||
|
||||
~# cat query-batch-file > <debugfs>/dynamic_debug/control
|
||||
|
||||
A another way is to use wildcard. The match rule support ``*`` (matches
|
||||
zero or more characters) and ``?`` (matches exactly one character).For
|
||||
Another way is to use wildcards. The match rule supports ``*`` (matches
|
||||
zero or more characters) and ``?`` (matches exactly one character). For
|
||||
example, you can match all usb drivers::
|
||||
|
||||
~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
|
||||
|
|
@ -258,7 +258,7 @@ this boot parameter for debugging purposes.
|
|||
|
||||
If ``foo`` module is not built-in, ``foo.dyndbg`` will still be processed at
|
||||
boot time, without effect, but will be reprocessed when module is
|
||||
loaded later. ``dyndbg_query=`` and bare ``dyndbg=`` are only processed at
|
||||
loaded later. ``ddebug_query=`` and bare ``dyndbg=`` are only processed at
|
||||
boot.
|
||||
|
||||
|
||||
|
|
@ -301,7 +301,7 @@ The ``dyndbg`` option is a "fake" module parameter, which means:
|
|||
|
||||
For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
|
||||
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
|
||||
the sysfs interface if the debug messages are no longer needed::
|
||||
the debugfs interface if the debug messages are no longer needed::
|
||||
|
||||
echo "module module_name -p" > <debugfs>/dynamic_debug/control
|
||||
|
||||
|
|
|
|||
574
Documentation/admin-guide/ext4.rst
Normal file
|
|
@ -0,0 +1,574 @@
|
|||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========================
|
||||
ext4 General Information
|
||||
========================
|
||||
|
||||
Ext4 is an advanced level of the ext3 filesystem which incorporates
|
||||
scalability and reliability enhancements for supporting large filesystems
|
||||
(64 bit) in keeping with increasing disk capacities and state-of-the-art
|
||||
feature requirements.
|
||||
|
||||
Mailing list: linux-ext4@vger.kernel.org
|
||||
Web site: http://ext4.wiki.kernel.org
|
||||
|
||||
|
||||
Quick usage instructions
|
||||
========================
|
||||
|
||||
Note: More extensive information for getting started with ext4 can be
|
||||
found at the ext4 wiki site at the URL:
|
||||
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
|
||||
|
||||
- The latest version of e2fsprogs can be found at:
|
||||
|
||||
https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
|
||||
|
||||
or
|
||||
|
||||
http://sourceforge.net/project/showfiles.php?group_id=2406
|
||||
|
||||
or grab the latest git repository from:
|
||||
|
||||
https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
|
||||
|
||||
- Create a new filesystem using the ext4 filesystem type:
|
||||
|
||||
# mke2fs -t ext4 /dev/hda1
|
||||
|
||||
Or to configure an existing ext3 filesystem to support extents:
|
||||
|
||||
# tune2fs -O extents /dev/hda1
|
||||
|
||||
If the filesystem was created with 128 byte inodes, it can be
|
||||
converted to use 256 byte for greater efficiency via:
|
||||
|
||||
# tune2fs -I 256 /dev/hda1
|
||||
|
||||
- Mounting:
|
||||
|
||||
# mount -t ext4 /dev/hda1 /wherever
|
||||
|
||||
- When comparing performance with other filesystems, it's always
|
||||
important to try multiple workloads; very often a subtle change in a
|
||||
workload parameter can completely change the ranking of which
|
||||
filesystems do well compared to others. When comparing versus ext3,
|
||||
note that ext4 enables write barriers by default, while ext3 does
|
||||
not enable write barriers by default. So it is useful to
|
||||
explicitly specify whether barriers are enabled or not via the
|
||||
'-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
|
||||
for a fair comparison. When tuning ext3 for best benchmark numbers,
|
||||
it is often worthwhile to try changing the data journaling mode; '-o
|
||||
data=writeback' can be faster for some workloads. (Note however that
|
||||
running mounted with data=writeback can potentially leave stale data
|
||||
exposed in recently written files in case of an unclean shutdown,
|
||||
which could be a security exposure in some situations.) Configuring
|
||||
the filesystem with a large journal can also be helpful for
|
||||
metadata-intensive workloads.
|
||||
|
||||
Features
|
||||
========
|
||||
|
||||
Currently Available
|
||||
-------------------
|
||||
|
||||
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
|
||||
* extent format reduces metadata overhead (RAM, IO for access, transactions)
|
||||
* extent format more robust in face of on-disk corruption due to magics,
|
||||
* internal redundancy in tree
|
||||
* improved file allocation (multi-block alloc)
|
||||
* lift 32000 subdirectory limit imposed by i_links_count[1]
|
||||
* nsec timestamps for mtime, atime, ctime, create time
|
||||
* inode version field on disk (NFSv4, Lustre)
|
||||
* reduced e2fsck time via uninit_bg feature
|
||||
* journal checksumming for robustness, performance
|
||||
* persistent file preallocation (e.g for streaming media, databases)
|
||||
* ability to pack bitmaps and inode tables into larger virtual groups via the
|
||||
flex_bg feature
|
||||
* large file support
|
||||
* inode allocation using large virtual block groups via flex_bg
|
||||
* delayed allocation
|
||||
* large block (up to pagesize) support
|
||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||
the ordering)
|
||||
|
||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||
directory hash tree having a maximum depth of two.
|
||||
|
||||
Options
|
||||
=======
|
||||
|
||||
When mounting an ext4 filesystem, the following options are accepted:
|
||||
(*) == default
|
||||
|
||||
ro
|
||||
Mount filesystem read only. Note that ext4 will replay the journal (and
|
||||
thus write to the partition) even when mounted "read only". The mount
|
||||
options "ro,noload" can be used to prevent writes to the filesystem.
|
||||
|
||||
journal_checksum
|
||||
Enable checksumming of the journal transactions. This will allow the
|
||||
recovery code in e2fsck and the kernel to detect corruption in the
|
||||
kernel. It is a compatible change and will be ignored by older
|
||||
kernels.
|
||||
|
||||
journal_async_commit
|
||||
Commit block can be written to disk without waiting for descriptor
|
||||
blocks. If enabled older kernels cannot mount the device. This will
|
||||
enable 'journal_checksum' internally.
|
||||
|
||||
journal_path=path, journal_dev=devnum
|
||||
When the external journal device's major/minor numbers have changed,
|
||||
these options allow the user to specify the new journal location. The
|
||||
journal device is identified through either its new major/minor numbers
|
||||
encoded in devnum, or via a path to the device.
|
||||
|
||||
norecovery, noload
|
||||
Don't load the journal on mounting. Note that if the filesystem was
|
||||
not unmounted cleanly, skipping the journal replay will lead to the
|
||||
filesystem containing inconsistencies that can lead to any number of
|
||||
problems.
|
||||
|
||||
data=journal
|
||||
All data are committed into the journal prior to being written into the
|
||||
main file system. Enabling this mode will disable delayed allocation
|
||||
and O_DIRECT support.
|
||||
|
||||
data=ordered (*)
|
||||
All data are forced directly out to the main file system prior to its
|
||||
metadata being committed to the journal.
|
||||
|
||||
data=writeback
|
||||
Data ordering is not preserved, data may be written into the main file
|
||||
system after its metadata has been committed to the journal.
|
||||
|
||||
commit=nrsec (*)
|
||||
Ext4 can be told to sync all its data and metadata every 'nrsec'
|
||||
seconds. The default value is 5 seconds. This means that if you lose
|
||||
your power, you will lose as much as the latest 5 seconds of work (your
|
||||
filesystem will not be damaged though, thanks to the journaling). This
|
||||
default value (or any low value) will hurt performance, but it's good
|
||||
for data-safety. Setting it to 0 will have the same effect as leaving
|
||||
it at the default (5 seconds). Setting it to very large values will
|
||||
improve performance.
|
||||
|
||||
barrier=<0|1(*)>, barrier(*), nobarrier
|
||||
This enables/disables the use of write barriers in the jbd code.
|
||||
barrier=0 disables, barrier=1 enables. This also requires an IO stack
|
||||
which can support barriers, and if jbd gets an error on a barrier
|
||||
write, it will disable again with a warning. Write barriers enforce
|
||||
proper on-disk ordering of journal commits, making volatile disk write
|
||||
caches safe to use, at some performance penalty. If your disks are
|
||||
battery-backed in one way or another, disabling barriers may safely
|
||||
improve performance. The mount options "barrier" and "nobarrier" can
|
||||
also be used to enable or disable barriers, for consistency with other
|
||||
ext4 mount options.
|
||||
|
||||
inode_readahead_blks=n
|
||||
This tuning parameter controls the maximum number of inode table blocks
|
||||
that ext4's inode table readahead algorithm will pre-read into the
|
||||
buffer cache. The default value is 32 blocks.
|
||||
|
||||
nouser_xattr
|
||||
Disables Extended User Attributes. See the attr(5) manual page for
|
||||
more information about extended attributes.
|
||||
|
||||
noacl
|
||||
This option disables POSIX Access Control List support. If ACL support
|
||||
is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL
|
||||
is enabled by default on mount. See the acl(5) manual page for more
|
||||
information about acl.
|
||||
|
||||
bsddf (*)
|
||||
Make 'df' act like BSD.
|
||||
|
||||
minixdf
|
||||
Make 'df' act like Minix.
|
||||
|
||||
debug
|
||||
Extra debugging information is sent to syslog.
|
||||
|
||||
abort
|
||||
Simulate the effects of calling ext4_abort() for debugging purposes.
|
||||
This is normally used while remounting a filesystem which is already
|
||||
mounted.
|
||||
|
||||
errors=remount-ro
|
||||
Remount the filesystem read-only on an error.
|
||||
|
||||
errors=continue
|
||||
Keep going on a filesystem error.
|
||||
|
||||
errors=panic
|
||||
Panic and halt the machine if an error occurs. (These mount options
|
||||
override the errors behavior specified in the superblock, which can be
|
||||
configured using tune2fs)
|
||||
|
||||
data_err=ignore(*)
|
||||
Just print an error message if an error occurs in a file data buffer in
|
||||
ordered mode.
|
||||
data_err=abort
|
||||
Abort the journal if an error occurs in a file data buffer in ordered
|
||||
mode.
|
||||
|
||||
grpid | bsdgroups
|
||||
New objects have the group ID of their parent.
|
||||
|
||||
nogrpid (*) | sysvgroups
|
||||
New objects have the group ID of their creator.
|
||||
|
||||
resgid=n
|
||||
The group ID which may use the reserved blocks.
|
||||
|
||||
resuid=n
|
||||
The user ID which may use the reserved blocks.
|
||||
|
||||
sb=
|
||||
Use alternate superblock at this location.
|
||||
|
||||
quota, noquota, grpquota, usrquota
|
||||
These options are ignored by the filesystem. They are used only by
|
||||
quota tools to recognize volumes where quota should be turned on. See
|
||||
documentation in the quota-tools package for more details
|
||||
(http://sourceforge.net/projects/linuxquota).
|
||||
|
||||
jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file>
|
||||
These options tell filesystem details about quota so that quota
|
||||
information can be properly updated during journal replay. They replace
|
||||
the above quota options. See documentation in the quota-tools package
|
||||
for more details (http://sourceforge.net/projects/linuxquota).
|
||||
|
||||
stripe=n
|
||||
Number of filesystem blocks that mballoc will try to use for allocation
|
||||
size and alignment. For RAID5/6 systems this should be the number of
|
||||
data disks * RAID chunk size in file system blocks.
|
||||
|
||||
delalloc (*)
|
||||
Defer block allocation until just before ext4 writes out the block(s)
|
||||
in question. This allows ext4 to make better allocation decisions more
|
||||
efficiently.
|
||||
|
||||
nodelalloc
|
||||
Disable delayed allocation. Blocks are allocated when the data is
|
||||
copied from userspace to the page cache, either via the write(2) system
|
||||
call or when an mmap'ed page which was previously unallocated is
|
||||
written for the first time.
|
||||
|
||||
max_batch_time=usec
|
||||
Maximum amount of time ext4 should wait for additional filesystem
|
||||
operations to be batched together with a synchronous write operation.
|
||||
Since a synchronous write operation is going to force a commit and then
|
||||
a wait for the I/O to complete, it doesn't cost much, and can be a huge
|
||||
throughput win, we wait for a small amount of time to see if any other
|
||||
transactions can piggyback on the synchronous write. The algorithm
|
||||
used is designed to automatically tune for the speed of the disk, by
|
||||
measuring the amount of time (on average) that it takes to finish
|
||||
committing a transaction. Call this time the "commit time". If the
|
||||
time that the transaction has been running is less than the commit
|
||||
time, ext4 will try sleeping for the commit time to see if other
|
||||
operations will join the transaction. The commit time is capped by
|
||||
the max_batch_time, which defaults to 15000us (15ms). This
|
||||
optimization can be turned off entirely by setting max_batch_time to 0.
|
||||
|
||||
min_batch_time=usec
|
||||
This parameter sets the commit time (as described above) to be at least
|
||||
min_batch_time. It defaults to zero microseconds. Increasing this
|
||||
parameter may improve the throughput of multi-threaded, synchronous
|
||||
workloads on very fast disks, at the cost of increasing latency.
|
||||
|
||||
journal_ioprio=prio
|
||||
The I/O priority (from 0 to 7, where 0 is the highest priority) which
|
||||
should be used for I/O operations submitted by kjournald2 during a
|
||||
commit operation. This defaults to 3, which is a slightly higher
|
||||
priority than the default I/O priority.
|
||||
|
||||
auto_da_alloc(*), noauto_da_alloc
|
||||
Many broken applications don't use fsync() when replacing existing
|
||||
files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/
|
||||
rename("foo.new", "foo"), or worse yet, fd = open("foo",
|
||||
O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4
|
||||
will detect the replace-via-rename and replace-via-truncate patterns
|
||||
and force that any delayed allocation blocks are allocated such that at
|
||||
the next journal commit, in the default data=ordered mode, the data
|
||||
blocks of the new file are forced to disk before the rename() operation
|
||||
is committed. This provides roughly the same level of guarantees as
|
||||
ext3, and avoids the "zero-length" problem that can happen when a
|
||||
system crashes before the delayed allocation blocks are forced to disk.
|
||||
|
||||
noinit_itable
|
||||
Do not initialize any uninitialized inode table blocks in the
|
||||
background. This feature may be used by installation CD's so that the
|
||||
install process can complete as quickly as possible; the inode table
|
||||
initialization process would then be deferred until the next time the
|
||||
file system is unmounted.
|
||||
|
||||
init_itable=n
|
||||
The lazy itable init code will wait n times the number of milliseconds
|
||||
it took to zero out the previous block group's inode table. This
|
||||
minimizes the impact on the system performance while the file system's
|
||||
inode table is being initialized.
|
||||
|
||||
discard, nodiscard(*)
|
||||
Controls whether ext4 should issue discard/TRIM commands to the
|
||||
underlying block device when blocks are freed. This is useful for SSD
|
||||
devices and sparse/thinly-provisioned LUNs, but it is off by default
|
||||
until sufficient testing has been done.
|
||||
|
||||
nouid32
|
||||
Disables 32-bit UIDs and GIDs. This is for interoperability with
|
||||
older kernels which only store and expect 16-bit values.
|
||||
|
||||
block_validity(*), noblock_validity
|
||||
These options enable or disable the in-kernel facility for tracking
|
||||
filesystem metadata blocks within internal data structures. This
|
||||
allows the multi-block allocator and other routines to notice bugs or
|
||||
corrupted allocation bitmaps which cause blocks to be allocated which
|
||||
overlap with filesystem metadata blocks.
|
||||
|
||||
dioread_lock, dioread_nolock
|
||||
Controls whether or not ext4 should use the DIO read locking. If the
|
||||
dioread_nolock option is specified ext4 will allocate uninitialized
|
||||
extent before buffer write and convert the extent to initialized after
|
||||
IO completes. This approach allows ext4 code to avoid using inode
|
||||
mutex, which improves scalability on high speed storages. However this
|
||||
does not work with data journaling and dioread_nolock option will be
|
||||
ignored with kernel warning. Note that dioread_nolock code path is only
|
||||
used for extent-based files. Because of the restrictions this option
|
||||
comprises it is off by default (i.e. dioread_lock).
|
||||
|
||||
max_dir_size_kb=n
|
||||
This limits the size of directories so that any attempt to expand them
|
||||
beyond the specified limit in kilobytes will cause an ENOSPC error.
|
||||
This is useful in memory constrained environments, where a very large
|
||||
directory can cause severe performance problems or even provoke the Out
|
||||
Of Memory killer. (For example, if there is only 512mb memory
|
||||
available, a 176mb directory may seriously cramp the system's style.)
|
||||
|
||||
i_version
|
||||
Enable 64-bit inode version support. This option is off by default.
|
||||
|
||||
dax
|
||||
Use direct access (no page cache). See
|
||||
Documentation/filesystems/dax.txt. Note that this option is
|
||||
incompatible with data=journal.
|
||||
|
||||
Data Mode
|
||||
=========
|
||||
There are 3 different data modes:
|
||||
|
||||
* writeback mode
|
||||
|
||||
In data=writeback mode, ext4 does not journal data at all. This mode provides
|
||||
a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
|
||||
mode - metadata journaling. A crash+recovery can cause incorrect data to
|
||||
appear in files which were written shortly before the crash. This mode will
|
||||
typically provide the best ext4 performance.
|
||||
|
||||
* ordered mode
|
||||
|
||||
In data=ordered mode, ext4 only officially journals metadata, but it logically
|
||||
groups metadata information related to data changes with the data blocks into
|
||||
a single unit called a transaction. When it's time to write the new metadata
|
||||
out to disk, the associated data blocks are written first. In general, this
|
||||
mode performs slightly slower than writeback but significantly faster than
|
||||
journal mode.
|
||||
|
||||
* journal mode
|
||||
|
||||
data=journal mode provides full data and metadata journaling. All new data is
|
||||
written to the journal first, and then to its final location. In the event of
|
||||
a crash, the journal can be replayed, bringing both data and metadata into a
|
||||
consistent state. This mode is the slowest except when data needs to be read
|
||||
from and written to disk at the same time where it outperforms all other
|
||||
modes. Enabling this mode will disable delayed allocation and O_DIRECT
|
||||
support.
|
||||
|
||||
/proc entries
|
||||
=============
|
||||
|
||||
Information about mounted ext4 file systems can be found in
|
||||
/proc/fs/ext4. Each mounted filesystem will have a directory in
|
||||
/proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or
|
||||
/proc/fs/ext4/dm-0). The files in each per-device directory are shown
|
||||
in the table below.
|
||||
|
||||
Files in /proc/fs/ext4/<devname>
|
||||
|
||||
mb_groups
|
||||
details of multiblock allocator buddy cache of free blocks
|
||||
|
||||
/sys entries
|
||||
============
|
||||
|
||||
Information about mounted ext4 file systems can be found in
|
||||
/sys/fs/ext4. Each mounted filesystem will have a directory in
|
||||
/sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or
|
||||
/sys/fs/ext4/dm-0). The files in each per-device directory are shown
|
||||
in the table below.
|
||||
|
||||
Files in /sys/fs/ext4/<devname>:
|
||||
|
||||
(see also Documentation/ABI/testing/sysfs-fs-ext4)
|
||||
|
||||
delayed_allocation_blocks
|
||||
This file is read-only and shows the number of blocks that are dirty in
|
||||
the page cache, but which do not have their location in the filesystem
|
||||
allocated yet.
|
||||
|
||||
inode_goal
|
||||
Tuning parameter which (if non-zero) controls the goal inode used by
|
||||
the inode allocator in preference to all other allocation heuristics.
|
||||
This is intended for debugging use only, and should be 0 on production
|
||||
systems.
|
||||
|
||||
inode_readahead_blks
|
||||
Tuning parameter which controls the maximum number of inode table
|
||||
blocks that ext4's inode table readahead algorithm will pre-read into
|
||||
the buffer cache.
|
||||
|
||||
lifetime_write_kbytes
|
||||
This file is read-only and shows the number of kilobytes of data that
|
||||
have been written to this filesystem since it was created.
|
||||
|
||||
max_writeback_mb_bump
|
||||
The maximum number of megabytes the writeback code will try to write
|
||||
out before moving on to another inode.
|
||||
|
||||
mb_group_prealloc
|
||||
The multiblock allocator will round up allocation requests to a
|
||||
multiple of this tuning parameter if the stripe size is not set in the
|
||||
ext4 superblock
|
||||
|
||||
mb_max_to_scan
|
||||
The maximum number of extents the multiblock allocator will search to
|
||||
find the best extent.
|
||||
|
||||
mb_min_to_scan
|
||||
The minimum number of extents the multiblock allocator will search to
|
||||
find the best extent.
|
||||
|
||||
mb_order2_req
|
||||
Tuning parameter which controls the minimum size for requests (as a
|
||||
power of 2) where the buddy cache is used.
|
||||
|
||||
mb_stats
|
||||
Controls whether the multiblock allocator should collect statistics,
|
||||
which are shown during the unmount. 1 means to collect statistics, 0
|
||||
means not to collect statistics.
|
||||
|
||||
mb_stream_req
|
||||
Files which have fewer blocks than this tunable parameter will have
|
||||
their blocks allocated out of a block group specific preallocation
|
||||
pool, so that small files are packed closely together. Each large file
|
||||
will have its blocks allocated out of its own unique preallocation
|
||||
pool.
|
||||
|
||||
session_write_kbytes
|
||||
This file is read-only and shows the number of kilobytes of data that
|
||||
have been written to this filesystem since it was mounted.
|
||||
|
||||
reserved_clusters
|
||||
This is a RW file and contains the number of reserved clusters in the file
|
||||
system which will be used in the specific situations to avoid costly
|
||||
zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or
|
||||
4096 clusters, whichever is smaller and this can be changed however it
|
||||
can never exceed number of clusters in the file system. If there is not
|
||||
enough space for the reserved space when mounting the file system, the mount will
|
||||
_not_ fail.
|
||||
|
||||
Ioctls
|
||||
======
|
||||
|
||||
There is some Ext4 specific functionality which can be accessed by applications
|
||||
through the system call interfaces. The list of all Ext4 specific ioctls is
|
||||
shown in the table below.
|
||||
|
||||
Table of Ext4 specific ioctls
|
||||
|
||||
EXT4_IOC_GETFLAGS
|
||||
Get additional attributes associated with inode. The ioctl argument is
|
||||
an integer bitfield, with bit values described in ext4.h. This ioctl is
|
||||
an alias for FS_IOC_GETFLAGS.
|
||||
|
||||
EXT4_IOC_SETFLAGS
|
||||
Set additional attributes associated with inode. The ioctl argument is
|
||||
an integer bitfield, with bit values described in ext4.h. This ioctl is
|
||||
an alias for FS_IOC_SETFLAGS.
|
||||
|
||||
EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD
|
||||
Get the inode i_generation number stored for each inode. The
|
||||
i_generation number is normally changed only when a new inode is created
|
||||
and it is particularly useful for network filesystems. The '_OLD'
|
||||
version of this ioctl is an alias for FS_IOC_GETVERSION.
|
||||
|
||||
EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD
|
||||
Set the inode i_generation number stored for each inode. The '_OLD'
|
||||
version of this ioctl is an alias for FS_IOC_SETVERSION.
|
||||
|
||||
EXT4_IOC_GROUP_EXTEND
|
||||
This ioctl has the same purpose as the resize mount option. It allows
|
||||
to resize filesystem to the end of the last existing block group,
|
||||
further resize has to be done with resize2fs, either online, or
|
||||
offline. The argument points to the unsigned long number representing
|
||||
the filesystem new block count.
|
||||
|
||||
EXT4_IOC_MOVE_EXT
|
||||
Move the block extents from orig_fd (the one this ioctl is pointing to)
|
||||
to the donor_fd (the one specified in move_extent structure passed as
|
||||
an argument to this ioctl). Then, exchange inode metadata between
|
||||
orig_fd and donor_fd. This is especially useful for online
|
||||
defragmentation, because the allocator has the opportunity to allocate
|
||||
moved blocks better, ideally into one contiguous extent.
|
||||
|
||||
EXT4_IOC_GROUP_ADD
|
||||
Add a new group descriptor to an existing or new group descriptor
|
||||
block. The new group descriptor is described by ext4_new_group_input
|
||||
structure, which is passed as an argument to this ioctl. This is
|
||||
especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which
|
||||
allows online resize of the filesystem to the end of the last existing
|
||||
block group. Those two ioctls combined are used in userspace online
|
||||
resize tool (e.g. resize2fs).
|
||||
|
||||
EXT4_IOC_MIGRATE
|
||||
This ioctl operates on the filesystem itself. It converts (migrates)
|
||||
ext3 indirect block mapped inode to ext4 extent mapped inode by walking
|
||||
through indirect block mapping of the original inode and converting
|
||||
contiguous block ranges into ext4 extents of the temporary inode. Then,
|
||||
inodes are swapped. This ioctl might help, when migrating from ext3 to
|
||||
ext4 filesystem, however suggestion is to create fresh ext4 filesystem
|
||||
and copy data from the backup. Note, that filesystem has to support
|
||||
extents for this ioctl to work.
|
||||
|
||||
EXT4_IOC_ALLOC_DA_BLKS
|
||||
Force all of the delay allocated blocks to be allocated to preserve
|
||||
application-expected ext3 behaviour. Note that this will also start
|
||||
triggering a write of the data blocks, but this behaviour may change in
|
||||
the future as it is not necessary and has been done this way only for
|
||||
sake of simplicity.
|
||||
|
||||
EXT4_IOC_RESIZE_FS
|
||||
Resize the filesystem to a new size. The number of blocks of resized
|
||||
filesystem is passed in via 64 bit integer argument. The kernel
|
||||
allocates bitmaps and inode table, the userspace tool thus just passes
|
||||
the new number of blocks.
|
||||
|
||||
EXT4_IOC_SWAP_BOOT
|
||||
Swap i_blocks and associated attributes (like i_blocks, i_size,
|
||||
i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO
|
||||
(#5). This is typically used to store a boot loader in a secure part of
|
||||
the filesystem, where it can't be changed by a normal user by accident.
|
||||
The data blocks of the previous boot loader will be associated with the
|
||||
given inode.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
kernel source: <file:fs/ext4/>
|
||||
<file:fs/jbd2/>
|
||||
|
||||
programs: http://e2fsprogs.sourceforge.net/
|
||||
|
||||
useful links: http://fedoraproject.org/wiki/ext3-devel
|
||||
http://www.bullopensource.org/ext4/
|
||||
http://ext4.wiki.kernel.org/index.php/Main_Page
|
||||
http://fedoraproject.org/wiki/Features/Ext4
|
||||
|
|
@ -71,10 +71,12 @@ configure specific aspects of kernel behavior to your liking.
|
|||
java
|
||||
ras
|
||||
bcache
|
||||
ext4
|
||||
pm/index
|
||||
thunderbolt
|
||||
LSM/index
|
||||
mm/index
|
||||
perf-security
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
|
|
|
|||
|
|
@ -331,7 +331,7 @@
|
|||
APC and your system crashes randomly.
|
||||
|
||||
apic= [APIC,X86] Advanced Programmable Interrupt Controller
|
||||
Change the output verbosity whilst booting
|
||||
Change the output verbosity while booting
|
||||
Format: { quiet (default) | verbose | debug }
|
||||
Change the amount of debugging information output
|
||||
when initialising the APIC and IO-APIC components.
|
||||
|
|
@ -461,6 +461,11 @@
|
|||
possible to determine what the correct size should be.
|
||||
This option provides an override for these situations.
|
||||
|
||||
carrier_timeout=
|
||||
[NET] Specifies amount of time (in seconds) that
|
||||
the kernel should wait for a network carrier. By default
|
||||
it waits 120 seconds.
|
||||
|
||||
ca_keys= [KEYS] This parameter identifies a specific key(s) on
|
||||
the system trusted keyring to be used for certificate
|
||||
trust validation.
|
||||
|
|
@ -486,10 +491,14 @@
|
|||
cut the overhead, others just disable the usage. So
|
||||
only cgroup_disable=memory is actually worthy}
|
||||
|
||||
cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
|
||||
Format: { controller[,controller...] | "all" }
|
||||
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
|
||||
Format: { { controller | "all" | "named" }
|
||||
[,{ controller | "all" | "named" }...] }
|
||||
Like cgroup_disable, but only applies to cgroup v1;
|
||||
the blacklisted controllers remain available in cgroup2.
|
||||
"all" blacklists all controllers and "named" disables
|
||||
named mounts. Specifying both "all" and "named" disables
|
||||
all v1 hierarchies.
|
||||
|
||||
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
|
||||
Format: <string>
|
||||
|
|
@ -674,6 +683,9 @@
|
|||
cpuidle.off=1 [CPU_IDLE]
|
||||
disable the cpuidle sub-system
|
||||
|
||||
cpuidle.governor=
|
||||
[CPU_IDLE] Name of the cpuidle governor to use.
|
||||
|
||||
cpufreq.off=1 [CPU_FREQ]
|
||||
disable the cpufreq sub-system
|
||||
|
||||
|
|
@ -856,6 +868,12 @@
|
|||
causing system reset or hang due to sending
|
||||
INIT from AP to BSP.
|
||||
|
||||
perf_v4_pmi= [X86,INTEL]
|
||||
Format: <bool>
|
||||
Disable Intel PMU counter freezing feature.
|
||||
The feature only exists starting from
|
||||
Arch Perfmon v4 (Skylake and newer).
|
||||
|
||||
disable_ddw [PPC/PSERIES]
|
||||
Disable Dynamic DMA Window support. Use this
|
||||
to work around buggy firmware.
|
||||
|
|
@ -897,6 +915,10 @@
|
|||
The filter can be disabled or changed to another
|
||||
driver later using sysfs.
|
||||
|
||||
driver_async_probe= [KNL]
|
||||
List of driver names to be probed asynchronously.
|
||||
Format: <driver_name1>,<driver_name2>...
|
||||
|
||||
drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
|
||||
Broken monitors, graphic adapters, KVMs and EDIDless
|
||||
panels may send no or incorrect EDID data sets.
|
||||
|
|
@ -1015,6 +1037,12 @@
|
|||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
rda,<addr>
|
||||
Start an early, polled-mode console on a serial port
|
||||
of an RDA Micro SoC, such as RDA8810PL, at the
|
||||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
smh Use ARM semihosting calls for early console.
|
||||
|
||||
s3c2410,<addr>
|
||||
|
|
@ -1054,16 +1082,22 @@
|
|||
specified address. The serial port must already be
|
||||
setup and configured. Options are not yet supported.
|
||||
|
||||
efifb,[options]
|
||||
Start an early, unaccelerated console on the EFI
|
||||
memory mapped framebuffer (if available). On cache
|
||||
coherent non-x86 systems that use system memory for
|
||||
the framebuffer, pass the 'ram' option so that it is
|
||||
mapped with the correct attributes.
|
||||
|
||||
earlyprintk= [X86,SH,ARM,M68k,S390]
|
||||
earlyprintk=vga
|
||||
earlyprintk=efi
|
||||
earlyprintk=sclp
|
||||
earlyprintk=xen
|
||||
earlyprintk=serial[,ttySn[,baudrate]]
|
||||
earlyprintk=serial[,0x...[,baudrate]]
|
||||
earlyprintk=ttySn[,baudrate]
|
||||
earlyprintk=dbgp[debugController#]
|
||||
earlyprintk=pciserial,bus:device.function[,baudrate]
|
||||
earlyprintk=pciserial[,force],bus:device.function[,baudrate]
|
||||
earlyprintk=xdbc[xhciController#]
|
||||
|
||||
earlyprintk is useful when the kernel crashes before
|
||||
|
|
@ -1095,6 +1129,10 @@
|
|||
|
||||
The sclp output can only be used on s390.
|
||||
|
||||
The optional "force" to "pciserial" enables use of a
|
||||
PCI device even when its classcode is not of the
|
||||
UART class.
|
||||
|
||||
edac_report= [HW,EDAC] Control how to report EDAC event
|
||||
Format: {"on" | "off" | "force"}
|
||||
on: enable EDAC to report H/W event. May be overridden
|
||||
|
|
@ -1159,9 +1197,10 @@
|
|||
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
|
||||
|
||||
elevator= [IOSCHED]
|
||||
Format: {"cfq" | "deadline" | "noop"}
|
||||
See Documentation/block/cfq-iosched.txt and
|
||||
Documentation/block/deadline-iosched.txt for details.
|
||||
Format: { "mq-deadline" | "kyber" | "bfq" }
|
||||
See Documentation/block/deadline-iosched.txt,
|
||||
Documentation/block/kyber-iosched.txt and
|
||||
Documentation/block/bfq-iosched.txt for details.
|
||||
|
||||
elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
|
||||
Specifies physical address of start of kernel core
|
||||
|
|
@ -1385,6 +1424,11 @@
|
|||
hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
|
||||
If specified, z/VM IUCV HVC accepts connections
|
||||
from listed z/VM user IDs only.
|
||||
|
||||
hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations
|
||||
which allow the hypervisor to 'idle' the
|
||||
guest on lock contention.
|
||||
|
||||
keep_bootcon [KNL]
|
||||
Do not unregister boot console at start. This is only
|
||||
useful for debugging when something happens in the window
|
||||
|
|
@ -1668,12 +1712,11 @@
|
|||
By default, super page will be supported if Intel IOMMU
|
||||
has the capability. With this option, super page will
|
||||
not be supported.
|
||||
ecs_off [Default Off]
|
||||
By default, extended context tables will be supported if
|
||||
the hardware advertises that it has support both for the
|
||||
extended tables themselves, and also PASID support. With
|
||||
this option set, extended tables will not be used even
|
||||
on hardware which claims to support them.
|
||||
sm_on [Default Off]
|
||||
By default, scalable mode will be disabled even if the
|
||||
hardware advertises that it has support for the scalable
|
||||
mode translation. With this option set, scalable mode
|
||||
will be used on hardware which claims to support it.
|
||||
tboot_noforce [Default Off]
|
||||
Do not force the Intel IOMMU enabled under tboot.
|
||||
By default, tboot will force Intel IOMMU on, which
|
||||
|
|
@ -1749,12 +1792,24 @@
|
|||
nobypass [PPC/POWERNV]
|
||||
Disable IOMMU bypass, using IOMMU for PCI devices.
|
||||
|
||||
iommu.strict= [ARM64] Configure TLB invalidation behaviour
|
||||
Format: { "0" | "1" }
|
||||
0 - Lazy mode.
|
||||
Request that DMA unmap operations use deferred
|
||||
invalidation of hardware TLBs, for increased
|
||||
throughput at the cost of reduced device isolation.
|
||||
Will fall back to strict mode if not supported by
|
||||
the relevant IOMMU driver.
|
||||
1 - Strict mode (default).
|
||||
DMA unmap operations invalidate IOMMU hardware TLBs
|
||||
synchronously.
|
||||
|
||||
iommu.passthrough=
|
||||
[ARM64] Configure DMA to bypass the IOMMU by default.
|
||||
Format: { "0" | "1" }
|
||||
0 - Use IOMMU translation for DMA.
|
||||
1 - Bypass the IOMMU for DMA.
|
||||
unset - Use IOMMU translation for DMA.
|
||||
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
|
||||
|
||||
io7= [HW] IO7 for Marvel based alpha systems
|
||||
See comment before marvel_specify_io7 in
|
||||
|
|
@ -1791,6 +1846,11 @@
|
|||
to let secondary kernels in charge of setting up
|
||||
LPIs.
|
||||
|
||||
irqchip.gicv3_pseudo_nmi= [ARM64]
|
||||
Enables support for pseudo-NMIs in the kernel. This
|
||||
requires the kernel to be built with
|
||||
CONFIG_ARM64_PSEUDO_NMI.
|
||||
|
||||
irqfixup [HW]
|
||||
When an interrupt is not handled search all handlers
|
||||
for it. Intended to get systems with badly broken
|
||||
|
|
@ -1942,6 +2002,12 @@
|
|||
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
|
||||
the default is off.
|
||||
|
||||
kpti= [ARM64] Control page table isolation of user
|
||||
and kernel address spaces.
|
||||
Default: enabled on cores which need mitigation.
|
||||
0: force disabled
|
||||
1: force enabled
|
||||
|
||||
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
|
||||
Default is 0 (don't ignore, but inject #GP)
|
||||
|
||||
|
|
@ -2069,6 +2135,9 @@
|
|||
off
|
||||
Disables hypervisor mitigations and doesn't
|
||||
emit any warnings.
|
||||
It also drops the swap size and available
|
||||
RAM limit restriction on both hypervisor and
|
||||
bare metal.
|
||||
|
||||
Default is 'flush'.
|
||||
|
||||
|
|
@ -2274,6 +2343,12 @@
|
|||
ltpc= [NET]
|
||||
Format: <io>,<irq>,<dma>
|
||||
|
||||
lsm.debug [SECURITY] Enable LSM initialization debugging output.
|
||||
|
||||
lsm=lsm1,...,lsmN
|
||||
[SECURITY] Choose order of LSM initialization. This
|
||||
overrides CONFIG_LSM, and the "security=" parameter.
|
||||
|
||||
machvec= [IA-64] Force the use of a particular machine-vector
|
||||
(machvec) in a generic kernel.
|
||||
Example: machvec=hpzx1_swiotlb
|
||||
|
|
@ -2404,7 +2479,7 @@
|
|||
seconds. Use this parameter to check at some
|
||||
other rate. 0 disables periodic checking.
|
||||
|
||||
memtest= [KNL,X86,ARM] Enable memtest
|
||||
memtest= [KNL,X86,ARM,PPC] Enable memtest
|
||||
Format: <integer>
|
||||
default : 0 <disable>
|
||||
Specifies the number of memtest passes to be
|
||||
|
|
@ -2798,7 +2873,7 @@
|
|||
check bypass). With this option data leaks are possible
|
||||
in the system.
|
||||
|
||||
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
||||
(indirect branch prediction) vulnerability. System may
|
||||
allow data leaks with this option, which is equivalent
|
||||
to spectre_v2=off.
|
||||
|
|
@ -3053,6 +3128,14 @@
|
|||
timeout < 0: reboot immediately
|
||||
Format: <timeout>
|
||||
|
||||
panic_print= Bitmask for printing system info when panic happens.
|
||||
User can choose a combination of the following bits:
|
||||
bit 0: print all tasks info
|
||||
bit 1: print system memory info
|
||||
bit 2: print timer info
|
||||
bit 3: print locks info if CONFIG_LOCKDEP is on
|
||||
bit 4: print ftrace buffer
|
||||
|
||||
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
||||
on a WARN().
|
||||
|
||||
|
|
@ -3476,6 +3559,10 @@
|
|||
before loading.
|
||||
See Documentation/blockdev/ramdisk.txt.
|
||||
|
||||
psi= [KNL] Enable or disable pressure stall information
|
||||
tracking.
|
||||
Format: <bool>
|
||||
|
||||
psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to
|
||||
probe for; one of (bare|imps|exps|lifebook|any).
|
||||
psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports
|
||||
|
|
@ -3540,14 +3627,14 @@
|
|||
|
||||
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
||||
the specified list of CPUs to be no-callback CPUs.
|
||||
Invocation of these CPUs' RCU callbacks will
|
||||
be offloaded to "rcuox/N" kthreads created for
|
||||
that purpose, where "x" is "b" for RCU-bh, "p"
|
||||
for RCU-preempt, and "s" for RCU-sched, and "N"
|
||||
is the CPU number. This reduces OS jitter on the
|
||||
offloaded CPUs, which can be useful for HPC and
|
||||
real-time workloads. It can also improve energy
|
||||
efficiency for asymmetric multiprocessors.
|
||||
Invocation of these CPUs' RCU callbacks will be
|
||||
offloaded to "rcuox/N" kthreads created for that
|
||||
purpose, where "x" is "p" for RCU-preempt, and
|
||||
"s" for RCU-sched, and "N" is the CPU number.
|
||||
This reduces OS jitter on the offloaded CPUs,
|
||||
which can be useful for HPC and real-time
|
||||
workloads. It can also improve energy efficiency
|
||||
for asymmetric multiprocessors.
|
||||
|
||||
rcu_nocb_poll [KNL]
|
||||
Rather than requiring that offloaded CPUs
|
||||
|
|
@ -3597,12 +3684,6 @@
|
|||
latencies, which will choose a value aligned
|
||||
with the appropriate hardware boundaries.
|
||||
|
||||
rcutree.jiffies_till_sched_qs= [KNL]
|
||||
Set required age in jiffies for a
|
||||
given grace period before RCU starts
|
||||
soliciting quiescent-state help from
|
||||
rcu_note_context_switch().
|
||||
|
||||
rcutree.jiffies_till_first_fqs= [KNL]
|
||||
Set delay from grace-period initialization to
|
||||
first attempt to force quiescent states.
|
||||
|
|
@ -3614,6 +3695,20 @@
|
|||
quiescent states. Units are jiffies, minimum
|
||||
value is one, and maximum value is HZ.
|
||||
|
||||
rcutree.jiffies_till_sched_qs= [KNL]
|
||||
Set required age in jiffies for a
|
||||
given grace period before RCU starts
|
||||
soliciting quiescent-state help from
|
||||
rcu_note_context_switch() and cond_resched().
|
||||
If not specified, the kernel will calculate
|
||||
a value based on the most recent settings
|
||||
of rcutree.jiffies_till_first_fqs
|
||||
and rcutree.jiffies_till_next_fqs.
|
||||
This calculated value may be viewed in
|
||||
rcutree.jiffies_to_sched_qs. Any attempt to set
|
||||
rcutree.jiffies_to_sched_qs will be cheerfully
|
||||
overwritten.
|
||||
|
||||
rcutree.kthread_prio= [KNL,BOOT]
|
||||
Set the SCHED_FIFO priority of the RCU per-CPU
|
||||
kthreads (rcuc/N). This value is also used for
|
||||
|
|
@ -3657,6 +3752,11 @@
|
|||
This wake_up() will be accompanied by a
|
||||
WARN_ONCE() splat and an ftrace_dump().
|
||||
|
||||
rcutree.sysrq_rcu= [KNL]
|
||||
Commandeer a sysrq key to dump out Tree RCU's
|
||||
rcu_node tree with an eye towards determining
|
||||
why a new grace period has not yet started.
|
||||
|
||||
rcuperf.gp_async= [KNL]
|
||||
Measure performance of asynchronous
|
||||
grace-period primitives such as call_rcu().
|
||||
|
|
@ -3708,24 +3808,6 @@
|
|||
in microseconds. The default of zero says
|
||||
no holdoff.
|
||||
|
||||
rcutorture.cbflood_inter_holdoff= [KNL]
|
||||
Set holdoff time (jiffies) between successive
|
||||
callback-flood tests.
|
||||
|
||||
rcutorture.cbflood_intra_holdoff= [KNL]
|
||||
Set holdoff time (jiffies) between successive
|
||||
bursts of callbacks within a given callback-flood
|
||||
test.
|
||||
|
||||
rcutorture.cbflood_n_burst= [KNL]
|
||||
Set the number of bursts making up a given
|
||||
callback-flood test. Set this to zero to
|
||||
disable callback-flood testing.
|
||||
|
||||
rcutorture.cbflood_n_per_burst= [KNL]
|
||||
Set the number of callbacks to be registered
|
||||
in a given burst of a callback-flood test.
|
||||
|
||||
rcutorture.fqs_duration= [KNL]
|
||||
Set duration of force_quiescent_state bursts
|
||||
in microseconds.
|
||||
|
|
@ -3738,6 +3820,23 @@
|
|||
Set wait time between force_quiescent_state bursts
|
||||
in seconds.
|
||||
|
||||
rcutorture.fwd_progress= [KNL]
|
||||
Enable RCU grace-period forward-progress testing
|
||||
for the types of RCU supporting this notion.
|
||||
|
||||
rcutorture.fwd_progress_div= [KNL]
|
||||
Specify the fraction of a CPU-stall-warning
|
||||
period to do tight-loop forward-progress testing.
|
||||
|
||||
rcutorture.fwd_progress_holdoff= [KNL]
|
||||
Number of seconds to wait between successive
|
||||
forward-progress tests.
|
||||
|
||||
rcutorture.fwd_progress_need_resched= [KNL]
|
||||
Enclose cond_resched() calls within checks for
|
||||
need_resched() during tight-loop forward-progress
|
||||
testing.
|
||||
|
||||
rcutorture.gp_cond= [KNL]
|
||||
Use conditional/asynchronous update-side
|
||||
primitives, if available.
|
||||
|
|
@ -3869,12 +3968,6 @@
|
|||
rcupdate.rcu_self_test= [KNL]
|
||||
Run the RCU early boot self tests
|
||||
|
||||
rcupdate.rcu_self_test_bh= [KNL]
|
||||
Run the RCU bh early boot self tests
|
||||
|
||||
rcupdate.rcu_self_test_sched= [KNL]
|
||||
Run the RCU sched early boot self tests
|
||||
|
||||
rdinit= [KNL]
|
||||
Format: <full_path>
|
||||
Run specified binary instead of /init from the ramdisk,
|
||||
|
|
@ -4033,11 +4126,9 @@
|
|||
Note: increases power consumption, thus should only be
|
||||
enabled if running jitter sensitive (HPC/RT) workloads.
|
||||
|
||||
security= [SECURITY] Choose a security module to enable at boot.
|
||||
If this boot parameter is not specified, only the first
|
||||
security module asking for security registration will be
|
||||
loaded. An invalid security module name will be treated
|
||||
as if no module has been chosen.
|
||||
security= [SECURITY] Choose a legacy "major" security module to
|
||||
enable at boot. This has been deprecated by the
|
||||
"lsm=" parameter.
|
||||
|
||||
selinux= [SELINUX] Disable or enable SELinux at boot time.
|
||||
Format: { "0" | "1" }
|
||||
|
|
@ -4165,9 +4256,13 @@
|
|||
|
||||
spectre_v2= [X86] Control mitigation of Spectre variant 2
|
||||
(indirect branch speculation) vulnerability.
|
||||
The default operation protects the kernel from
|
||||
user space attacks.
|
||||
|
||||
on - unconditionally enable
|
||||
off - unconditionally disable
|
||||
on - unconditionally enable, implies
|
||||
spectre_v2_user=on
|
||||
off - unconditionally disable, implies
|
||||
spectre_v2_user=off
|
||||
auto - kernel detects whether your CPU model is
|
||||
vulnerable
|
||||
|
||||
|
|
@ -4177,6 +4272,12 @@
|
|||
CONFIG_RETPOLINE configuration option, and the
|
||||
compiler with which the kernel was built.
|
||||
|
||||
Selecting 'on' will also enable the mitigation
|
||||
against user space to user space task attacks.
|
||||
|
||||
Selecting 'off' will disable both the kernel and
|
||||
the user space protections.
|
||||
|
||||
Specific mitigations can also be selected manually:
|
||||
|
||||
retpoline - replace indirect branches
|
||||
|
|
@ -4186,6 +4287,48 @@
|
|||
Not specifying this option is equivalent to
|
||||
spectre_v2=auto.
|
||||
|
||||
spectre_v2_user=
|
||||
[X86] Control mitigation of Spectre variant 2
|
||||
(indirect branch speculation) vulnerability between
|
||||
user space tasks
|
||||
|
||||
on - Unconditionally enable mitigations. Is
|
||||
enforced by spectre_v2=on
|
||||
|
||||
off - Unconditionally disable mitigations. Is
|
||||
enforced by spectre_v2=off
|
||||
|
||||
prctl - Indirect branch speculation is enabled,
|
||||
but mitigation can be enabled via prctl
|
||||
per thread. The mitigation control state
|
||||
is inherited on fork.
|
||||
|
||||
prctl,ibpb
|
||||
- Like "prctl" above, but only STIBP is
|
||||
controlled per thread. IBPB is issued
|
||||
always when switching between different user
|
||||
space processes.
|
||||
|
||||
seccomp
|
||||
- Same as "prctl" above, but all seccomp
|
||||
threads will enable the mitigation unless
|
||||
they explicitly opt out.
|
||||
|
||||
seccomp,ibpb
|
||||
- Like "seccomp" above, but only STIBP is
|
||||
controlled per thread. IBPB is issued
|
||||
always when switching between different
|
||||
user space processes.
|
||||
|
||||
auto - Kernel selects the mitigation depending on
|
||||
the available CPU features and vulnerability.
|
||||
|
||||
Default mitigation:
|
||||
If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
spectre_v2_user=auto.
|
||||
|
||||
spec_store_bypass_disable=
|
||||
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
|
||||
(Speculative Store Bypass vulnerability)
|
||||
|
|
@ -4589,7 +4732,8 @@
|
|||
usbcore.authorized_default=
|
||||
[USB] Default USB device authorization:
|
||||
(default -1 = authorized except for wireless USB,
|
||||
0 = not authorized, 1 = authorized)
|
||||
0 = not authorized, 1 = authorized, 2 = authorized
|
||||
if device connected to internal port)
|
||||
|
||||
usbcore.autosuspend=
|
||||
[USB] The autosuspend time delay (in seconds) used
|
||||
|
|
@ -4610,7 +4754,8 @@
|
|||
|
||||
usbcore.old_scheme_first=
|
||||
[USB] Start with the old device initialization
|
||||
scheme (default 0 = off).
|
||||
scheme, applies only to low and full-speed devices
|
||||
(default 0 = off).
|
||||
|
||||
usbcore.usbfs_memory_mb=
|
||||
[USB] Memory limit (in MB) for buffers allocated by
|
||||
|
|
@ -4683,6 +4828,8 @@
|
|||
prevent spurious wakeup);
|
||||
n = USB_QUIRK_DELAY_CTRL_MSG (Device needs a
|
||||
pause after every control message);
|
||||
o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
|
||||
delay after resetting its port);
|
||||
Example: quirks=0781:5580:bk,0a5c:5834:gij
|
||||
|
||||
usbhid.mousepoll=
|
||||
|
|
@ -4825,6 +4972,18 @@
|
|||
This is actually a boot loader parameter; the value is
|
||||
passed to the kernel using a special protocol.
|
||||
|
||||
vm_debug[=options] [KNL] Available with CONFIG_DEBUG_VM=y.
|
||||
May slow down system boot speed, especially when
|
||||
enabled on systems with a large amount of memory.
|
||||
All options are enabled by default, and this
|
||||
interface is meant to allow for selectively
|
||||
enabling or disabling specific virtual memory
|
||||
debugging features.
|
||||
|
||||
Available options are:
|
||||
P Enable page structure init time poisoning
|
||||
- Disable all of the above options
|
||||
|
||||
vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact
|
||||
size of <nn>. This can be used to increase the
|
||||
minimum size (128MB on x86). It can also be used to
|
||||
|
|
@ -4919,6 +5078,14 @@
|
|||
or other driver-specific files in the
|
||||
Documentation/watchdog/ directory.
|
||||
|
||||
watchdog_thresh=
|
||||
[KNL]
|
||||
Set the hard lockup detector stall duration
|
||||
threshold in seconds. The soft lockup detector
|
||||
threshold is set to twice the value. A value of 0
|
||||
disables both lockup detectors. Default is 10
|
||||
seconds.
|
||||
|
||||
workqueue.watchdog_thresh=
|
||||
If CONFIG_WQ_WATCHDOG is configured, workqueue can
|
||||
warn stall conditions and dump internal state to
|
||||
|
|
|
|||
|
|
@ -405,6 +405,9 @@ time with the option "l1tf=". The valid arguments for this option are:
|
|||
|
||||
off Disables hypervisor mitigations and doesn't emit any
|
||||
warnings.
|
||||
It also drops the swap size and available RAM limit restrictions
|
||||
on both hypervisor and bare metal.
|
||||
|
||||
============ =============================================================
|
||||
|
||||
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
||||
|
|
@ -553,7 +556,7 @@ When nested virtualization is in use, three operating systems are involved:
|
|||
the bare metal hypervisor, the nested hypervisor and the nested virtual
|
||||
machine. VMENTER operations from the nested hypervisor into the nested
|
||||
guest will always be processed by the bare metal hypervisor. If KVM is the
|
||||
bare metal hypervisor it wiil:
|
||||
bare metal hypervisor it will:
|
||||
|
||||
- Flush the L1D cache on every switch from the nested hypervisor to the
|
||||
nested virtual machine, so that the nested hypervisor's secrets are not
|
||||
|
|
@ -576,7 +579,8 @@ Default mitigations
|
|||
The kernel default mitigations for vulnerable processors are:
|
||||
|
||||
- PTE inversion to protect against malicious user space. This is done
|
||||
unconditionally and cannot be controlled.
|
||||
unconditionally and cannot be controlled. The swap storage is limited
|
||||
to ~16TB.
|
||||
|
||||
- L1D conditional flushing on VMENTER when EPT is enabled for
|
||||
a guest.
|
||||
|
|
|
|||
|
|
@ -756,3 +756,6 @@ These currently include:
|
|||
The cache mode for raid5. raid5 could include an extra disk for
|
||||
caching. The mode can be "write-through" and "write-back". The
|
||||
default is "write-through".
|
||||
|
||||
ppl_write_hint
|
||||
NVMe stream ID to be set for each PPL write request.
|
||||
|
|
|
|||
|
|
@ -4,13 +4,13 @@
|
|||
Concepts overview
|
||||
=================
|
||||
|
||||
The memory management in Linux is complex system that evolved over the
|
||||
years and included more and more functionality to support variety of
|
||||
The memory management in Linux is a complex system that evolved over the
|
||||
years and included more and more functionality to support a variety of
|
||||
systems from MMU-less microcontrollers to supercomputers. The memory
|
||||
management for systems without MMU is called ``nommu`` and it
|
||||
management for systems without an MMU is called ``nommu`` and it
|
||||
definitely deserves a dedicated document, which hopefully will be
|
||||
eventually written. Yet, although some of the concepts are the same,
|
||||
here we assume that MMU is available and CPU can translate a virtual
|
||||
here we assume that an MMU is available and a CPU can translate a virtual
|
||||
address to a physical address.
|
||||
|
||||
.. contents:: :local:
|
||||
|
|
@ -21,10 +21,10 @@ Virtual Memory Primer
|
|||
The physical memory in a computer system is a limited resource and
|
||||
even for systems that support memory hotplug there is a hard limit on
|
||||
the amount of memory that can be installed. The physical memory is not
|
||||
necessary contiguous, it might be accessible as a set of distinct
|
||||
necessarily contiguous; it might be accessible as a set of distinct
|
||||
address ranges. Besides, different CPU architectures, and even
|
||||
different implementations of the same architecture have different view
|
||||
how these address ranges defined.
|
||||
different implementations of the same architecture have different views
|
||||
of how these address ranges are defined.
|
||||
|
||||
All this makes dealing directly with physical memory quite complex and
|
||||
to avoid this complexity a concept of virtual memory was developed.
|
||||
|
|
@ -48,8 +48,8 @@ appropriate kernel configuration option.
|
|||
|
||||
Each physical memory page can be mapped as one or more virtual
|
||||
pages. These mappings are described by page tables that allow
|
||||
translation from virtual address used by programs to real address in
|
||||
the physical memory. The page tables organized hierarchically.
|
||||
translation from a virtual address used by programs to the physical
|
||||
memory address. The page tables are organized hierarchically.
|
||||
|
||||
The tables at the lowest level of the hierarchy contain physical
|
||||
addresses of actual pages used by the software. The tables at higher
|
||||
|
|
@ -121,8 +121,8 @@ Nodes
|
|||
Many multi-processor machines are NUMA - Non-Uniform Memory Access -
|
||||
systems. In such systems the memory is arranged into banks that have
|
||||
different access latency depending on the "distance" from the
|
||||
processor. Each bank is referred as `node` and for each node Linux
|
||||
constructs an independent memory management subsystem. A node has it's
|
||||
processor. Each bank is referred to as a `node` and for each node Linux
|
||||
constructs an independent memory management subsystem. A node has its
|
||||
own set of zones, lists of free and used pages and various statistics
|
||||
counters. You can find more details about NUMA in
|
||||
:ref:`Documentation/vm/numa.rst <numa>` and in
|
||||
|
|
@ -149,9 +149,9 @@ for program's stack and heap or by explicit calls to mmap(2) system
|
|||
call. Usually, the anonymous mappings only define virtual memory areas
|
||||
that the program is allowed to access. The read accesses will result
|
||||
in creation of a page table entry that references a special physical
|
||||
page filled with zeroes. When the program performs a write, regular
|
||||
page filled with zeroes. When the program performs a write, a regular
|
||||
physical page will be allocated to hold the written data. The page
|
||||
will be marked dirty and if the kernel will decide to repurpose it,
|
||||
will be marked dirty and if the kernel decides to repurpose it,
|
||||
the dirty page will be swapped out.
|
||||
|
||||
Reclaim
|
||||
|
|
@ -181,8 +181,8 @@ pressure.
|
|||
The process of freeing the reclaimable physical memory pages and
|
||||
repurposing them is called (surprise!) `reclaim`. Linux can reclaim
|
||||
pages either asynchronously or synchronously, depending on the state
|
||||
of the system. When system is not loaded, most of the memory is free
|
||||
and allocation request will be satisfied immediately from the free
|
||||
of the system. When the system is not loaded, most of the memory is free
|
||||
and allocation requests will be satisfied immediately from the free
|
||||
pages supply. As the load increases, the amount of the free pages goes
|
||||
down and when it reaches a certain threshold (high watermark), an
|
||||
allocation request will awaken the ``kswapd`` daemon. It will
|
||||
|
|
@ -190,7 +190,7 @@ asynchronously scan memory pages and either just free them if the data
|
|||
they contain is available elsewhere, or evict to the backing storage
|
||||
device (remember those dirty pages?). As memory usage increases even
|
||||
more and reaches another threshold - min watermark - an allocation
|
||||
will trigger the `direct reclaim`. In this case allocation is stalled
|
||||
will trigger `direct reclaim`. In this case allocation is stalled
|
||||
until enough memory pages are reclaimed to satisfy the request.
|
||||
|
||||
Compaction
|
||||
|
|
@ -200,7 +200,7 @@ As the system runs, tasks allocate and free the memory and it becomes
|
|||
fragmented. Although with virtual memory it is possible to present
|
||||
scattered physical pages as virtually contiguous range, sometimes it is
|
||||
necessary to allocate large physically contiguous memory areas. Such
|
||||
need may arise, for instance, when a device driver requires large
|
||||
need may arise, for instance, when a device driver requires a large
|
||||
buffer for DMA, or when THP allocates a huge page. Memory `compaction`
|
||||
addresses the fragmentation issue. This mechanism moves occupied pages
|
||||
from the lower part of a memory zone to free pages in the upper part
|
||||
|
|
@ -208,15 +208,16 @@ of the zone. When a compaction scan is finished free pages are grouped
|
|||
together at the beginning of the zone and allocations of large
|
||||
physically contiguous areas become possible.
|
||||
|
||||
Like reclaim, the compaction may happen asynchronously in ``kcompactd``
|
||||
daemon or synchronously as a result of memory allocation request.
|
||||
Like reclaim, the compaction may happen asynchronously in the ``kcompactd``
|
||||
daemon or synchronously as a result of a memory allocation request.
|
||||
|
||||
OOM killer
|
||||
==========
|
||||
|
||||
It may happen, that on a loaded machine memory will be exhausted. When
|
||||
the kernel detects that the system runs out of memory (OOM) it invokes
|
||||
`OOM killer`. Its mission is simple: all it has to do is to select a
|
||||
task to sacrifice for the sake of the overall system health. The
|
||||
selected task is killed in a hope that after it exits enough memory
|
||||
will be freed to continue normal operation.
|
||||
It is possible that on a loaded machine memory will be exhausted and the
|
||||
kernel will be unable to reclaim enough memory to continue to operate. In
|
||||
order to save the rest of the system, it invokes the `OOM killer`.
|
||||
|
||||
The `OOM killer` selects a task to sacrifice for the sake of the overall
|
||||
system health. The selected task is killed in a hope that after it exits
|
||||
enough memory will be freed to continue normal operation.
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ the Linux memory management.
|
|||
hugetlbpage
|
||||
idle_page_tracking
|
||||
ksm
|
||||
memory-hotplug
|
||||
numa_memory_policy
|
||||
pagemap
|
||||
soft-dirty
|
||||
|
|
|
|||
444
Documentation/admin-guide/mm/memory-hotplug.rst
Normal file
|
|
@ -0,0 +1,444 @@
|
|||
.. _admin_guide_memory_hotplug:
|
||||
|
||||
==============
|
||||
Memory Hotplug
|
||||
==============
|
||||
|
||||
:Created: Jul 28 2007
|
||||
:Updated: Add some details about locking internals: Aug 20 2018
|
||||
|
||||
This document is about memory hotplug including how-to-use and current status.
|
||||
Because Memory Hotplug is still under development, contents of this text will
|
||||
be changed often.
|
||||
|
||||
.. contents:: :local:
|
||||
|
||||
.. note::
|
||||
|
||||
(1) x86_64 has a special implementation for memory hotplug.
|
||||
This text does not describe it.
|
||||
(2) This text assumes that sysfs is mounted at ``/sys``.
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
Purpose of memory hotplug
|
||||
-------------------------
|
||||
|
||||
Memory Hotplug allows users to increase/decrease the amount of memory.
|
||||
Generally, there are two purposes.
|
||||
|
||||
(A) For changing the amount of memory.
|
||||
This is to allow a feature like capacity on demand.
|
||||
(B) For installing/removing DIMMs or NUMA-nodes physically.
|
||||
This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
|
||||
|
||||
(A) is required by highly virtualized environments and (B) is required by
|
||||
hardware which supports memory power management.
|
||||
|
||||
Linux memory hotplug is designed for both purposes.
|
||||
|
||||
Phases of memory hotplug
|
||||
------------------------
|
||||
|
||||
There are 2 phases in Memory Hotplug:
|
||||
|
||||
1) Physical Memory Hotplug phase
|
||||
2) Logical Memory Hotplug phase.
|
||||
|
||||
The first phase is to communicate with hardware/firmware and make/erase
|
||||
environment for hotplugged memory. Basically, this phase is necessary
|
||||
for the purpose (B), but this is a good phase for communication between
|
||||
highly virtualized environments too.
|
||||
|
||||
When memory is hotplugged, the kernel recognizes new memory, makes new memory
|
||||
management tables, and makes sysfs files for new memory's operation.
|
||||
|
||||
If firmware supports notification of connection of new memory to OS,
|
||||
this phase is triggered automatically. ACPI can notify this event. If not,
|
||||
"probe" operation by system administration is used instead.
|
||||
(see :ref:`memory_hotplug_physical_mem`).
|
||||
|
||||
Logical Memory Hotplug phase is to change memory state into
|
||||
available/unavailable for users. Amount of memory from user's view is
|
||||
changed by this phase. The kernel makes all memory in it as free pages
|
||||
when a memory range is available.
|
||||
|
||||
In this document, this phase is described as online/offline.
|
||||
|
||||
Logical Memory Hotplug phase is triggered by write of sysfs file by system
|
||||
administrator. For the hot-add case, it must be executed after Physical Hotplug
|
||||
phase by hand.
|
||||
(However, if you write udev's hotplug scripts for memory hotplug, these
|
||||
phases can be executed in a seamless way.)
|
||||
|
||||
Unit of Memory online/offline operation
|
||||
---------------------------------------
|
||||
|
||||
Memory hotplug uses SPARSEMEM memory model which allows memory to be divided
|
||||
into chunks of the same size. These chunks are called "sections". The size of
|
||||
a memory section is architecture dependent. For example, power uses 16MiB, ia64
|
||||
uses 1GiB.
|
||||
|
||||
Memory sections are combined into chunks referred to as "memory blocks". The
|
||||
size of a memory block is architecture dependent and represents the logical
|
||||
unit upon which memory online/offline operations are to be performed. The
|
||||
default size of a memory block is the same as memory section size unless an
|
||||
architecture specifies otherwise. (see :ref:`memory_hotplug_sysfs_files`.)
|
||||
|
||||
To determine the size (in bytes) of a memory block please read this file::
|
||||
|
||||
/sys/devices/system/memory/block_size_bytes
|
||||
|
||||
Kernel Configuration
|
||||
====================
|
||||
|
||||
To use memory hotplug feature, kernel must be compiled with following
|
||||
config options.
|
||||
|
||||
- For all memory hotplug:
|
||||
- Memory model -> Sparse Memory (``CONFIG_SPARSEMEM``)
|
||||
- Allow for memory hot-add (``CONFIG_MEMORY_HOTPLUG``)
|
||||
|
||||
- To enable memory removal, the following are also necessary:
|
||||
- Allow for memory hot remove (``CONFIG_MEMORY_HOTREMOVE``)
|
||||
- Page Migration (``CONFIG_MIGRATION``)
|
||||
|
||||
- For ACPI memory hotplug, the following are also necessary:
|
||||
- Memory hotplug (under ACPI Support menu) (``CONFIG_ACPI_HOTPLUG_MEMORY``)
|
||||
- This option can be kernel module.
|
||||
|
||||
- As a related configuration, if your box has a feature of NUMA-node hotplug
|
||||
via ACPI, then this option is necessary too.
|
||||
|
||||
- ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
|
||||
(``CONFIG_ACPI_CONTAINER``).
|
||||
|
||||
This option can be kernel module too.
|
||||
|
||||
|
||||
.. _memory_hotplug_sysfs_files:
|
||||
|
||||
sysfs files for memory hotplug
|
||||
==============================
|
||||
|
||||
All memory blocks have their device information in sysfs. Each memory block
|
||||
is described under ``/sys/devices/system/memory`` as::
|
||||
|
||||
/sys/devices/system/memory/memoryXXX
|
||||
|
||||
where XXX is the memory block id.
|
||||
|
||||
For the memory block covered by the sysfs directory, it is expected that all
|
||||
memory sections in this range are present and no memory holes exist in the
|
||||
range. Currently there is no way to determine if there is a memory hole, but
|
||||
the existence of one should not affect the hotplug capabilities of the memory
|
||||
block.
|
||||
|
||||
For example, assume 1GiB memory block size. A device for a memory starting at
|
||||
0x100000000 is ``/sys/devices/system/memory/memory4``::
|
||||
|
||||
(0x100000000 / 1GiB = 4)
|
||||
|
||||
This device covers address range [0x100000000 ... 0x140000000)
|
||||
|
||||
Under each memory block, you can see 5 files:
|
||||
|
||||
- ``/sys/devices/system/memory/memoryXXX/phys_index``
|
||||
- ``/sys/devices/system/memory/memoryXXX/phys_device``
|
||||
- ``/sys/devices/system/memory/memoryXXX/state``
|
||||
- ``/sys/devices/system/memory/memoryXXX/removable``
|
||||
- ``/sys/devices/system/memory/memoryXXX/valid_zones``
|
||||
|
||||
=================== ============================================================
|
||||
``phys_index`` read-only and contains memory block id, same as XXX.
|
||||
``state`` read-write
|
||||
|
||||
- at read: contains online/offline state of memory.
|
||||
- at write: user can specify "online_kernel",
|
||||
|
||||
"online_movable", "online", "offline" command
|
||||
which will be performed on all sections in the block.
|
||||
``phys_device`` read-only: designed to show the name of physical memory
|
||||
device. This is not well implemented now.
|
||||
``removable`` read-only: contains an integer value indicating
|
||||
whether the memory block is removable or not
|
||||
removable. A value of 1 indicates that the memory
|
||||
block is removable and a value of 0 indicates that
|
||||
it is not removable. A memory block is removable only if
|
||||
every section in the block is removable.
|
||||
``valid_zones`` read-only: designed to show which zones this memory block
|
||||
can be onlined to.
|
||||
|
||||
The first column shows its default zone.
|
||||
|
||||
"memory6/valid_zones: Normal Movable" shows this memoryblock
|
||||
can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
|
||||
by online_movable.
|
||||
|
||||
"memory7/valid_zones: Movable Normal" shows this memoryblock
|
||||
can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
|
||||
by online_kernel.
|
||||
=================== ============================================================
|
||||
|
||||
.. note::
|
||||
|
||||
These directories/files appear after physical memory hotplug phase.
|
||||
|
||||
If CONFIG_NUMA is enabled the memoryXXX/ directories can also be accessed
|
||||
via symbolic links located in the ``/sys/devices/system/node/node*`` directories.
|
||||
|
||||
For example::
|
||||
|
||||
/sys/devices/system/node/node0/memory9 -> ../../memory/memory9
|
||||
|
||||
A backlink will also be created::
|
||||
|
||||
/sys/devices/system/memory/memory9/node0 -> ../../node/node0
|
||||
|
||||
.. _memory_hotplug_physical_mem:
|
||||
|
||||
Physical memory hot-add phase
|
||||
=============================
|
||||
|
||||
Hardware(Firmware) Support
|
||||
--------------------------
|
||||
|
||||
On x86_64/ia64 platform, memory hotplug by ACPI is supported.
|
||||
|
||||
In general, the firmware (ACPI) which supports memory hotplug defines
|
||||
memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80,
|
||||
Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev
|
||||
script. This will be done automatically.
|
||||
|
||||
But scripts for memory hotplug are not contained in generic udev package(now).
|
||||
You may have to write it by yourself or online/offline memory by hand.
|
||||
Please see :ref:`memory_hotplug_how_to_online_memory` and
|
||||
:ref:`memory_hotplug_how_to_offline_memory`.
|
||||
|
||||
If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
|
||||
"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
|
||||
calls hotplug code for all of objects which are defined in it.
|
||||
If memory device is found, memory hotplug code will be called.
|
||||
|
||||
Notify memory hot-add event by hand
|
||||
-----------------------------------
|
||||
|
||||
On some architectures, the firmware may not notify the kernel of a memory
|
||||
hotplug event. Therefore, the memory "probe" interface is supported to
|
||||
explicitly notify the kernel. This interface depends on
|
||||
CONFIG_ARCH_MEMORY_PROBE and can be configured on powerpc, sh, and x86
|
||||
if hotplug is supported, although for x86 this should be handled by ACPI
|
||||
notification.
|
||||
|
||||
Probe interface is located at::
|
||||
|
||||
/sys/devices/system/memory/probe
|
||||
|
||||
You can tell the physical address of new memory to the kernel by::
|
||||
|
||||
% echo start_address_of_new_memory > /sys/devices/system/memory/probe
|
||||
|
||||
Then, [start_address_of_new_memory, start_address_of_new_memory +
|
||||
memory_block_size] memory range is hot-added. In this case, hotplug script is
|
||||
not called (in current implementation). You'll have to online memory by
|
||||
yourself. Please see :ref:`memory_hotplug_how_to_online_memory`.
|
||||
|
||||
Logical Memory hot-add phase
|
||||
============================
|
||||
|
||||
State of memory
|
||||
---------------
|
||||
|
||||
To see (online/offline) state of a memory block, read 'state' file::
|
||||
|
||||
% cat /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
|
||||
- If the memory block is online, you'll read "online".
|
||||
- If the memory block is offline, you'll read "offline".
|
||||
|
||||
|
||||
.. _memory_hotplug_how_to_online_memory:
|
||||
|
||||
How to online memory
|
||||
--------------------
|
||||
|
||||
When the memory is hot-added, the kernel decides whether or not to "online"
|
||||
it according to the policy which can be read from "auto_online_blocks" file::
|
||||
|
||||
% cat /sys/devices/system/memory/auto_online_blocks
|
||||
|
||||
The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
|
||||
option. If it is disabled the default is "offline" which means the newly added
|
||||
memory is not in a ready-to-use state and you have to "online" the newly added
|
||||
memory blocks manually. Automatic onlining can be requested by writing "online"
|
||||
to "auto_online_blocks" file::
|
||||
|
||||
% echo online > /sys/devices/system/memory/auto_online_blocks
|
||||
|
||||
This sets a global policy and impacts all memory blocks that will subsequently
|
||||
be hotplugged. Currently offline blocks keep their state. It is possible, under
|
||||
certain circumstances, that some memory blocks will be added but will fail to
|
||||
online. User space tools can check their "state" files
|
||||
(``/sys/devices/system/memory/memoryXXX/state``) and try to online them manually.
|
||||
|
||||
If the automatic onlining wasn't requested, failed, or some memory block was
|
||||
offlined it is possible to change the individual block's state by writing to the
|
||||
"state" file::
|
||||
|
||||
% echo online > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
This onlining will not change the ZONE type of the target memory block.
|
||||
If the memory block doesn't belong to any zone an appropriate kernel zone
|
||||
(usually ZONE_NORMAL) will be used unless movable_node kernel command line
|
||||
option is specified when ZONE_MOVABLE will be used.
|
||||
|
||||
You can explicitly request to associate it with ZONE_MOVABLE by::
|
||||
|
||||
% echo online_movable > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
.. note:: current limit: this memory block must be adjacent to ZONE_MOVABLE
|
||||
|
||||
Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by::
|
||||
|
||||
% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
.. note:: current limit: this memory block must be adjacent to ZONE_NORMAL
|
||||
|
||||
An explicit zone onlining can fail (e.g. when the range is already within
|
||||
an existing and incompatible zone).
|
||||
|
||||
After this, memory block XXX's state will be 'online' and the amount of
|
||||
available memory will be increased.
|
||||
|
||||
This may be changed in future.
|
||||
|
||||
Logical memory remove
|
||||
=====================
|
||||
|
||||
Memory offline and ZONE_MOVABLE
|
||||
-------------------------------
|
||||
|
||||
Memory offlining is more complicated than memory online. Because memory offline
|
||||
has to make the whole memory block be unused, memory offline can fail if
|
||||
the memory block includes memory which cannot be freed.
|
||||
|
||||
In general, memory offline can use 2 techniques.
|
||||
|
||||
(1) reclaim and free all memory in the memory block.
|
||||
(2) migrate all pages in the memory block.
|
||||
|
||||
In the current implementation, Linux's memory offline uses method (2), freeing
|
||||
all pages in the memory block by page migration. But not all pages are
|
||||
migratable. Under current Linux, migratable pages are anonymous pages and
|
||||
page caches. For offlining a memory block by migration, the kernel has to
|
||||
guarantee that the memory block contains only migratable pages.
|
||||
|
||||
Now, a boot option for making a memory block which consists of migratable pages
|
||||
is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
|
||||
create ZONE_MOVABLE...a zone which is just used for movable pages.
|
||||
(See also Documentation/admin-guide/kernel-parameters.rst)
|
||||
|
||||
Assume the system has "TOTAL" amount of memory at boot time, this boot option
|
||||
creates ZONE_MOVABLE as follows.
|
||||
|
||||
1) When kernelcore=YYYY boot option is used,
|
||||
Size of memory not for movable pages (not for offline) is YYYY.
|
||||
Size of memory for movable pages (for offline) is TOTAL-YYYY.
|
||||
|
||||
2) When movablecore=ZZZZ boot option is used,
|
||||
Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
|
||||
Size of memory for movable pages (for offline) is ZZZZ.
|
||||
|
||||
.. note::
|
||||
|
||||
Unfortunately, there is no information to show which memory block belongs
|
||||
to ZONE_MOVABLE. This is TBD.
|
||||
|
||||
.. _memory_hotplug_how_to_offline_memory:
|
||||
|
||||
How to offline memory
|
||||
---------------------
|
||||
|
||||
You can offline a memory block by using the same sysfs interface that was used
|
||||
in memory onlining::
|
||||
|
||||
% echo offline > /sys/devices/system/memory/memoryXXX/state
|
||||
|
||||
If offline succeeds, the state of the memory block is changed to be "offline".
|
||||
If it fails, some error code (like -EBUSY) will be returned by the kernel.
|
||||
Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline
|
||||
it. If it doesn't contain 'unmovable' memory, you'll get success.
|
||||
|
||||
A memory block under ZONE_MOVABLE is considered to be able to be offlined
|
||||
easily. But under some busy state, it may return -EBUSY. Even if a memory
|
||||
block cannot be offlined due to -EBUSY, you can retry offlining it and may be
|
||||
able to offline it (or not). (For example, a page is referred to by some kernel
|
||||
internal call and released soon.)
|
||||
|
||||
Consideration:
|
||||
Memory hotplug's design direction is to make the possibility of memory
|
||||
offlining higher and to guarantee unplugging memory under any situation. But
|
||||
it needs more work. Returning -EBUSY under some situation may be good because
|
||||
the user can decide to retry more or not by himself. Currently, memory
|
||||
offlining code does some amount of retry with 120 seconds timeout.
|
||||
|
||||
Physical memory remove
|
||||
======================
|
||||
|
||||
Need more implementation yet....
|
||||
- Notification completion of remove works by OS to firmware.
|
||||
- Guard from remove if not yet.
|
||||
|
||||
|
||||
Locking Internals
|
||||
=================
|
||||
|
||||
When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
|
||||
the device_hotplug_lock should be held to:
|
||||
|
||||
- synchronize against online/offline requests (e.g. via sysfs). This way, memory
|
||||
block devices can only be accessed (.online/.state attributes) by user
|
||||
space once memory has been fully added. And when removing memory, we
|
||||
know nobody is in critical sections.
|
||||
- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC)
|
||||
|
||||
Especially, there is a possible lock inversion that is avoided using
|
||||
device_hotplug_lock when adding memory and user space tries to online that
|
||||
memory faster than expected:
|
||||
|
||||
- device_online() will first take the device_lock(), followed by
|
||||
mem_hotplug_lock
|
||||
- add_memory_resource() will first take the mem_hotplug_lock, followed by
|
||||
the device_lock() (while creating the devices, during bus_add_device()).
|
||||
|
||||
As the device is visible to user space before taking the device_lock(), this
|
||||
can result in a lock inversion.
|
||||
|
||||
onlining/offlining of memory should be done via device_online()/
|
||||
device_offline() - to make sure it is properly synchronized to actions
|
||||
via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
|
||||
|
||||
When adding/removing/onlining/offlining memory or adding/removing
|
||||
heterogeneous/device memory, we should always hold the mem_hotplug_lock in
|
||||
write mode to serialise memory hotplug (e.g. access to global/zone
|
||||
variables).
|
||||
|
||||
In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
|
||||
mode allows for a quite efficient get_online_mems/put_online_mems
|
||||
implementation, so code accessing memory can protect from that memory
|
||||
vanishing.
|
||||
|
||||
|
||||
Future Work
|
||||
===========
|
||||
|
||||
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
|
||||
sysctl or new control file.
|
||||
- showing memory block and physical device relationship.
|
||||
- test and make it better memory offlining.
|
||||
- support HugeTLB page migration and offlining.
|
||||
- memmap removing at memory offline.
|
||||
- physical remove memory.
|
||||
|
|
@ -75,9 +75,10 @@ number of times a page is mapped.
|
|||
20. NOPAGE
|
||||
21. KSM
|
||||
22. THP
|
||||
23. BALLOON
|
||||
23. OFFLINE
|
||||
24. ZERO_PAGE
|
||||
25. IDLE
|
||||
26. PGTABLE
|
||||
|
||||
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
|
||||
memory cgroup each page is charged to, indexed by PFN. Only available when
|
||||
|
|
@ -118,8 +119,8 @@ Short descriptions to the page flags
|
|||
identical memory pages dynamically shared between one or more processes
|
||||
22 - THP
|
||||
contiguous pages which construct transparent hugepages
|
||||
23 - BALLOON
|
||||
balloon compaction page
|
||||
23 - OFFLINE
|
||||
page is logically offline
|
||||
24 - ZERO_PAGE
|
||||
zero page for pfn_zero or huge_zero page
|
||||
25 - IDLE
|
||||
|
|
@ -128,6 +129,8 @@ Short descriptions to the page flags
|
|||
Note that this flag may be stale in case the page was accessed via
|
||||
a PTE. To make sure the flag is up-to-date one has to read
|
||||
``/sys/kernel/mm/page_idle/bitmap`` first.
|
||||
26 - PGTABLE
|
||||
page is in use as a page table
|
||||
|
||||
IO related page flags
|
||||
---------------------
|
||||
|
|
|
|||
230
Documentation/admin-guide/perf-security.rst
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
.. _perf_security:
|
||||
|
||||
Perf Events and tool security
|
||||
=============================
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
Usage of Performance Counters for Linux (perf_events) [1]_ , [2]_ , [3]_
|
||||
can impose a considerable risk of leaking sensitive data accessed by
|
||||
monitored processes. The data leakage is possible both in scenarios of
|
||||
direct usage of perf_events system call API [2]_ and over data files
|
||||
generated by Perf tool user mode utility (Perf) [3]_ , [4]_ . The risk
|
||||
depends on the nature of data that perf_events performance monitoring
|
||||
units (PMU) [2]_ and Perf collect and expose for performance analysis.
|
||||
Collected system and performance data may be split into several
|
||||
categories:
|
||||
|
||||
1. System hardware and software configuration data, for example: a CPU
|
||||
model and its cache configuration, an amount of available memory and
|
||||
its topology, used kernel and Perf versions, performance monitoring
|
||||
setup including experiment time, events configuration, Perf command
|
||||
line parameters, etc.
|
||||
|
||||
2. User and kernel module paths and their load addresses with sizes,
|
||||
process and thread names with their PIDs and TIDs, timestamps for
|
||||
captured hardware and software events.
|
||||
|
||||
3. Content of kernel software counters (e.g., for context switches, page
|
||||
faults, CPU migrations), architectural hardware performance counters
|
||||
(PMC) [8]_ and machine specific registers (MSR) [9]_ that provide
|
||||
execution metrics for various monitored parts of the system (e.g.,
|
||||
memory controller (IMC), interconnect (QPI/UPI) or peripheral (PCIe)
|
||||
uncore counters) without direct attribution to any execution context
|
||||
state.
|
||||
|
||||
4. Content of architectural execution context registers (e.g., RIP, RSP,
|
||||
RBP on x86_64), process user and kernel space memory addresses and
|
||||
data, content of various architectural MSRs that capture data from
|
||||
this category.
|
||||
|
||||
Data that belong to the fourth category can potentially contain
|
||||
sensitive process data. If PMUs in some monitoring modes capture values
|
||||
of execution context registers or data from process memory then access
|
||||
to such monitoring capabilities needs to be ordered and secured
|
||||
properly. So, perf_events/Perf performance monitoring is the subject for
|
||||
security access control management [5]_ .
|
||||
|
||||
perf_events/Perf access control
|
||||
-------------------------------
|
||||
|
||||
To perform security checks, the Linux implementation splits processes
|
||||
into two categories [6]_ : a) privileged processes (whose effective user
|
||||
ID is 0, referred to as superuser or root), and b) unprivileged
|
||||
processes (whose effective UID is nonzero). Privileged processes bypass
|
||||
all kernel security permission checks so perf_events performance
|
||||
monitoring is fully available to privileged processes without access,
|
||||
scope and resource restrictions.
|
||||
|
||||
Unprivileged processes are subject to a full security permission check
|
||||
based on the process's credentials [5]_ (usually: effective UID,
|
||||
effective GID, and supplementary group list).
|
||||
|
||||
Linux divides the privileges traditionally associated with superuser
|
||||
into distinct units, known as capabilities [6]_ , which can be
|
||||
independently enabled and disabled on per-thread basis for processes and
|
||||
files of unprivileged users.
|
||||
|
||||
Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
|
||||
as privileged processes with respect to perf_events performance
|
||||
monitoring and bypass *scope* permissions checks in the kernel.
|
||||
|
||||
Unprivileged processes using perf_events system call API are also subject
|
||||
to PTRACE_MODE_READ_REALCREDS ptrace access mode check [7]_ , whose
|
||||
outcome determines whether monitoring is permitted. So unprivileged
|
||||
processes provided with CAP_SYS_PTRACE capability are effectively
|
||||
permitted to pass the check.
|
||||
|
||||
Other capabilities being granted to unprivileged processes can
|
||||
effectively enable capturing of additional data required for later
|
||||
performance analysis of monitored processes or a system. For example,
|
||||
CAP_SYSLOG capability permits reading kernel space memory addresses from
|
||||
/proc/kallsyms file.
|
||||
|
||||
perf_events/Perf privileged users
|
||||
---------------------------------
|
||||
|
||||
Mechanisms of capabilities, privileged capability-dumb files [6]_ and
|
||||
file system ACLs [10]_ can be used to create a dedicated group of
|
||||
perf_events/Perf privileged users who are permitted to execute
|
||||
performance monitoring without scope limits. The following steps can be
|
||||
taken to create such a group of privileged Perf users.
|
||||
|
||||
1. Create perf_users group of privileged Perf users, assign perf_users
|
||||
group to Perf tool executable and limit access to the executable for
|
||||
other users in the system who are not in the perf_users group:
|
||||
|
||||
::
|
||||
|
||||
# groupadd perf_users
|
||||
# ls -alhF
|
||||
-rwxr-xr-x 2 root root 11M Oct 19 15:12 perf
|
||||
# chgrp perf_users perf
|
||||
# ls -alhF
|
||||
-rwxr-xr-x 2 root perf_users 11M Oct 19 15:12 perf
|
||||
# chmod o-rwx perf
|
||||
# ls -alhF
|
||||
-rwxr-x--- 2 root perf_users 11M Oct 19 15:12 perf
|
||||
|
||||
2. Assign the required capabilities to the Perf tool executable file and
|
||||
enable members of perf_users group with performance monitoring
|
||||
privileges [6]_ :
|
||||
|
||||
::
|
||||
|
||||
# setcap "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
|
||||
# setcap -v "cap_sys_admin,cap_sys_ptrace,cap_syslog=ep" perf
|
||||
perf: OK
|
||||
# getcap perf
|
||||
perf = cap_sys_ptrace,cap_sys_admin,cap_syslog+ep
|
||||
|
||||
As a result, members of perf_users group are capable of conducting
|
||||
performance monitoring by using functionality of the configured Perf
|
||||
tool executable that, when executed, passes perf_events subsystem scope
|
||||
checks.
|
||||
|
||||
This specific access control management is only available to superuser
|
||||
or root running processes with CAP_SETPCAP, CAP_SETFCAP [6]_
|
||||
capabilities.
|
||||
|
||||
perf_events/Perf unprivileged users
|
||||
-----------------------------------
|
||||
|
||||
perf_events/Perf *scope* and *access* control for unprivileged processes
|
||||
is governed by perf_event_paranoid [2]_ setting:
|
||||
|
||||
-1:
|
||||
Impose no *scope* and *access* restrictions on using perf_events
|
||||
performance monitoring. Per-user per-cpu perf_event_mlock_kb [2]_
|
||||
locking limit is ignored when allocating memory buffers for storing
|
||||
performance data. This is the least secure mode since allowed
|
||||
monitored *scope* is maximized and no perf_events specific limits
|
||||
are imposed on *resources* allocated for performance monitoring.
|
||||
|
||||
>=0:
|
||||
*scope* includes per-process and system wide performance monitoring
|
||||
but excludes raw tracepoints and ftrace function tracepoints
|
||||
monitoring. CPU and system events that happen when executing either in
|
||||
user or in kernel space can be monitored and captured for later
|
||||
analysis. Per-user per-cpu perf_event_mlock_kb locking limit is
|
||||
imposed but ignored for unprivileged processes with CAP_IPC_LOCK
|
||||
[6]_ capability.
|
||||
|
||||
>=1:
|
||||
*scope* includes per-process performance monitoring only and
|
||||
excludes system wide performance monitoring. CPU and system events
|
||||
that happen when executing either in user or in kernel space can be
|
||||
monitored and captured for later analysis. Per-user per-cpu
|
||||
perf_event_mlock_kb locking limit is imposed but ignored for
|
||||
unprivileged processes with CAP_IPC_LOCK capability.
|
||||
|
||||
>=2:
|
||||
*scope* includes per-process performance monitoring only. CPU and
|
||||
system events that happen when executing in user space only can be
|
||||
monitored and captured for later analysis. Per-user per-cpu
|
||||
perf_event_mlock_kb locking limit is imposed but ignored for
|
||||
unprivileged processes with CAP_IPC_LOCK capability.
|
||||
|
||||
perf_events/Perf resource control
|
||||
---------------------------------
|
||||
|
||||
Open file descriptors
|
||||
+++++++++++++++++++++
|
||||
|
||||
The perf_events system call API [2]_ allocates file descriptors for
|
||||
every configured PMU event. Open file descriptors are a per-process
|
||||
accountable resource governed by the RLIMIT_NOFILE [11]_ limit
|
||||
(ulimit -n), which is usually derived from the login shell process. When
|
||||
configuring Perf collection for a long list of events on a large server
|
||||
system, this limit can be easily hit preventing required monitoring
|
||||
configuration. RLIMIT_NOFILE limit can be increased on a per-user basis by
|
||||
modifying content of the limits.conf file [12]_ . Ordinarily, a Perf
|
||||
sampling session (perf record) requires an amount of open perf_event
|
||||
file descriptors that is not less than the number of monitored events
|
||||
multiplied by the number of monitored CPUs.
|
||||
|
||||
Memory allocation
|
||||
+++++++++++++++++
|
||||
|
||||
The amount of memory available to user processes for capturing
|
||||
performance monitoring data is governed by the perf_event_mlock_kb [2]_
|
||||
setting. This perf_event specific resource setting defines overall
|
||||
per-cpu limits of memory allowed for mapping by the user processes to
|
||||
execute performance monitoring. The setting essentially extends the
|
||||
RLIMIT_MEMLOCK [11]_ limit, but only for memory regions mapped
|
||||
specifically for capturing monitored performance events and related data.
|
||||
|
||||
For example, if a machine has eight cores and perf_event_mlock_kb limit
|
||||
is set to 516 KiB, then a user process is provided with 516 KiB * 8 =
|
||||
4128 KiB of memory above the RLIMIT_MEMLOCK limit (ulimit -l) for
|
||||
perf_event mmap buffers. In particular, this means that, if the user
|
||||
wants to start two or more performance monitoring processes, the user is
|
||||
required to manually distribute the available 4128 KiB between the
|
||||
monitoring processes, for example, using the --mmap-pages Perf record
|
||||
mode option. Otherwise, the first started performance monitoring process
|
||||
allocates all available 4128 KiB and the other processes will fail to
|
||||
proceed due to the lack of memory.
|
||||
|
||||
RLIMIT_MEMLOCK and perf_event_mlock_kb resource constraints are ignored
|
||||
for processes with the CAP_IPC_LOCK capability. Thus, perf_events/Perf
|
||||
privileged users can be provided with memory above the constraints for
|
||||
perf_events/Perf performance monitoring purpose by providing the Perf
|
||||
executable with CAP_IPC_LOCK capability.
|
||||
|
||||
Bibliography
|
||||
------------
|
||||
|
||||
.. [1] `<https://lwn.net/Articles/337493/>`_
|
||||
.. [2] `<http://man7.org/linux/man-pages/man2/perf_event_open.2.html>`_
|
||||
.. [3] `<http://web.eece.maine.edu/~vweaver/projects/perf_events/>`_
|
||||
.. [4] `<https://perf.wiki.kernel.org/index.php/Main_Page>`_
|
||||
.. [5] `<https://www.kernel.org/doc/html/latest/security/credentials.html>`_
|
||||
.. [6] `<http://man7.org/linux/man-pages/man7/capabilities.7.html>`_
|
||||
.. [7] `<http://man7.org/linux/man-pages/man2/ptrace.2.html>`_
|
||||
.. [8] `<https://en.wikipedia.org/wiki/Hardware_performance_counter>`_
|
||||
.. [9] `<https://en.wikipedia.org/wiki/Model-specific_register>`_
|
||||
.. [10] `<http://man7.org/linux/man-pages/man5/acl.5.html>`_
|
||||
.. [11] `<http://man7.org/linux/man-pages/man2/getrlimit.2.html>`_
|
||||
.. [12] `<http://man7.org/linux/man-pages/man5/limits.conf.5.html>`_
|
||||
|
||||
|
|
@ -150,7 +150,7 @@ data structures necessary to handle the given policy and, possibly, to add
|
|||
a governor ``sysfs`` interface to it. Next, the governor is started by
|
||||
invoking its ``->start()`` callback.
|
||||
|
||||
That callback it expected to register per-CPU utilization update callbacks for
|
||||
That callback is expected to register per-CPU utilization update callbacks for
|
||||
all of the online CPUs belonging to the given policy with the CPU scheduler.
|
||||
The utilization update callbacks will be invoked by the CPU scheduler on
|
||||
important events, like task enqueue and dequeue, on every iteration of the
|
||||
|
|
|
|||
719
Documentation/admin-guide/pm/cpuidle.rst
Normal file
|
|
@ -0,0 +1,719 @@
|
|||
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
||||
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
||||
|
||||
========================
|
||||
CPU Idle Time Management
|
||||
========================
|
||||
|
||||
::
|
||||
|
||||
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||
|
||||
Concepts
|
||||
========
|
||||
|
||||
Modern processors are generally able to enter states in which the execution of
|
||||
a program is suspended and instructions belonging to it are not fetched from
|
||||
memory or executed. Those states are the *idle* states of the processor.
|
||||
|
||||
Since part of the processor hardware is not used in idle states, entering them
|
||||
generally allows power drawn by the processor to be reduced and, in consequence,
|
||||
it is an opportunity to save energy.
|
||||
|
||||
CPU idle time management is an energy-efficiency feature concerned with using
|
||||
the idle states of processors for this purpose.
|
||||
|
||||
Logical CPUs
|
||||
------------
|
||||
|
||||
CPU idle time management operates on CPUs as seen by the *CPU scheduler* (that
|
||||
is the part of the kernel responsible for the distribution of computational
|
||||
work in the system). In its view, CPUs are *logical* units. That is, they need
|
||||
not be separate physical entities and may just be interfaces appearing to
|
||||
software as individual single-core processors. In other words, a CPU is an
|
||||
entity which appears to be fetching instructions that belong to one sequence
|
||||
(program) from memory and executing them, but it need not work this way
|
||||
physically. Generally, three different cases can be considered here.
|
||||
|
||||
First, if the whole processor can only follow one sequence of instructions (one
|
||||
program) at a time, it is a CPU. In that case, if the hardware is asked to
|
||||
enter an idle state, that applies to the processor as a whole.
|
||||
|
||||
Second, if the processor is multi-core, each core in it is able to follow at
|
||||
least one program at a time. The cores need not be entirely independent of each
|
||||
other (for example, they may share caches), but still most of the time they
|
||||
work physically in parallel with each other, so if each of them executes only
|
||||
one program, those programs run mostly independently of each other at the same
|
||||
time. The entire cores are CPUs in that case and if the hardware is asked to
|
||||
enter an idle state, that applies to the core that asked for it in the first
|
||||
place, but it also may apply to a larger unit (say a "package" or a "cluster")
|
||||
that the core belongs to (in fact, it may apply to an entire hierarchy of larger
|
||||
units containing the core). Namely, if all of the cores in the larger unit
|
||||
except for one have been put into idle states at the "core level" and the
|
||||
remaining core asks the processor to enter an idle state, that may trigger it
|
||||
to put the whole larger unit into an idle state which also will affect the
|
||||
other cores in that unit.
|
||||
|
||||
Finally, each core in a multi-core processor may be able to follow more than one
|
||||
program in the same time frame (that is, each core may be able to fetch
|
||||
instructions from multiple locations in memory and execute them in the same time
|
||||
frame, but not necessarily entirely in parallel with each other). In that case
|
||||
the cores present themselves to software as "bundles" each consisting of
|
||||
multiple individual single-core "processors", referred to as *hardware threads*
|
||||
(or hyper-threads specifically on Intel hardware), that each can follow one
|
||||
sequence of instructions. Then, the hardware threads are CPUs from the CPU idle
|
||||
time management perspective and if the processor is asked to enter an idle state
|
||||
by one of them, the hardware thread (or CPU) that asked for it is stopped, but
|
||||
nothing more happens, unless all of the other hardware threads within the same
|
||||
core also have asked the processor to enter an idle state. In that situation,
|
||||
the core may be put into an idle state individually or a larger unit containing
|
||||
it may be put into an idle state as a whole (if the other cores within the
|
||||
larger unit are in idle states already).
|
||||
|
||||
Idle CPUs
|
||||
---------
|
||||
|
||||
Logical CPUs, simply referred to as "CPUs" in what follows, are regarded as
|
||||
*idle* by the Linux kernel when there are no tasks to run on them except for the
|
||||
special "idle" task.
|
||||
|
||||
Tasks are the CPU scheduler's representation of work. Each task consists of a
|
||||
sequence of instructions to execute, or code, data to be manipulated while
|
||||
running that code, and some context information that needs to be loaded into the
|
||||
processor every time the task's code is run by a CPU. The CPU scheduler
|
||||
distributes work by assigning tasks to run to the CPUs present in the system.
|
||||
|
||||
Tasks can be in various states. In particular, they are *runnable* if there are
|
||||
no specific conditions preventing their code from being run by a CPU as long as
|
||||
there is a CPU available for that (for example, they are not waiting for any
|
||||
events to occur or similar). When a task becomes runnable, the CPU scheduler
|
||||
assigns it to one of the available CPUs to run and if there are no more runnable
|
||||
tasks assigned to it, the CPU will load the given task's context and run its
|
||||
code (from the instruction following the last one executed so far, possibly by
|
||||
another CPU). [If there are multiple runnable tasks assigned to one CPU
|
||||
simultaneously, they will be subject to prioritization and time sharing in order
|
||||
to allow them to make some progress over time.]
|
||||
|
||||
The special "idle" task becomes runnable if there are no other runnable tasks
|
||||
assigned to the given CPU and the CPU is then regarded as idle. In other words,
|
||||
in Linux idle CPUs run the code of the "idle" task called *the idle loop*. That
|
||||
code may cause the processor to be put into one of its idle states, if they are
|
||||
supported, in order to save energy, but if the processor does not support any
|
||||
idle states, or there is not enough time to spend in an idle state before the
|
||||
next wakeup event, or there are strict latency constraints preventing any of the
|
||||
available idle states from being used, the CPU will simply execute more or less
|
||||
useless instructions in a loop until it is assigned a new task to run.
|
||||
|
||||
|
||||
.. _idle-loop:
|
||||
|
||||
The Idle Loop
|
||||
=============
|
||||
|
||||
The idle loop code takes two major steps in every iteration of it. First, it
|
||||
calls into a code module referred to as the *governor* that belongs to the CPU
|
||||
idle time management subsystem called ``CPUIdle`` to select an idle state for
|
||||
the CPU to ask the hardware to enter. Second, it invokes another code module
|
||||
from the ``CPUIdle`` subsystem, called the *driver*, to actually ask the
|
||||
processor hardware to enter the idle state selected by the governor.
|
||||
|
||||
The role of the governor is to find an idle state most suitable for the
|
||||
conditions at hand. For this purpose, idle states that the hardware can be
|
||||
asked to enter by logical CPUs are represented in an abstract way independent of
|
||||
the platform or the processor architecture and organized in a one-dimensional
|
||||
(linear) array. That array has to be prepared and supplied by the ``CPUIdle``
|
||||
driver matching the platform the kernel is running on at the initialization
|
||||
time. This allows ``CPUIdle`` governors to be independent of the underlying
|
||||
hardware and to work with any platforms that the Linux kernel can run on.
|
||||
|
||||
Each idle state present in that array is characterized by two parameters to be
|
||||
taken into account by the governor, the *target residency* and the (worst-case)
|
||||
*exit latency*. The target residency is the minimum time the hardware must
|
||||
spend in the given state, including the time needed to enter it (which may be
|
||||
substantial), in order to save more energy than it would save by entering one of
|
||||
the shallower idle states instead. [The "depth" of an idle state roughly
|
||||
corresponds to the power drawn by the processor in that state.] The exit
|
||||
latency, in turn, is the maximum time it will take a CPU asking the processor
|
||||
hardware to enter an idle state to start executing the first instruction after a
|
||||
wakeup from that state. Note that in general the exit latency also must cover
|
||||
the time needed to enter the given state in case the wakeup occurs when the
|
||||
hardware is entering it and it must be entered completely to be exited in an
|
||||
ordered manner.
|
||||
|
||||
There are two types of information that can influence the governor's decisions.
|
||||
First of all, the governor knows the time until the closest timer event. That
|
||||
time is known exactly, because the kernel programs timers and it knows exactly
|
||||
when they will trigger, and it is the maximum time the hardware that the given
|
||||
CPU depends on can spend in an idle state, including the time necessary to enter
|
||||
and exit it. However, the CPU may be woken up by a non-timer event at any time
|
||||
(in particular, before the closest timer triggers) and it generally is not known
|
||||
when that may happen. The governor can only see how much time the CPU actually
|
||||
was idle after it has been woken up (that time will be referred to as the *idle
|
||||
duration* from now on) and it can use that information somehow along with the
|
||||
time until the closest timer to estimate the idle duration in the future. How the
|
||||
governor uses that information depends on what algorithm is implemented by it
|
||||
and that is the primary reason for having more than one governor in the
|
||||
``CPUIdle`` subsystem.
|
||||
|
||||
There are three ``CPUIdle`` governors available, ``menu``, `TEO <teo-gov_>`_
|
||||
and ``ladder``. Which of them is used by default depends on the configuration
|
||||
of the kernel and in particular on whether or not the scheduler tick can be
|
||||
`stopped by the idle loop <idle-cpus-and-tick_>`_. It is possible to change the
|
||||
governor at run time if the ``cpuidle_sysfs_switch`` command line parameter has
|
||||
been passed to the kernel, but that is not safe in general, so it should not be
|
||||
done on production systems (that may change in the future, though). The name of
|
||||
the ``CPUIdle`` governor currently used by the kernel can be read from the
|
||||
:file:`current_governor_ro` (or :file:`current_governor` if
|
||||
``cpuidle_sysfs_switch`` is present in the kernel command line) file under
|
||||
:file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
|
||||
|
||||
Which ``CPUIdle`` driver is used, on the other hand, usually depends on the
|
||||
platform the kernel is running on, but there are platforms with more than one
|
||||
matching driver. For example, there are two drivers that can work with the
|
||||
majority of Intel platforms, ``intel_idle`` and ``acpi_idle``, one with
|
||||
hardcoded idle states information and the other able to read that information
|
||||
from the system's ACPI tables, respectively. Still, even in those cases, the
|
||||
driver chosen at the system initialization time cannot be replaced later, so the
|
||||
decision on which one of them to use has to be made early (on Intel platforms
|
||||
the ``acpi_idle`` driver will be used if ``intel_idle`` is disabled for some
|
||||
reason or if it does not recognize the processor). The name of the ``CPUIdle``
|
||||
driver currently used by the kernel can be read from the :file:`current_driver`
|
||||
file under :file:`/sys/devices/system/cpu/cpuidle/` in ``sysfs``.
|
||||
|
||||
|
||||
.. _idle-cpus-and-tick:
|
||||
|
||||
Idle CPUs and The Scheduler Tick
|
||||
================================
|
||||
|
||||
The scheduler tick is a timer that triggers periodically in order to implement
|
||||
the time sharing strategy of the CPU scheduler. Of course, if there are
|
||||
multiple runnable tasks assigned to one CPU at the same time, the only way to
|
||||
allow them to make reasonable progress in a given time frame is to make them
|
||||
share the available CPU time. Namely, in rough approximation, each task is
|
||||
given a slice of the CPU time to run its code, subject to the scheduling class,
|
||||
prioritization and so on and when that time slice is used up, the CPU should be
|
||||
switched over to running (the code of) another task. The currently running task
|
||||
may not want to give the CPU away voluntarily, however, and the scheduler tick
|
||||
is there to make the switch happen regardless. That is not the only role of the
|
||||
tick, but it is the primary reason for using it.
|
||||
|
||||
The scheduler tick is problematic from the CPU idle time management perspective,
|
||||
because it triggers periodically and relatively often (depending on the kernel
|
||||
configuration, the length of the tick period is between 1 ms and 10 ms).
|
||||
Thus, if the tick is allowed to trigger on idle CPUs, it will not make sense
|
||||
for them to ask the hardware to enter idle states with target residencies above
|
||||
the tick period length. Moreover, in that case the idle duration of any CPU
|
||||
will never exceed the tick period length and the energy used for entering and
|
||||
exiting idle states due to the tick wakeups on idle CPUs will be wasted.
|
||||
|
||||
Fortunately, it is not really necessary to allow the tick to trigger on idle
|
||||
CPUs, because (by definition) they have no tasks to run except for the special
|
||||
"idle" one. In other words, from the CPU scheduler perspective, the only user
|
||||
of the CPU time on them is the idle loop. Since the time of an idle CPU need
|
||||
not be shared between multiple runnable tasks, the primary reason for using the
|
||||
tick goes away if the given CPU is idle. Consequently, it is possible to stop
|
||||
the scheduler tick entirely on idle CPUs in principle, even though that may not
|
||||
always be worth the effort.
|
||||
|
||||
Whether or not it makes sense to stop the scheduler tick in the idle loop
|
||||
depends on what is expected by the governor. First, if there is another
|
||||
(non-tick) timer due to trigger within the tick range, stopping the tick clearly
|
||||
would be a waste of time, even though the timer hardware may not need to be
|
||||
reprogrammed in that case. Second, if the governor is expecting a non-timer
|
||||
wakeup within the tick range, stopping the tick is not necessary and it may even
|
||||
be harmful. Namely, in that case the governor will select an idle state with
|
||||
the target residency within the time until the expected wakeup, so that state is
|
||||
going to be relatively shallow. The governor really cannot select a deep idle
|
||||
state then, as that would contradict its own expectation of a wakeup in short
|
||||
order. Now, if the wakeup really occurs shortly, stopping the tick would be a
|
||||
waste of time and in this case the timer hardware would need to be reprogrammed,
|
||||
which is expensive. On the other hand, if the tick is stopped and the wakeup
|
||||
does not occur any time soon, the hardware may spend an indefinite amount of time
|
||||
in the shallow idle state selected by the governor, which will be a waste of
|
||||
energy. Hence, if the governor is expecting a wakeup of any kind within the
|
||||
tick range, it is better to allow the tick to trigger. Otherwise, however, the
|
||||
governor will select a relatively deep idle state, so the tick should be stopped
|
||||
so that it does not wake up the CPU too early.
|
||||
|
||||
In any case, the governor knows what it is expecting and the decision on whether
|
||||
or not to stop the scheduler tick belongs to it. Still, if the tick has been
|
||||
stopped already (in one of the previous iterations of the loop), it is better
|
||||
to leave it as is and the governor needs to take that into account.
|
||||
|
||||
The kernel can be configured to disable stopping the scheduler tick in the idle
|
||||
loop altogether. That can be done through the build-time configuration of it
|
||||
(by unsetting the ``CONFIG_NO_HZ_IDLE`` configuration option) or by passing
|
||||
``nohz=off`` to it in the command line. In both cases, as the stopping of the
|
||||
scheduler tick is disabled, the governor's decisions regarding it are simply
|
||||
ignored by the idle loop code and the tick is never stopped.
|
||||
|
||||
The systems that run kernels configured to allow the scheduler tick to be
|
||||
stopped on idle CPUs are referred to as *tickless* systems and they are
|
||||
generally regarded as more energy-efficient than the systems running kernels in
|
||||
which the tick cannot be stopped. If the given system is tickless, it will use
|
||||
the ``menu`` governor by default and if it is not tickless, the default
|
||||
``CPUIdle`` governor on it will be ``ladder``.
|
||||
|
||||
|
||||
.. _menu-gov:
|
||||
|
||||
The ``menu`` Governor
|
||||
=====================
|
||||
|
||||
The ``menu`` governor is the default ``CPUIdle`` governor for tickless systems.
|
||||
It is quite complex, but the basic principle of its design is straightforward.
|
||||
Namely, when invoked to select an idle state for a CPU (i.e. an idle state that
|
||||
the CPU will ask the processor hardware to enter), it attempts to predict the
|
||||
idle duration and uses the predicted value for idle state selection.
|
||||
|
||||
It first obtains the time until the closest timer event with the assumption
|
||||
that the scheduler tick will be stopped. That time, referred to as the *sleep
|
||||
length* in what follows, is the upper bound on the time before the next CPU
|
||||
wakeup. It is used to determine the sleep length range, which in turn is needed
|
||||
to get the sleep length correction factor.
|
||||
|
||||
The ``menu`` governor maintains two arrays of sleep length correction factors.
|
||||
One of them is used when tasks previously running on the given CPU are waiting
|
||||
for some I/O operations to complete and the other one is used when that is not
|
||||
the case. Each array contains several correction factor values that correspond
|
||||
to different sleep length ranges organized so that each range represented in the
|
||||
array is approximately 10 times wider than the previous one.
|
||||
|
||||
The correction factor for the given sleep length range (determined before
|
||||
selecting the idle state for the CPU) is updated after the CPU has been woken
|
||||
up and the closer the sleep length is to the observed idle duration, the closer
|
||||
to 1 the correction factor becomes (it must fall between 0 and 1 inclusive).
|
||||
The sleep length is multiplied by the correction factor for the range that it
|
||||
falls into to obtain the first approximation of the predicted idle duration.
|
||||
|
||||
Next, the governor uses a simple pattern recognition algorithm to refine its
|
||||
idle duration prediction. Namely, it saves the last 8 observed idle duration
|
||||
values and, when predicting the idle duration next time, it computes the average
|
||||
and variance of them. If the variance is small (smaller than 400 square
|
||||
milliseconds) or it is small relative to the average (the average is greater
|
||||
than 6 times the standard deviation), the average is regarded as the "typical
|
||||
interval" value. Otherwise, the longest of the saved observed idle duration
|
||||
values is discarded and the computation is repeated for the remaining ones.
|
||||
Again, if the variance of them is small (in the above sense), the average is
|
||||
taken as the "typical interval" value and so on, until either the "typical
|
||||
interval" is determined or too many data points are disregarded, in which case
|
||||
the "typical interval" is assumed to equal "infinity" (the maximum unsigned
|
||||
integer value). The "typical interval" computed this way is compared with the
|
||||
sleep length multiplied by the correction factor and the minimum of the two is
|
||||
taken as the predicted idle duration.
|
||||
|
||||
Then, the governor computes an extra latency limit to help "interactive"
|
||||
workloads. It uses the observation that if the exit latency of the selected
|
||||
idle state is comparable with the predicted idle duration, the total time spent
|
||||
in that state probably will be very short and the amount of energy to save by
|
||||
entering it will be relatively small, so likely it is better to avoid the
|
||||
overhead related to entering that state and exiting it. Thus selecting a
|
||||
shallower state is likely to be a better option then. The first approximation
|
||||
of the extra latency limit is the predicted idle duration itself which
|
||||
additionally is divided by a value depending on the number of tasks that
|
||||
previously ran on the given CPU and now they are waiting for I/O operations to
|
||||
complete. The result of that division is compared with the latency limit coming
|
||||
from the power management quality of service, or `PM QoS <cpu-pm-qos_>`_,
|
||||
framework and the minimum of the two is taken as the limit for the idle states'
|
||||
exit latency.
|
||||
|
||||
Now, the governor is ready to walk the list of idle states and choose one of
|
||||
them. For this purpose, it compares the target residency of each state with
|
||||
the predicted idle duration and the exit latency of it with the computed latency
|
||||
limit. It selects the state with the target residency closest to the predicted
|
||||
idle duration, but still below it, and exit latency that does not exceed the
|
||||
limit.
|
||||
|
||||
In the final step the governor may still need to refine the idle state selection
|
||||
if it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
|
||||
happens if the idle duration predicted by it is less than the tick period and
|
||||
the tick has not been stopped already (in a previous iteration of the idle
|
||||
loop). Then, the sleep length used in the previous computations may not reflect
|
||||
the real time until the closest timer event and if it really is greater than
|
||||
that time, the governor may need to select a shallower state with a suitable
|
||||
target residency.
|
||||
|
||||
|
||||
.. _teo-gov:
|
||||
|
||||
The Timer Events Oriented (TEO) Governor
|
||||
========================================
|
||||
|
||||
The timer events oriented (TEO) governor is an alternative ``CPUIdle`` governor
|
||||
for tickless systems. It follows the same basic strategy as the ``menu`` `one
|
||||
<menu-gov_>`_: it always tries to find the deepest idle state suitable for the
|
||||
given conditions. However, it applies a different approach to that problem.
|
||||
|
||||
First, it does not use sleep length correction factors, but instead it attempts
|
||||
to correlate the observed idle duration values with the available idle states
|
||||
and use that information to pick up the idle state that is most likely to
|
||||
"match" the upcoming CPU idle interval. Second, it does not take the tasks
|
||||
that were running on the given CPU in the past and are waiting on some I/O
|
||||
operations to complete now into account at all (there is no guarantee that they will run on
|
||||
the same CPU when they become runnable again) and the pattern detection code in
|
||||
it avoids taking timer wakeups into account. It also only uses idle duration
|
||||
values less than the current time till the closest timer (with the scheduler
|
||||
tick excluded) for that purpose.
|
||||
|
||||
Like in the ``menu`` governor `case <menu-gov_>`_, the first step is to obtain
|
||||
the *sleep length*, which is the time until the closest timer event with the
|
||||
assumption that the scheduler tick will be stopped (that also is the upper bound
|
||||
on the time until the next CPU wakeup). That value is then used to preselect an
|
||||
idle state on the basis of three metrics maintained for each idle state provided
|
||||
by the ``CPUIdle`` driver: ``hits``, ``misses`` and ``early_hits``.
|
||||
|
||||
The ``hits`` and ``misses`` metrics measure the likelihood that a given idle
|
||||
state will "match" the observed (post-wakeup) idle duration if it "matches" the
|
||||
sleep length. They both are subject to decay (after a CPU wakeup) every time
|
||||
the target residency of the idle state corresponding to them is less than or
|
||||
equal to the sleep length and the target residency of the next idle state is
|
||||
greater than the sleep length (that is, when the idle state corresponding to
|
||||
them "matches" the sleep length). The ``hits`` metric is increased if the
|
||||
former condition is satisfied and the target residency of the given idle state
|
||||
is less than or equal to the observed idle duration and the target residency of
|
||||
the next idle state is greater than the observed idle duration at the same time
|
||||
(that is, it is increased when the given idle state "matches" both the sleep
|
||||
length and the observed idle duration). In turn, the ``misses`` metric is
|
||||
increased when the given idle state "matches" the sleep length only and the
|
||||
observed idle duration is too short for its target residency.
|
||||
|
||||
The ``early_hits`` metric measures the likelihood that a given idle state will
|
||||
"match" the observed (post-wakeup) idle duration if it does not "match" the
|
||||
sleep length. It is subject to decay on every CPU wakeup and it is increased
|
||||
when the idle state corresponding to it "matches" the observed (post-wakeup)
|
||||
idle duration and the target residency of the next idle state is less than or
|
||||
equal to the sleep length (i.e. the idle state "matching" the sleep length is
|
||||
deeper than the given one).
|
||||
|
||||
The governor walks the list of idle states provided by the ``CPUIdle`` driver
|
||||
and finds the last (deepest) one with the target residency less than or equal
|
||||
to the sleep length. Then, the ``hits`` and ``misses`` metrics of that idle
|
||||
state are compared with each other and it is preselected if the ``hits`` one is
|
||||
greater (which means that that idle state is likely to "match" the observed idle
|
||||
duration after CPU wakeup). If the ``misses`` one is greater, the governor
|
||||
preselects the shallower idle state with the maximum ``early_hits`` metric
|
||||
(or if there are multiple shallower idle states with equal ``early_hits``
|
||||
metric which also is the maximum, the shallowest of them will be preselected).
|
||||
[If there is a wakeup latency constraint coming from the `PM QoS framework
|
||||
<cpu-pm-qos_>`_ which is hit before reaching the deepest idle state with the
|
||||
target residency within the sleep length, the deepest idle state with the exit
|
||||
latency within the constraint is preselected without consulting the ``hits``,
|
||||
``misses`` and ``early_hits`` metrics.]
|
||||
|
||||
Next, the governor takes several idle duration values observed most recently
|
||||
into consideration and if at least a half of them are greater than or equal to
|
||||
the target residency of the preselected idle state, that idle state becomes the
|
||||
final candidate to ask for. Otherwise, the average of the most recent idle
|
||||
duration values below the target residency of the preselected idle state is
|
||||
computed and the governor walks the idle states shallower than the preselected
|
||||
one and finds the deepest of them with the target residency within that average.
|
||||
That idle state is then taken as the final candidate to ask for.
|
||||
|
||||
Still, at this point the governor may need to refine the idle state selection if
|
||||
it has not decided to `stop the scheduler tick <idle-cpus-and-tick_>`_. That
|
||||
generally happens if the target residency of the idle state selected so far is
|
||||
less than the tick period and the tick has not been stopped already (in a
|
||||
previous iteration of the idle loop). Then, like in the ``menu`` governor
|
||||
`case <menu-gov_>`_, the sleep length used in the previous computations may not
|
||||
reflect the real time until the closest timer event and if it really is greater
|
||||
than that time, a shallower state with a suitable target residency may need to
|
||||
be selected.
|
||||
|
||||
|
||||
.. _idle-states-representation:
|
||||
|
||||
Representation of Idle States
|
||||
=============================
|
||||
|
||||
For the CPU idle time management purposes all of the physical idle states
|
||||
supported by the processor have to be represented as a one-dimensional array of
|
||||
|struct cpuidle_state| objects each allowing an individual (logical) CPU to ask
|
||||
the processor hardware to enter an idle state of certain properties. If there
|
||||
is a hierarchy of units in the processor, one |struct cpuidle_state| object can
|
||||
cover a combination of idle states supported by the units at different levels of
|
||||
the hierarchy. In that case, the `target residency and exit latency parameters
|
||||
of it <idle-loop_>`_ must reflect the properties of the idle state at the
|
||||
deepest level (i.e. the idle state of the unit containing all of the other
|
||||
units).
|
||||
|
||||
For example, take a processor with two cores in a larger unit referred to as
|
||||
a "module" and suppose that asking the hardware to enter a specific idle state
|
||||
(say "X") at the "core" level by one core will trigger the module to try to
|
||||
enter a specific idle state of its own (say "MX") if the other core is in idle
|
||||
state "X" already. In other words, asking for idle state "X" at the "core"
|
||||
level gives the hardware a license to go as deep as to idle state "MX" at the
|
||||
"module" level, but there is no guarantee that this is going to happen (the core
|
||||
asking for idle state "X" may just end up in that state by itself instead).
|
||||
Then, the target residency of the |struct cpuidle_state| object representing
|
||||
idle state "X" must reflect the minimum time to spend in idle state "MX" of
|
||||
the module (including the time needed to enter it), because that is the minimum
|
||||
time the CPU needs to be idle to save any energy in case the hardware enters
|
||||
that state. Analogously, the exit latency parameter of that object must cover
|
||||
the exit time of idle state "MX" of the module (and usually its entry time too),
|
||||
because that is the maximum delay between a wakeup signal and the time the CPU
|
||||
will start to execute the first new instruction (assuming that both cores in the
|
||||
module will always be ready to execute instructions as soon as the module
|
||||
becomes operational as a whole).
|
||||
|
||||
There are processors without direct coordination between different levels of the
|
||||
hierarchy of units inside them, however. In those cases asking for an idle
|
||||
state at the "core" level does not automatically affect the "module" level, for
|
||||
example, in any way and the ``CPUIdle`` driver is responsible for the entire
|
||||
handling of the hierarchy. Then, the definition of the idle state objects is
|
||||
entirely up to the driver, but still the physical properties of the idle state
|
||||
that the processor hardware finally goes into must always follow the parameters
|
||||
used by the governor for idle state selection (for instance, the actual exit
|
||||
latency of that idle state must not exceed the exit latency parameter of the
|
||||
idle state object selected by the governor).
|
||||
|
||||
In addition to the target residency and exit latency idle state parameters
|
||||
discussed above, the objects representing idle states each contain a few other
|
||||
parameters describing the idle state and a pointer to the function to run in
|
||||
order to ask the hardware to enter that state. Also, for each
|
||||
|struct cpuidle_state| object, there is a corresponding
|
||||
:c:type:`struct cpuidle_state_usage <cpuidle_state_usage>` one containing usage
|
||||
statistics of the given idle state. That information is exposed by the kernel
|
||||
via ``sysfs``.
|
||||
|
||||
For each CPU in the system, there is a :file:`/sys/devices/system/cpu/cpu<N>/cpuidle/`
|
||||
directory in ``sysfs``, where the number ``<N>`` is assigned to the given
|
||||
CPU at the initialization time. That directory contains a set of subdirectories
|
||||
called :file:`state0`, :file:`state1` and so on, up to the number of idle state
|
||||
objects defined for the given CPU minus one. Each of these directories
|
||||
corresponds to one idle state object and the larger the number in its name, the
|
||||
deeper the (effective) idle state represented by it. Each of them contains
|
||||
a number of files (attributes) representing the properties of the idle state
|
||||
object corresponding to it, as follows:
|
||||
|
||||
``above``
|
||||
Total number of times this idle state had been asked for, but the
|
||||
observed idle duration was certainly too short to match its target
|
||||
residency.
|
||||
|
||||
``below``
|
||||
Total number of times this idle state had been asked for, but certainly
|
||||
a deeper idle state would have been a better match for the observed idle
|
||||
duration.
|
||||
|
||||
``desc``
|
||||
Description of the idle state.
|
||||
|
||||
``disable``
|
||||
Whether or not this idle state is disabled.
|
||||
|
||||
``latency``
|
||||
Exit latency of the idle state in microseconds.
|
||||
|
||||
``name``
|
||||
Name of the idle state.
|
||||
|
||||
``power``
|
||||
Power drawn by hardware in this idle state in milliwatts (if specified,
|
||||
0 otherwise).
|
||||
|
||||
``residency``
|
||||
Target residency of the idle state in microseconds.
|
||||
|
||||
``time``
|
||||
Total time spent in this idle state by the given CPU (as measured by the
|
||||
kernel) in microseconds.
|
||||
|
||||
``usage``
|
||||
Total number of times the hardware has been asked by the given CPU to
|
||||
enter this idle state.
|
||||
|
||||
The :file:`desc` and :file:`name` files both contain strings. The difference
|
||||
between them is that the name is expected to be more concise, while the
|
||||
description may be longer and it may contain white space or special characters.
|
||||
The other files listed above contain integer numbers.
|
||||
|
||||
The :file:`disable` attribute is the only writeable one. If it contains 1, the
|
||||
given idle state is disabled for this particular CPU, which means that the
|
||||
governor will never select it for this particular CPU and the ``CPUIdle``
|
||||
driver will never ask the hardware to enter it for that CPU as a result.
|
||||
However, disabling an idle state for one CPU does not prevent it from being
|
||||
asked for by the other CPUs, so it must be disabled for all of them in order to
|
||||
never be asked for by any of them. [Note that, due to the way the ``ladder``
|
||||
governor is implemented, disabling an idle state prevents that governor from
|
||||
selecting any idle states deeper than the disabled one too.]
|
||||
|
||||
If the :file:`disable` attribute contains 0, the given idle state is enabled for
|
||||
this particular CPU, but it still may be disabled for some or all of the other
|
||||
CPUs in the system at the same time. Writing 1 to it causes the idle state to
|
||||
be disabled for this particular CPU and writing 0 to it allows the governor to
|
||||
take it into consideration for the given CPU and the driver to ask for it,
|
||||
unless that state was disabled globally in the driver (in which case it cannot
|
||||
be used at all).
|
||||
|
||||
The :file:`power` attribute is not defined very well, especially for idle state
|
||||
objects representing combinations of idle states at different levels of the
|
||||
hierarchy of units in the processor, and it generally is hard to obtain idle
|
||||
state power numbers for complex hardware, so :file:`power` often contains 0 (not
|
||||
available) and if it contains a nonzero number, that number may not be very
|
||||
accurate and it should not be relied on for anything meaningful.
|
||||
|
||||
The number in the :file:`time` file generally may be greater than the total time
|
||||
really spent by the given CPU in the given idle state, because it is measured by
|
||||
the kernel and it may not cover the cases in which the hardware refused to enter
|
||||
this idle state and entered a shallower one instead of it (or even it did not
|
||||
enter any idle state at all). The kernel can only measure the time span between
|
||||
asking the hardware to enter an idle state and the subsequent wakeup of the CPU
|
||||
and it cannot say what really happened in the meantime at the hardware level.
|
||||
Moreover, if the idle state object in question represents a combination of idle
|
||||
states at different levels of the hierarchy of units in the processor,
|
||||
the kernel can never say how deep the hardware went down the hierarchy in any
|
||||
particular case. For these reasons, the only reliable way to find out how
|
||||
much time has been spent by the hardware in different idle states supported by
|
||||
it is to use idle state residency counters in the hardware, if available.
|
||||
|
||||
|
||||
.. _cpu-pm-qos:
|
||||
|
||||
Power Management Quality of Service for CPUs
|
||||
============================================
|
||||
|
||||
The power management quality of service (PM QoS) framework in the Linux kernel
|
||||
allows kernel code and user space processes to set constraints on various
|
||||
energy-efficiency features of the kernel to prevent performance from dropping
|
||||
below a required level. The PM QoS constraints can be set globally, in
|
||||
predefined categories referred to as PM QoS classes, or against individual
|
||||
devices.
|
||||
|
||||
CPU idle time management can be affected by PM QoS in two ways, through the
|
||||
global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the
|
||||
resume latency constraints for individual CPUs. Kernel code (e.g. device
|
||||
drivers) can set both of them with the help of special internal interfaces
|
||||
provided by the PM QoS framework. User space can modify the former by opening
|
||||
the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing
|
||||
a binary value (interpreted as a signed 32-bit integer) to it. In turn, the
|
||||
resume latency constraint for a CPU can be modified by user space by writing a
|
||||
string (representing a signed 32-bit integer) to the
|
||||
:file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs``, where the CPU number
|
||||
``<N>`` is allocated at the system initialization time. Negative values
|
||||
will be rejected in both cases and, also in both cases, the written integer
|
||||
number will be interpreted as a requested PM QoS constraint in microseconds.
|
||||
|
||||
The requested value is not automatically applied as a new constraint, however,
|
||||
as it may be less restrictive (greater in this particular case) than another
|
||||
constraint previously requested by someone else. For this reason, the PM QoS
|
||||
framework maintains a list of requests that have been made so far in each
|
||||
global class and for each device, aggregates them and applies the effective
|
||||
(minimum in this particular case) value as the new constraint.
|
||||
|
||||
In fact, opening the :file:`cpu_dma_latency` special device file causes a new
|
||||
PM QoS request to be created and added to the priority list of requests in the
|
||||
``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the
|
||||
"open" operation represents that request. If that file descriptor is then
|
||||
used for writing, the number written to it will be associated with the PM QoS
|
||||
request represented by it as a new requested constraint value. Next, the
|
||||
priority list mechanism will be used to determine the new effective value of
|
||||
the entire list of requests and that effective value will be set as a new
|
||||
constraint. Thus setting a new requested constraint value will only change the
|
||||
real constraint if the effective "list" value is affected by it. In particular,
|
||||
for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if
|
||||
it is the minimum of the requested constraints in the list. The process holding
|
||||
a file descriptor obtained by opening the :file:`cpu_dma_latency` special device
|
||||
file controls the PM QoS request associated with that file descriptor, but it
|
||||
controls this particular PM QoS request only.
|
||||
|
||||
Closing the :file:`cpu_dma_latency` special device file or, more precisely, the
|
||||
file descriptor obtained while opening it, causes the PM QoS request associated
|
||||
with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY``
|
||||
class priority list and destroyed. If that happens, the priority list mechanism
|
||||
will be used, again, to determine the new effective value for the whole list
|
||||
and that value will become the new real constraint.
|
||||
|
||||
In turn, for each CPU there is only one resume latency PM QoS request
|
||||
associated with the :file:`power/pm_qos_resume_latency_us` file under
|
||||
:file:`/sys/devices/system/cpu/cpu<N>/` in ``sysfs`` and writing to it causes
|
||||
this single PM QoS request to be updated regardless of which user space
|
||||
process does that. In other words, this PM QoS request is shared by the entire
|
||||
user space, so access to the file associated with it needs to be arbitrated
|
||||
to avoid confusion. [Arguably, the only legitimate use of this mechanism in
|
||||
practice is to pin a process to the CPU in question and let it use the
|
||||
``sysfs`` interface to control the resume latency constraint for it.] It
|
||||
still only is a request, however. It is a member of a priority list used to
|
||||
determine the effective value to be set as the resume latency constraint for the
|
||||
CPU in question every time the list of requests is updated this way or another
|
||||
(there may be other requests coming from kernel code in that list).
|
||||
|
||||
CPU idle time governors are expected to regard the minimum of the global
|
||||
effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective
|
||||
resume latency constraint for the given CPU as the upper limit for the exit
|
||||
latency of the idle states they can select for that CPU. They should never
|
||||
select any idle states with exit latency beyond that limit.
|
||||
|
||||
|
||||
Idle States Control Via Kernel Command Line
|
||||
===========================================
|
||||
|
||||
In addition to the ``sysfs`` interface allowing individual idle states to be
|
||||
`disabled for individual CPUs <idle-states-representation_>`_, there are kernel
|
||||
command line parameters affecting CPU idle time management.
|
||||
|
||||
The ``cpuidle.off=1`` kernel command line option can be used to disable the
|
||||
CPU idle time management entirely. It does not prevent the idle loop from
|
||||
running on idle CPUs, but it prevents the CPU idle time governors and drivers
|
||||
from being invoked. If it is added to the kernel command line, the idle loop
|
||||
will ask the hardware to enter idle states on idle CPUs via the CPU architecture
|
||||
support code that is expected to provide a default mechanism for this purpose.
|
||||
That default mechanism usually is the least common denominator for all of the
|
||||
processors implementing the architecture (i.e. CPU instruction set) in question,
|
||||
however, so it is rather crude and not very energy-efficient. For this reason,
|
||||
it is not recommended for production use.
|
||||
|
||||
The ``cpuidle.governor=`` kernel command line switch allows the ``CPUIdle``
|
||||
governor to use to be specified. It has to be appended with a string matching
|
||||
the name of an available governor (e.g. ``cpuidle.governor=menu``) and that
|
||||
governor will be used instead of the default one. It is possible to force
|
||||
the ``menu`` governor to be used on the systems that use the ``ladder`` governor
|
||||
by default this way, for example.
|
||||
|
||||
The other kernel command line parameters controlling CPU idle time management
|
||||
described below are only relevant for the *x86* architecture and some of
|
||||
them affect Intel processors only.
|
||||
|
||||
The *x86* architecture support code recognizes three kernel command line
|
||||
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
|
||||
and ``idle=nomwait``. The first two of them disable the ``acpi_idle`` and
|
||||
``intel_idle`` drivers altogether, which effectively causes the entire
|
||||
``CPUIdle`` subsystem to be disabled and makes the idle loop invoke the
|
||||
architecture support code to deal with idle CPUs. How it does that depends on
|
||||
which of the two parameters is added to the kernel command line. In the
|
||||
``idle=halt`` case, the architecture support code will use the ``HLT``
|
||||
instruction of the CPUs (which, as a rule, suspends the execution of the program
|
||||
and causes the hardware to attempt to enter the shallowest available idle state)
|
||||
for this purpose, and if ``idle=poll`` is used, idle CPUs will execute a
|
||||
more or less "lightweight" sequence of instructions in a tight loop. [Note
|
||||
that using ``idle=poll`` is somewhat drastic in many cases, as preventing idle
|
||||
CPUs from saving almost any energy at all may not be the only effect of it.
|
||||
For example, on Intel hardware it effectively prevents CPUs from using
|
||||
P-states (see |cpufreq|) that require any number of CPUs in a package to be
|
||||
idle, so it very well may hurt single-thread computations performance as well as
|
||||
energy-efficiency. Thus using it for performance reasons may not be a good idea
|
||||
at all.]
|
||||
|
||||
The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
|
||||
``acpi_idle`` to be used (as long as all of the information needed by it is
|
||||
there in the system's ACPI tables), but it is not allowed to use the
|
||||
``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
|
||||
|
||||
In addition to the architecture-level kernel command line options affecting CPU
|
||||
idle time management, there are parameters affecting individual ``CPUIdle``
|
||||
drivers that can be passed to them via the kernel command line. Specifically,
|
||||
the ``intel_idle.max_cstate=<n>`` and ``processor.max_cstate=<n>`` parameters,
|
||||
where ``<n>`` is an idle state index also used in the name of the given
|
||||
state's directory in ``sysfs`` (see
|
||||
`Representation of Idle States <idle-states-representation_>`_), cause the
|
||||
``intel_idle`` and ``acpi_idle`` drivers, respectively, to discard all of the
|
||||
idle states deeper than idle state ``<n>``. In that case, they will never ask
|
||||
for any of those idle states or expose them to the governor. [The behavior of
|
||||
the two drivers is different for ``<n>`` equal to ``0``. Adding
|
||||
``intel_idle.max_cstate=0`` to the kernel command line disables the
|
||||
``intel_idle`` driver and allows ``acpi_idle`` to be used, whereas
|
||||
``processor.max_cstate=0`` is equivalent to ``processor.max_cstate=1``.
|
||||
Also, the ``acpi_idle`` driver is part of the ``processor`` kernel module that
|
||||
can be loaded separately and ``max_cstate=<n>`` can be passed to it as a module
|
||||
parameter when it is loaded.]
|
||||