Merge tag 'v4.4.46' into linux-linaro-lsk-v4.4

author Alex Shi <alex.shi@linaro.org>

Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)

committer Alex Shi <alex.shi@linaro.org>

Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)
author Alex Shi <alex.shi@linaro.org>
Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)
committer Alex Shi <alex.shi@linaro.org>
Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10 b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10

index 4b8d6ec92e2b91560123144c45e0afb19464287c..b5f526081711878eb46f0586415f043b929aaba0 100644 (file)
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etb10
@@ -6,13 +6,6 @@ Description:   (RW) Add/remove a sink from a trace path.  There can be multiple
                 source for a single sink.
                 ex: echo 1 > /sys/bus/coresight/devices/20010000.etb/enable_sink
  
-What:          /sys/bus/coresight/devices/<memory_map>.etb/status
-Date:          November 2014
-KernelVersion: 3.19
-Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
-Description:   (R) List various control and status registers.  The specific
-               layout and content is driver specific.
-
  What:          /sys/bus/coresight/devices/<memory_map>.etb/trigger_cntr
  Date:          November 2014
  KernelVersion: 3.19
@@ -22,3 +15,65 @@ Description: (RW) Disables write access to the Trace RAM by stopping the
                 following the trigger event. The number of 32-bit words written
                 into the Trace RAM following the trigger event is equal to the
                 value stored in this register+1 (from ARM ETB-TRM).
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rdp
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Defines the depth, in words, of the trace RAM in powers of
+               2.  The value is read directly from HW register RDP, 0x004.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/sts
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB status register.  The value
+               is read directly from HW register STS, 0x00C.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rrp
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB RAM Read Pointer register
+               that is used to read entries from the Trace RAM over the APB
+               interface.  The value is read directly from HW register RRP,
+               0x014.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/rwp
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB RAM Write Pointer register
+               that is used to set the write pointer to write entries from
+               the CoreSight bus into the Trace RAM. The value is read directly
+               from HW register RWP, 0x018.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/trg
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Similar to "trigger_cntr" above except that this value is
+               read directly from HW register TRG, 0x01C.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ctl
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB Control register. The value
+               is read directly from HW register CTL, 0x020.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ffsr
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB Formatter and Flush Status
+               register.  The value is read directly from HW register FFSR,
+               0x300.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etb/mgmt/ffcr
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the ETB Formatter and Flush Control
+               register.  The value is read directly from HW register FFCR,
+               0x304.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x

index 2355ed8ae31f732d1567f2b9085d86574f817523..36258bc1b473a9c826d272bd3328f7c79562e49a 100644 (file)
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
@@ -359,6 +359,19 @@ Contact:   Mathieu Poirier <mathieu.poirier@linaro.org>
  Description:   (R) Print the content of the Peripheral ID3 Register
                 (0xFEC).  The value is taken directly from the HW.
  
+What:          /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trcconfig
+Date:          February 2016
+KernelVersion: 4.07
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Print the content of the trace configuration register
+               (0x010) as currently set by SW.
+
+What:          /sys/bus/coresight/devices/<memory_map>.etm/mgmt/trctraceid
+Date:          February 2016
+KernelVersion: 4.07
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Print the content of the trace ID register (0x040).
+
  What:          /sys/bus/coresight/devices/<memory_map>.etm/trcidr/trcidr0
  Date:          April 2015
  KernelVersion: 4.01
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm

new file mode 100644 (file)

index 0000000..1dffabe
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
@@ -0,0 +1,53 @@
+What:          /sys/bus/coresight/devices/<memory_map>.stm/enable_source
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Enable/disable tracing on this specific trace macrocell.
+               Enabling the trace macrocell implies it has been configured
+               properly and a sink has been identified for it.  The path
+               of coresight components linking the source to the sink is
+               configured and managed automatically by the coresight framework.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/hwevent_enable
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Provides access to the HW event enable register, used in
+               conjunction with HW event bank select register.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/hwevent_select
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Gives access to the HW event block select register
+               (STMHEBSR) in order to configure up to 256 channels.  Used in
+               conjunction with "hwevent_enable" register as described above.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/port_enable
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Provides access to the stimulus port enable register
+               (STMSPER).  Used in conjunction with "port_select" described
+               below.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/port_select
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Used to determine which bank of stimulus ports the bits
+               in register STMSPER (see above) apply to.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/status
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) List various control and status registers.  The specific
+               layout and content is driver specific.
+
+What:          /sys/bus/coresight/devices/<memory_map>.stm/traceid
+Date:          April 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (RW) Holds the trace ID that will appear in the trace stream
+               coming from this trace entity.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc

index f38cded5fa22cfefa4bd91521d36ebef89b581e7..4fe677ed1305c8ecaf4a4f0a68af1745a8d0ce02 100644 (file)
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tmc
@@ -6,3 +6,80 @@ Description:   (RW) Disables write access to the Trace RAM by stopping the
                 formatter after a defined number of words have been stored
                 following the trigger event. Additional interface for this
                 driver are expected to be added as it matures.
+
+What:           /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rsz
+Date:           March 2016
+KernelVersion:  4.7
+Contact:        Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:    (R) Defines the size, in 32-bit words, of the local RAM buffer.
+                The value is read directly from HW register RSZ, 0x004.
+
+What:           /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/sts
+Date:           March 2016
+KernelVersion:  4.7
+Contact:        Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC status register.  The value
+                is read directly from HW register STS, 0x00C.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rrp
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC RAM Read Pointer register
+               that is used to read entries from the Trace RAM over the APB
+               interface.  The value is read directly from HW register RRP,
+               0x014.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/rwp
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC RAM Write Pointer register
+               that is used to set the write pointer to write entries from
+               the CoreSight bus into the Trace RAM. The value is read directly
+               from HW register RWP, 0x018.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/trg
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Similar to "trigger_cntr" above except that this value is
+               read directly from HW register TRG, 0x01C.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ctl
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC Control register. The value
+               is read directly from HW register CTL, 0x020.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ffsr
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC Formatter and Flush Status
+               register.  The value is read directly from HW register FFSR,
+               0x300.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/ffcr
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC Formatter and Flush Control
+               register.  The value is read directly from HW register FFCR,
+               0x304.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/mode
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Shows the value held by the TMC Mode register, which
+               indicates the mode the device has been configured to enact.
+               The value is read directly from the MODE register, 0x028.
+
+What:          /sys/bus/coresight/devices/<memory_map>.tmc/mgmt/devid
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Mathieu Poirier <mathieu.poirier@linaro.org>
+Description:   (R) Indicates the capabilities of the Coresight TMC.
+               The value is read directly from the DEVID register, 0xFC8.
diff --git a/Documentation/ABI/testing/sysfs-class-stm b/Documentation/ABI/testing/sysfs-class-stm

index c9aa4f3fc9a71f7429210346a61a727753ed724e..77ed3da0f68e437f9c82ccfebcdc79d761831833 100644 (file)
--- a/Documentation/ABI/testing/sysfs-class-stm
+++ b/Documentation/ABI/testing/sysfs-class-stm
@@ -12,3 +12,13 @@ KernelVersion:       4.3
  Contact:       Alexander Shishkin <alexander.shishkin@linux.intel.com>
  Description:
                 Shows the number of channels per master on this STM device.
+
+What:          /sys/class/stm/<stm>/hw_override
+Date:          March 2016
+KernelVersion: 4.7
+Contact:       Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Description:
+               Reads as 0 if master numbers in the STP stream produced by
+               this stm device will match the master numbers assigned by
+               the software or 1 if the stm hardware overrides software
+               assigned masters.
diff --git a/Documentation/arm64/booting.txt b/Documentation/arm64/booting.txt

index 701d39d3171a74d8c2eb670c0b1be2931f326ee8..56d6d8b796db6dd3aadd252a85cc4b691f9e20f7 100644 (file)
--- a/Documentation/arm64/booting.txt
+++ b/Documentation/arm64/booting.txt
@@ -109,7 +109,13 @@ Header notes:
                         1 - 4K
                         2 - 16K
                         3 - 64K
-  Bits 3-63:   Reserved.
+  Bit 3:       Kernel physical placement
+                       0 - 2MB aligned base should be as close as possible
+                           to the base of DRAM, since memory below it is not
+                           accessible via the linear mapping
+                       1 - 2MB aligned base may be anywhere in physical
+                           memory
+  Bits 4-63:   Reserved.
  
  - When image_size is zero, a bootloader should attempt to keep as much
    memory as possible free for use by the kernel immediately after the
@@ -117,14 +123,14 @@ Header notes:
    depending on selected features, and is effectively unbound.
  
  The Image must be placed text_offset bytes from a 2MB aligned base
-address near the start of usable system RAM and called there. Memory
-below that base address is currently unusable by Linux, and therefore it
-is strongly recommended that this location is the start of system RAM.
-The region between the 2 MB aligned base address and the start of the
-image has no special significance to the kernel, and may be used for
-other purposes.
+address anywhere in usable system RAM and called there. The region
+between the 2 MB aligned base address and the start of the image has no
+special significance to the kernel, and may be used for other purposes.
  At least image_size bytes from the start of the image must be free for
  use by the kernel.
+NOTE: versions prior to v4.6 cannot make use of memory below the
+physical offset of the Image so it is recommended that the Image be
+placed as close as possible to the start of system RAM.
  
  Any memory described to the kernel (even that below the start of the
  image) which is not marked as reserved from the kernel (e.g., with a
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt

new file mode 100644 (file)

index 0000000..58b71dd
--- /dev/null
+++ b/Documentation/arm64/silicon-errata.txt
@@ -0,0 +1,58 @@
+                Silicon Errata and Software Workarounds
+                =======================================
+
+Author: Will Deacon <will.deacon@arm.com>
+Date  : 27 November 2015
+
+It is an unfortunate fact of life that hardware is often produced with
+so-called "errata", which can cause it to deviate from the architecture
+under specific circumstances.  For hardware produced by ARM, these
+errata are broadly classified into the following categories:
+
+  Category A: A critical error without a viable workaround.
+  Category B: A significant or critical error with an acceptable
+              workaround.
+  Category C: A minor error that is not expected to occur under normal
+              operation.
+
+For more information, consult one of the "Software Developers Errata
+Notice" documents available on infocenter.arm.com (registration
+required).
+
+As far as Linux is concerned, Category B errata may require some special
+treatment in the operating system. For example, avoiding a particular
+sequence of code, or configuring the processor in a particular way. A
+less common situation may require similar actions in order to declassify
+a Category A erratum into a Category C erratum. These are collectively
+known as "software workarounds" and are only required in the minority of
+cases (e.g. those cases that both require a non-secure workaround *and*
+can be triggered by Linux).
+
+For software workarounds that may adversely impact systems unaffected by
+the erratum in question, a Kconfig entry is added under "Kernel
+Features" -> "ARM errata workarounds via the alternatives framework".
+These are enabled by default and patched in at runtime when an affected
+CPU is detected. For less-intrusive workarounds, a Kconfig option is not
+available and the code is structured (preferably with a comment) in such
+a way that the erratum will not be hit.
+
+This approach can make it slightly onerous to determine exactly which
+errata are worked around in an arbitrary kernel source tree, so this
+file acts as a registry of software workarounds in the Linux Kernel and
+will be updated when new workarounds are committed and backported to
+stable kernels.
+
+| Implementor    | Component       | Erratum ID      | Kconfig                 |
++----------------+-----------------+-----------------+-------------------------+
+| ARM            | Cortex-A53      | #826319         | ARM64_ERRATUM_826319    |
+| ARM            | Cortex-A53      | #827319         | ARM64_ERRATUM_827319    |
+| ARM            | Cortex-A53      | #824069         | ARM64_ERRATUM_824069    |
+| ARM            | Cortex-A53      | #819472         | ARM64_ERRATUM_819472    |
+| ARM            | Cortex-A53      | #845719         | ARM64_ERRATUM_845719    |
+| ARM            | Cortex-A53      | #843419         | ARM64_ERRATUM_843419    |
+| ARM            | Cortex-A57      | #832075         | ARM64_ERRATUM_832075    |
+| ARM            | Cortex-A57      | #852523         | N/A                     |
+| ARM            | Cortex-A57      | #834220         | ARM64_ERRATUM_834220    |
+|                |                 |                 |                         |
+| Cavium         | ThunderX ITS    | #22375, #24313  | CAVIUM_ERRATUM_22375    |
+| Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154    |
diff --git a/Documentation/cgroup-legacy/00-INDEX b/Documentation/cgroup-legacy/00-INDEX

new file mode 100644 (file)

index 0000000..3f5a40f
--- /dev/null
+++ b/Documentation/cgroup-legacy/00-INDEX
@@ -0,0 +1,30 @@
+00-INDEX
+       - this file
+blkio-controller.txt
+       - Description for Block IO Controller, implementation and usage details.
+cgroups.txt
+       - Control Groups definition, implementation details, examples and API.
+cpuacct.txt
+       - CPU Accounting Controller; account CPU usage for groups of tasks.
+cpusets.txt
+       - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
+devices.txt
+       - Device Whitelist Controller; description, interface and security.
+freezer-subsystem.txt
+       - checkpointing; rationale to not use signals, interface.
+hugetlb.txt
+       - HugeTLB Controller implementation and usage details.
+memcg_test.txt
+       - Memory Resource Controller; implementation details.
+memory.txt
+       - Memory Resource Controller; design, accounting, interface, testing.
+net_cls.txt
+       - Network classifier cgroups details and usages.
+net_prio.txt
+       - Network priority cgroups details and usages.
+pids.txt
+       - Process number cgroups details and usages.
+resource_counter.txt
+       - Resource Counter API.
+unified-hierarchy.txt
+       - Description of the new/next cgroup interface.
diff --git a/Documentation/cgroup-legacy/blkio-controller.txt b/Documentation/cgroup-legacy/blkio-controller.txt

new file mode 100644 (file)

index 0000000..4ecc954
--- /dev/null
+++ b/Documentation/cgroup-legacy/blkio-controller.txt
@@ -0,0 +1,376 @@
+                               Block IO Controller
+                               ===================
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+Currently two IO control policies are implemented. First one is proportional
+weight time based division of disk policy. It is implemented in CFQ. Hence
+this policy takes effect only on leaf nodes when CFQ is being used. The second
+one is throttling policy which can be used to specify upper IO rate limits
+on devices. This policy is implemented in generic block layer and can be
+used on leaf nodes as well as higher level logical devices like device mapper.
+
+HOWTO
+=====
+Proportional Weight division of bandwidth
+-----------------------------------------
+You can do a very simple testing of running two dd threads in two different
+cgroups. Here is what you can do.
+
+- Enable Block IO controller
+       CONFIG_BLK_CGROUP=y
+
+- Enable group scheduling in CFQ
+       CONFIG_CFQ_GROUP_IOSCHED=y
+
+- Compile and boot into kernel and mount IO controller (blkio); see
+  cgroups.txt, Why are cgroups needed?.
+
+       mount -t tmpfs cgroup_root /sys/fs/cgroup
+       mkdir /sys/fs/cgroup/blkio
+       mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Create two cgroups
+       mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
+
+- Set weights of group test1 and test2
+       echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
+       echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
+
+- Create two same size files (say 512MB each) on same disk (file1, file2) and
+  launch two dd threads in different cgroup to read those files.
+
+       sync
+       echo 3 > /proc/sys/vm/drop_caches
+
+       dd if=/mnt/sdb/zerofile1 of=/dev/null &
+       echo $! > /sys/fs/cgroup/blkio/test1/tasks
+       cat /sys/fs/cgroup/blkio/test1/tasks
+
+       dd if=/mnt/sdb/zerofile2 of=/dev/null &
+       echo $! > /sys/fs/cgroup/blkio/test2/tasks
+       cat /sys/fs/cgroup/blkio/test2/tasks
+
+- At macro level, first dd should finish first. To get more precise data, keep
+  on looking (with the help of a script) at the blkio.disk_time and
+  blkio.disk_sectors files of both test1 and test2 groups. This will tell how
+  much disk time (in milliseconds) each group got and how many sectors each
+  group dispatched to the disk. We provide fairness in terms of disk time, so
+  ideally blkio.disk_time of cgroups should be in proportion to the weight.
+
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller
+       CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer
+       CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
+        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+  for policy is "<major>:<minor>  <bytes_per_second>".
+
+        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+  Above will put a limit of 1MB/second on reads happening for root group
+  on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not.
+
+               # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+               # iflag=direct
+        1024+0 records in
+        1024+0 records out
+        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+ Limits for writes can be put using blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Both CFQ and throttling implement hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+Suppose somebody created a hierarchy as follows.
+
+                       root
+                       /  \
+                    test1 test2
+                       |
+                    test3
+
+CFQ by default and throttling with "sane_behavior" will handle the
+hierarchy correctly.  For details on CFQ hierarchy support, refer to
+Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups as being at the same level, as if the
+hierarchy looked like the following.
+
+                               pivot
+                            /  /   \  \
+                       root  test1 test2  test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+       - Block IO controller.
+
+CONFIG_DEBUG_BLK_CGROUP
+       - Debug help. Right now some additional stats files show up in cgroup
+         if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+       - Enables group scheduling in CFQ. Currently only 1 level of group
+         creation is allowed.
+
+CONFIG_BLK_DEV_THROTTLING
+       - Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+       - Specifies per cgroup weight. This is default weight of the group
+         on all the devices until and unless overridden by per device rule.
+         (See blkio.weight_device).
+         Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+       - One can specify per cgroup per device rules using this interface.
+         These rules override the default value of group weight as specified
+         by blkio.weight.
+
+         Following is the format.
+
+         # echo dev_maj:dev_minor weight > blkio.weight_device
+         Configure weight=300 on /dev/sdb (8:16) in this cgroup
+         # echo 8:16 300 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:16    300
+
+         Configure weight=500 on /dev/sda (8:0) in this cgroup
+         # echo 8:0 500 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:0     500
+         8:16    300
+
+         Remove specific weight for /dev/sda in this cgroup
+         # echo 8:0 0 > blkio.weight_device
+         # cat blkio.weight_device
+         dev     weight
+         8:16    300
+
+- blkio.leaf_weight[_device]
+       - Equivalents of blkio.weight[_device] for the purpose of
+          deciding how much weight tasks in the given cgroup have while
+          competing with the cgroup's child cgroups. For details,
+          please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+       - disk time allocated to cgroup per device in milliseconds. First
+         two fields specify the major and minor number of the device and
+         third field specifies the disk time allocated to group in
+         milliseconds.
+
+- blkio.sectors
+       - number of sectors transferred to/from disk by the group. First
+         two fields specify the major and minor number of the device and
+         third field specifies the number of sectors transferred by the
+         group to/from the device.
+
+- blkio.io_service_bytes
+       - Number of bytes transferred to/from the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of bytes.
+
+- blkio.io_serviced
+       - Number of IOs (bio) issued to the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
+
+- blkio.io_service_time
+       - Total amount of time between request dispatch and request completion
+         for the IOs done by this cgroup. This is in nanoseconds to make it
+         meaningful for flash devices too. For devices with queue depth of 1,
+         this time represents the actual service time. When queue_depth > 1,
+         that is no longer true as requests may be served out of order. This
+         may cause the service time for a given IO to include the service time
+         of multiple IOs when served out of order which may result in total
+         io_service_time > actual time elapsed. This time is further divided by
+         the type of operation - read or write, sync or async. First two fields
+         specify the major and minor number of the device, third field
+         specifies the operation type and the fourth field specifies the
+         io_service_time in ns.
+
+- blkio.io_wait_time
+       - Total amount of time the IOs for this cgroup spent waiting in the
+         scheduler queues for service. This can be greater than the total time
+         elapsed since it is cumulative io_wait_time for all IOs. It is not a
+         measure of total time the cgroup spent waiting but rather a measure of
+         the wait_time for its individual IOs. For devices with queue_depth > 1
+         this metric does not include the time spent waiting for service once
+         the IO is dispatched to the device but till it actually gets serviced
+         (there might be a time lag here due to re-ordering of requests by the
+         device). This is in nanoseconds to make it meaningful for flash
+         devices too. This time is further divided by the type of operation -
+         read or write, sync or async. First two fields specify the major and
+         minor number of the device, third field specifies the operation type
+         and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+       - Total number of bios/requests merged into requests belonging to this
+         cgroup. This is further divided by the type of operation - read or
+         write, sync or async.
+
+- blkio.io_queued
+       - Total number of requests queued up at any given instant for this
+         cgroup. This is further divided by the type of operation - read or
+         write, sync or async.
+
+- blkio.avg_queue_size
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         The average queue size for this cgroup over the entire time of this
+         cgroup's existence. Queue size samples are taken each time one of the
+         queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time the cgroup had to wait since it became busy
+         (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+         its queues. This is different from the io_wait_time which is the
+         cumulative total of the amount of time spent by each IO in that cgroup
+         waiting in the scheduler queue. This is in nanoseconds. If this is
+         read when the cgroup is in a waiting (for timeslice) state, the stat
+         will only report the group_wait_time accumulated till the last time it
+         got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time a cgroup spends without any pending
+         requests when not being served, i.e., it does not include any time
+         spent idling for one of the queues of the cgroup. This is in
+         nanoseconds. If this is read when the cgroup is in an empty state,
+         the stat will only report the empty_time accumulated till the last
+         time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+         This is the amount of time spent by the IO scheduler idling for a
+         given cgroup in anticipation of a better request than the existing ones
+         from other queues/cgroups. This is in nanoseconds. If this is read
+         when the cgroup is in an idling state, the stat will only report the
+         idle_time accumulated till the last idle period and will not include
+         the current delta.
+
+- blkio.dequeue
+       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
+         gives the statistics about how many times a group was dequeued
+         from service tree of the device. First two fields specify the major
+         and minor number of the device and third field specifies the number
+         of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+       - Recursive version of various stats. These files show the
+          same information as their non-recursive counterparts but
+          include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+       - Specifies upper limit on READ rate from the device. IO rate is
+         specified in bytes per second. Rules are per device. Following is
+         the format.
+
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+       - Specifies upper limit on WRITE rate to the device. IO rate is
+         specified in bytes per second. Rules are per device. Following is
+         the format.
+
+  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+       - Specifies upper limit on READ rate from the device. IO rate is
+         specified in IO per second. Rules are per device. Following is
+         the format.
+
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+       - Specifies upper limit on WRITE rate to the device. IO rate is
+         specified in IO per second. Rules are per device. Following is
+         the format.
+
+  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+      subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+       - Number of IOs (bio) issued to the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+       - Number of bytes transferred to/from the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+       - Writing an int to this file will result in resetting all the stats
+         for that cgroup.
+
+CFQ sysfs tunable
+=================
+/sys/block/<disk>/queue/iosched/slice_idle
+------------------------------------------
+On faster hardware CFQ can be slow, especially with sequential workload.
+This happens because CFQ idles on a single queue and single queue might not
+drive deeper request queue depths to keep the storage busy. In such scenarios
+one can try setting slice_idle=0 and that would switch CFQ to IOPS
+(IO operations per second) mode on NCQ supporting hardware.
+
+That means CFQ will not idle between cfq queues of a cfq group and hence be
+able to drive higher queue depth and achieve better throughput. That also
+means that cfq provides fairness among groups in terms of IOPS and not in
+terms of disk time.
+
+/sys/block/<disk>/queue/iosched/group_idle
+------------------------------------------
+If one disables idling on individual cfq queues and cfq service trees by
+setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
+on the group in an attempt to provide fairness among groups.
+
+By default group_idle is same as slice_idle and does not do anything if
+slice_idle is enabled.
+
+One can experience an overall throughput drop if you have created multiple
+groups and put applications in those groups which are not driving enough
+IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
+on individual groups and throughput should improve.
diff --git a/Documentation/cgroup-legacy/cgroups.txt b/Documentation/cgroup-legacy/cgroups.txt

new file mode 100644 (file)

index 0000000..c6256ae
--- /dev/null
+++ b/Documentation/cgroup-legacy/cgroups.txt
@@ -0,0 +1,682 @@
+                               CGROUPS
+                               -------
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/cgroups/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+  1.1 What are cgroups ?
+  1.2 Why are cgroups needed ?
+  1.3 How are cgroups implemented ?
+  1.4 What does notify_on_release do ?
+  1.5 What does clone_children do ?
+  1.6 How do I use cgroups ?
+2. Usage Examples and Syntax
+  2.1 Basic Usage
+  2.2 Attaching processes
+  2.3 Mounting hierarchies by name
+3. Kernel API
+  3.1 Overview
+  3.2 Synchronization
+  3.3 Subsystem API
+4. Extended attribute usage
+5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy.  Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+       CPU :          "Top cpuset"
+                       /       \
+               CPUSet1         CPUSet2
+                  |               |
+               (Professors)    (Students)
+
+               In addition (system tasks) are attached to topcpuset (so
+               that they can run anywhere) with a limit of 20%
+
+       Memory : Professors (50%), Students (30%), system (20%)
+
+       Disk : Professors (50%), Students (30%), system (20%)
+
+       Network : WWW browsing (20%), Network File System (60%), others (20%)
+                               / \
+               Professors (15%)  students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource class.  This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :))  OR give one of the student's simulation
+apps enhanced CPU power.
+
+With the ability to write PIDs directly to resource classes, it's just a
+matter of:
+
+       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+       (after some time)
+       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+   css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+   cgroup_subsys_state objects, one for each cgroup subsystem
+   registered in the system. There is no direct link from a task to
+   the cgroup of which it's a member in each hierarchy, but this
+   can be determined by following pointers through the
+   cgroup_subsys_state objects. This is because accessing the
+   subsystem state is something that's expected to happen frequently
+   and in performance-critical code, whereas operations that require a
+   task's actual cgroup assignments (in particular, moving between
+   cgroups) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_set, anchored at
+   css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+   manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+   css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel.  When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options.  By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup.  This list
+   is not guaranteed to be sorted.  Writing a thread ID into this file
+   moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup.  This list is
+   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+   should sort/uniquify the list if this property is required.
+   Writing a thread group ID into this file moves all threads in that
+   group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+   exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command.  The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroup's
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks.  A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup.  This enables automatic
+removal of abandoned cgroups.  The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0).  The default value of other cgroups at creation is the current
+value of their parent's notify_on_release setting. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+    /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+
+  mount -t tmpfs cgroup_root /sys/fs/cgroup
+  mkdir /sys/fs/cgroup/cpuset
+  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cgroup Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type:
+# mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first.  For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?' you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group.
+
+# mount -t tmpfs cgroup_root /sys/fs/cgroup
+# mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type:
+# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommended
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To specify a hierarchy's release_agent:
+# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+  xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent:
+# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1:
+# cd /sys/fs/cgroup/rg1
+# mkdir my_cgroup
+
+Now you want to do something with this cgroup.
+# cd my_cgroup
+
+In this directory you can find several files:
+# ls
+cgroup.procs notify_on_release tasks
+(plus whatever files are added by the attached subsystems)
+
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+       ...
+# /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0:
+
+# echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a hierarchy a name.
+
+The name of the hierarchy appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroup.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+  entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+  no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+  at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+
+If a subsystem can be compiled as a module, it should also have in its
+module initcall a call to cgroup_load_subsys(), and in its exitcall a
+call to cgroup_unload_subsys(). It should also set its <name>_subsys.module =
+THIS_MODULE in its .c file.
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+int css_online(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and was made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+void css_offline(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+void css_free(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's css_alloc() method has been called for the new cgroup).
+
+int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+  - it's guaranteed that all are from the same thread group
+  - @tset contains all tasks from the thread group whether or not
+    they're switching cgroups
+  - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+void css_reset(struct cgroup_subsys_state *css)
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. Otherwise it is not
+necessary. This will be called only for subsystems whose can_attach() operations
+have succeeded. The parameters are identical to can_attach().
+
+void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+void fork(struct task_struct *task)
+
+Called when a task is forked into a cgroup.
+
+void exit(struct task_struct *task)
+
+Called during task exit.
+
+void free(struct task_struct *task)
+
+Called when the task_struct is freed.
+
+void bind(struct cgroup *root)
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files.  The current supported types are:
+       - Trusted (XATTR_TRUSTED)
+       - Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage to a minimum.  This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cgroup file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+   put only ONE PID.
+
diff --git a/Documentation/cgroup-legacy/cpuacct.txt b/Documentation/cgroup-legacy/cpuacct.txt

new file mode 100644 (file)

index 0000000..9d73cc0
--- /dev/null
+++ b/Documentation/cgroup-legacy/cpuacct.txt
@@ -0,0 +1,49 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup.
+
+# cd /sys/fs/cgroup
+# mkdir g1
+# echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ unit.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-legacy/cpusets.txt b/Documentation/cgroup-legacy/cpusets.txt

new file mode 100644 (file)

index 0000000..fdf7dff
--- /dev/null
+++ b/Documentation/cgroup-legacy/cpusets.txt
@@ -0,0 +1,839 @@
+                               CPUSETS
+                               -------
+
+Copyright (C) 2004 BULL SA.
+Written by Simon.Derr@bull.net
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Paul Menage <menage@google.com>
+Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+CONTENTS:
+=========
+
+1. Cpusets
+  1.1 What are cpusets ?
+  1.2 Why are cpusets needed ?
+  1.3 How are cpusets implemented ?
+  1.4 What are exclusive cpusets ?
+  1.5 What is memory_pressure ?
+  1.6 What is memory spread ?
+  1.7 What is sched_load_balance ?
+  1.8 What is sched_relax_domain_level ?
+  1.9 How do I use cpusets ?
+2. Usage Examples and Syntax
+  2.1 Basic Usage
+  2.2 Adding/removing cpus
+  2.3 Setting flags
+  2.4 Attaching processes
+3. Questions
+4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks.   In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset.  They form a nested
+hierarchy visible in a virtual file system.  These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/cgroup-legacy/cgroups.txt.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset.  The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+    * Web Servers running multiple instances of the same web application,
+    * Servers running different applications (for instance, a web server
+      and a database), or
+    * NUMA systems running large HPC applications with demanding
+      performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs' pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets.  It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extend these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+   kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+   in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+   allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+   those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the system's CPUs and Memory
+   Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+   of the parent's CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+   browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+   cpuset (except direct ancestors and descendants) may contain
+   any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+   allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+   the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+   Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel.  No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example:
+
+  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
+  Cpus_allowed_list:      0-127
+  Mems_allowed:   ffffffff,ffffffff
+  Mems_allowed_list:      0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpuset's nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag:  is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command.  The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpuset's directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset.  A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parent's.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps an
+exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up in
+use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure.  It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==> Unless this feature is enabled by writing "1" to the special file
+    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+    code of __alloc_pages() for this metric reduces to simply noticing
+    that the cpuset_memory_pressure_enabled flag is zero.  So only
+    systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+    Because this meter is per-cpuset, rather than per-task or mm,
+    the system load imposed by a batch scheduler monitoring this
+    metric is sharply reduced on large systems, because a scan of
+    the tasklist can be avoided on each set of queries.
+
+    Because this meter is a running average, instead of an accumulating
+    counter, a batch scheduler can detect memory pressure with a
+    single read, instead of having to read and accumulate results
+    for a period of time.
+
+    Because this meter is per-cpuset rather than per-task or mm,
+    the batch scheduler can obtain the key information, memory
+    pressure in a cpuset, with a single read, rather than having to
+    query and accumulate results over all the (dynamically changing)
+    set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures.  They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+are modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the job's cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the job's cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks.  If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced.  So the scheduler
+has support to partition the system's CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+    If the system is managed using cpusets to place independent jobs
+    on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+    system overhead on those CPUs, including avoiding task load
+    balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets.  Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest.  Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding.  So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both.  We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration.  If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above.  In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.)  When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can.  It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+   and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In sched domain, the scheduler migrates tasks in 2 ways; periodic load
+balance on tick, and at time of some schedule events.
+
+When a task is woken up, the scheduler tries to move the task to an idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and is idle,
+then the scheduler migrates task B to CPU Y so that task B can start on
+CPU Y without waiting for task A on CPU X.
+
+And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course, since it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+every time.  In fact, in some architectures, the searching ranges on
+events are limited to the same socket or node where the CPU is located,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
+is idle while CPU X and the siblings are busy, the scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As a result, task B on CPU X needs to wait for task A or wait for load
+balance on the next tick.  For some applications in special situations,
+waiting 1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like.  This file takes an int value which
+indicates the size of the searching range in levels, ideally as follows;
+otherwise it holds the initial value -1, which indicates the cpuset has
+no request.
+
+  -1  : no request. use system default or follow request of others.
+   0  : no search.
+   1  : search siblings (hyperthreads in a core).
+   2  : search cores in a package.
+   3  : search cpus in a node [= system wide on non-NUMA system]
+   4  : search nodes in a chunk of node [on NUMA system]
+   5  : search system wide [on NUMA system]
+
+The system default is architecture dependent.  The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affects the sched domain to which the cpuset
+belongs.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
+there is no sched domain belonging to the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used.  Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+ - The migration costs between each cpu can be assumed considerably
+   small (for you) due to your special application's behavior or
+   special hardware support for CPU cache etc.
+ - The searching cost doesn't have impact (for you) or you can make
+   the searching cost small enough by managing cpuset to compact etc.
+ - The latency is required even if it sacrifices cache hit rate etc.
+then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpuset's memory placement.  If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change).  If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately.  Similarly,
+if a task's pid is written to another cpuset's 'cpuset.tasks' file, then its
+allowed CPU placement is changed immediately.  If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated on, so long as it remains allocated, even if the
+cpuset's memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'cpuset.mems'.
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above.  If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus.  But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching.  In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs.  When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well.  In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above.  GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it.  It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+    the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+    /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will set up a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset:
+
+  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+  cd /sys/fs/cgroup/cpuset
+  mkdir Charlie
+  cd Charlie
+  /bin/echo 2-3 > cpuset.cpus
+  /bin/echo 1 > cpuset.mems
+  /bin/echo $$ > tasks
+  sh
+  # The subshell 'sh' is now running in cpuset Charlie
+  # The next line should display '/Charlie'
+  cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+   cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+   (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+   (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
+# cd /sys/fs/cgroup/cpuset
+# mkdir my_cpuset
+
+Now you want to do something with this cpuset.
+# cd my_cpuset
+
+In this directory you can find several files:
+# ls
+cgroup.clone_children  cpuset.memory_pressure
+cgroup.event_control   cpuset.memory_spread_page
+cgroup.procs           cpuset.memory_spread_slab
+cpuset.cpu_exclusive   cpuset.mems
+cpuset.cpus            cpuset.sched_load_balance
+cpuset.mem_exclusive   cpuset.sched_relax_domain_level
+cpuset.mem_hardwall    notify_on_release
+cpuset.memory_migrate  tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties.  By writing to these files you can manipulate
+the cpuset.
+
+Set some flags:
+# /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus:
+# /bin/echo 0-7 > cpuset.cpus
+
+Add some mems:
+# /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset:
+# /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir:
+# rmdir my_sub_cs
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command
+
+mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to
+
+mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories:
+
+# /bin/echo 1-4 > cpuset.cpus          -> set cpus list to cpus 1,2,3,4
+# /bin/echo 1,2,3,4 > cpuset.cpus      -> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset:
+
+# /bin/echo 1-4,6 > cpuset.cpus        -> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs:
+
+# /bin/echo "" > cpuset.cpus           -> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple:
+
+# /bin/echo 1 > cpuset.cpu_exclusive   -> set flag 'cpuset.cpu_exclusive'
+# /bin/echo 0 > cpuset.cpu_exclusive   -> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+       ...
+# /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+   errors. If you use it in the cpuset file system, you won't be
+   able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+   put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-legacy/devices.txt b/Documentation/cgroup-legacy/devices.txt

new file mode 100644 (file)

index 0000000..3c1095c
--- /dev/null
+++ b/Documentation/cgroup-legacy/devices.txt
@@ -0,0 +1,116 @@
+Device Whitelist Controller
+
+1. Description:
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files.  A device cgroup associates a device access
+whitelist with each cgroup.  A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block).  'all' means it applies
+to all types and all major and minor numbers.  Major and minor are
+either an integer or * for all.  Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'.  A child device
+cgroup gets a copy of the parent.  Administrators can then remove
+devices from the whitelist or add new entries.  A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+
+An entry is added using devices.allow, and removed using
+devices.deny.  For instance
+
+       echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null.  Doing
+
+       echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing
+
+       echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+
+Any task can move itself between cgroups.  This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this.  We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD.  We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one.  Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup.  (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example:
+      A
+     / \
+        B
+
+    group        behavior      exceptions
+    A            allow         "b 8:* rwm", "c 116:1 rw"
+    B            deny          "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A:
+       # echo "c 116:* r" > A/devices.deny
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed:
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated:
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding "c *:3 rwm":
+       # echo "c *:3 rwm" >A/devices.allow
+
+the result:
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B:
+       # echo "c 2:3 rwm" >B/devices.allow
+       # echo "c 50:3 r" >B/devices.allow
+or even
+       # echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroup has children.
+
+4.1 Hierarchy (internal implementation)
+
+Device cgroups are implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/Documentation/cgroup-legacy/freezer-subsystem.txt b/Documentation/cgroup-legacy/freezer-subsystem.txt

new file mode 100644 (file)

index 0000000..e831cb2
--- /dev/null
+++ b/Documentation/cgroup-legacy/freezer-subsystem.txt
@@ -0,0 +1,123 @@
+The cgroup freezer is useful to batch job management systems which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells:
+
+       $ echo $$
+       16644
+       $ bash
+       $ echo $$
+       16690
+
+       From a second, unrelated bash shell:
+       $ kill -SIGSTOP 16690
+       $ kill -SIGCONT 16690
+
+       <at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+  When read, returns the effective state of the cgroup - "THAWED",
+  "FREEZING" or "FROZEN". This is the combined self and parent-states.
+  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+  FREEZING cgroup transitions into FROZEN state when all tasks
+  belonging to the cgroup and its descendants become frozen. Note that
+  a cgroup reverts to FREEZING from FROZEN after a new task is added
+  to the cgroup or one of its descendant cgroups until the new task is
+  frozen.
+
+  When written, sets the self-state of the cgroup. Two values are
+  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+  if not already freezing, enters FREEZING state along with all its
+  descendant cgroups.
+
+  If THAWED is written, the self-state of the cgroup is changed to
+  THAWED.  Note that the effective state may not change to THAWED if
+  the parent-state is still freezing. If a cgroup's effective state
+  becomes THAWED, all its descendants which are freezing because of
+  the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+  This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+  Shows the parent-state.  0 if none of the cgroup's ancestors is
+  frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage :
+
+   # mkdir /sys/fs/cgroup/freezer
+   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+   # mkdir /sys/fs/cgroup/freezer/0
+   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem :
+
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container :
+
+   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FREEZING
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container :
+
+   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+   # cat /sys/fs/cgroup/freezer/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space tasks
+in a simple scenario.
diff --git a/Documentation/cgroup-legacy/hugetlb.txt b/Documentation/cgroup-legacy/hugetlb.txt

new file mode 100644 (file)

index 0000000..106245c
--- /dev/null
+++ b/Documentation/cgroup-legacy/hugetlb.txt
@@ -0,0 +1,45 @@
+HugeTLB Controller
+-------------------
+
+The HugeTLB controller allows limiting the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that
+the application will get a SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how many
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup.
+
+# cd /sys/fs/cgroup
+# mkdir g1
+# echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files
+
+ hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt            # show the number of allocation failures due to HugeTLB limit
+
+For a system supporting two hugepage sizes (16M and 16G) the control
+files include:
+
+hugetlb.16GB.limit_in_bytes
+hugetlb.16GB.max_usage_in_bytes
+hugetlb.16GB.usage_in_bytes
+hugetlb.16GB.failcnt
+hugetlb.16MB.limit_in_bytes
+hugetlb.16MB.max_usage_in_bytes
+hugetlb.16MB.usage_in_bytes
+hugetlb.16MB.failcnt
diff --git a/Documentation/cgroup-legacy/memcg_test.txt b/Documentation/cgroup-legacy/memcg_test.txt

new file mode 100644 (file)

index 0000000..8870b02
--- /dev/null
+++ b/Documentation/cgroup-legacy/memcg_test.txt
@@ -0,0 +1,280 @@
+Memory Resource Controller(Memcg)  Implementation Memo.
+Last Updated: 2010/2
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/cgroup-legacy/memory.txt
+
+0. How to record usage ?
+   2 objects are used.
+
+   page_cgroup ....an object per page.
+       Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+   swap_cgroup ... an entry per swp_entry.
+       Allocated at swapon(). Freed at swapoff().
+
+   The page_cgroup has USED bit and double count against a page_cgroup never
+   occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+
+   a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+       mem_cgroup_try_charge()
+
+2. Uncharge
+  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+       mem_cgroup_uncharge()
+         Called when a page's refcount goes down to 0.
+
+       mem_cgroup_uncharge_swap()
+         Called when swp_entry's refcnt goes down to 0. A charge against swap
+         disappears.
+
+3. charge-commit-cancel
+       Memcg pages are charged in two steps:
+               mem_cgroup_try_charge()
+               mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+       At try_charge(), there are no flags to say "this page is charged".
+       At this point, usage += PAGE_SIZE.
+
+       At commit(), the page is associated with the memcg.
+
+       At cancel(), simply usage -= PAGE_SIZE.
+
+In the explanation below, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+       Anonymous page is newly allocated at
+                 - page fault into MAP_ANONYMOUS mapping.
+                 - Copy-On-Write.
+
+       4.1 Swap-in.
+       At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+       (a) If the SwapCache is newly allocated and read, it has no charges.
+       (b) If the SwapCache has been mapped by processes, it has been
+           charged already.
+
+       4.2 Swap-out.
+       At swap-out, typical state transition is below.
+
+       (a) add to swap cache. (marked as SwapCache)
+           swp_entry's refcnt += 1.
+       (b) fully unmapped.
+           swp_entry's refcnt += # of ptes.
+       (c) write back to swap.
+       (d) delete from swap cache. (remove from SwapCache)
+           swp_entry's refcnt -= 1.
+
+
+       Finally, at task exit,
+       (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+       Page Cache is charged at
+       - add_to_page_cache_locked().
+
+       The logic is very clear. (About migration, see below)
+       Note: __remove_from_page_cache() is called by remove_from_page_cache()
+       and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+       The best way to understand shmem's page state transition is to read
+       mm/shmem.c.
+       But brief explanation of the behavior of memcg around shmem will be
+       helpful to understand the logic.
+
+       Shmem's page (just leaf page, not direct/indirect block) can be on
+               - radix-tree of shmem's inode.
+               - SwapCache.
+               - Both on radix-tree and SwapCache. This happens at swap-in
+                 and swap-out.
+
+       It's charged when...
+       - A new page is added to shmem's radix-tree.
+       - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+
+       mem_cgroup_migrate()
+
+8. LRU
+       Each memcg has its own private LRU. Now, its handling is under the
+       global VM's control (meaning it's handled under the global
+       zone->lru_lock). Almost all routines around memcg's LRU are called by
+       the global LRU's list management functions under zone->lru_lock.
+
+       A special function is mem_cgroup_isolate_pages(). This scans
+       memcg's private LRU and calls __isolate_lru_page() to extract a page
+       from the LRU.
+       (By __isolate_lru_page(), the page is removed from both the global and
+        private LRUs.)
+
+
+9. Typical Tests.
+
+ Tests for racy cases.
+
+ 9.1 Small limit to memcg.
+       When testing racy cases, it's a good idea to set memcg's limit to be
+       very small rather than in the GB range. Many races have been found in
+       tests under xKB or xxMB limits.
+       (Memory behavior under a GB limit and under a MB limit is very
+        different.)
+
+ 9.2 Shmem
+       Historically, memcg's shmem handling was poor and we saw some amount
+       of trouble here. This is because shmem is page-cache but can be
+       SwapCache. Testing with shmem/tmpfs is always a good test.
+
+ 9.3 Migration
+       For NUMA, migration is another special case. For easy testing, cpuset
+       is useful. The following is a sample script to do migration.
+
+       mount -t cgroup -o cpuset none /opt/cpuset
+
+       mkdir /opt/cpuset/01
+       echo 1 > /opt/cpuset/01/cpuset.cpus
+       echo 0 > /opt/cpuset/01/cpuset.mems
+       echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+       mkdir /opt/cpuset/02
+       echo 1 > /opt/cpuset/02/cpuset.cpus
+       echo 1 > /opt/cpuset/02/cpuset.mems
+       echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+       With the above setup, when you move a task from 01 to 02, page migration
+       from node 0 to node 1 will occur. The following is a script to migrate
+       all tasks under a cpuset.
+       --
+       move_task()
+       {
+       for pid in $1
+        do
+                /bin/echo $pid >$2/tasks 2>/dev/null
+               echo -n $pid
+               echo -n " "
+        done
+       echo END
+       }
+
+       G1_TASK=`cat ${G1}/tasks`
+       G2_TASK=`cat ${G2}/tasks`
+       move_task "${G1_TASK}" ${G2} &
+       --
+ 9.4 Memory hotplug.
+       Memory hotplug is another good test.
+       To offline memory, do the following.
+       # echo offline > /sys/devices/system/memory/memoryXXX/state
+       (XXX is the place of memory)
+       This is an easy way to test page migration, too.
+
+ 9.5 mkdir/rmdir
+       When using hierarchy, mkdir/rmdir test should be done.
+       Use tests like the following.
+
+       echo 1 >/opt/cgroup/01/memory.use_hierarchy
+       mkdir /opt/cgroup/01/child_a
+       mkdir /opt/cgroup/01/child_b
+
+       set limit to 01.
+       add limit to 01/child_b
+       run jobs under child_a and child_b
+
+       create/delete following groups at random while jobs are running.
+       /opt/cgroup/01/child_a/child_aa
+       /opt/cgroup/01/child_b/child_bb
+       /opt/cgroup/01/child_c
+
+       Running new jobs in a new group is also good.
+
+ 9.6 Mount with other subsystems.
+       Mounting with other subsystems is a good test because there is a
+       race and lock dependency with other cgroup subsystems.
+
+       example)
+       # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+       and do task move, mkdir, rmdir etc...under this.
+
+ 9.7 swapoff.
+       Besides swap management being one of the complicated parts of memcg,
+       the call path of swap-in at swapoff is not the same as the usual
+       swap-in path. It's worth testing explicitly.
+
+       For example, a test like the following is good.
+       (Shell-A)
+       # mount -t cgroup none /cgroup -o memory
+       # mkdir /cgroup/test
+       # echo 40M > /cgroup/test/memory.limit_in_bytes
+       # echo 0 > /cgroup/test/tasks
+       Run malloc(100M) program under this. You'll see 60M of swaps.
+       (Shell-B)
+       # move all tasks in /cgroup/test to /cgroup
+       # /sbin/swapoff -a
+       # rmdir /cgroup/test
+       # kill malloc task.
+
+       Of course, tmpfs vs. swapoff should be tested, too.
+
+ 9.8 OOM-Killer
+       Out-of-memory caused by memcg's limit will kill tasks under
+       the memcg. When hierarchy is used, a task under hierarchy
+       will be killed by the kernel.
+       In this case, panic_on_oom shouldn't be invoked and tasks
+       in other groups shouldn't be killed.
+
+       It's not difficult to cause OOM under memcg as follows.
+       Case A) when you can swapoff
+       #swapoff -a
+       #echo 50M > /memory.limit_in_bytes
+       run 51M of malloc
+
+       Case B) when you use mem+swap limitation.
+       #echo 50M > memory.limit_in_bytes
+       #echo 50M > memory.memsw.limit_in_bytes
+       run 51M of malloc
+
+ 9.9 Move charges at task migration
+       Charges associated with a task can be moved along with task migration.
+
+       (Shell-A)
+       #mkdir /cgroup/A
+       #echo $$ >/cgroup/A/tasks
+       run some programs which use some amount of memory in /cgroup/A.
+
+       (Shell-B)
+       #mkdir /cgroup/B
+       #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+       #echo "pid of the program running in group A" >/cgroup/B/tasks
+
+       You can see charges have been moved by reading *.usage_in_bytes or
+       memory.stat of both A and B.
+       See 8.2 of Documentation/cgroup-legacy/memory.txt to see what value should be
+       written to move_charge_at_immigrate.
+
+ 9.10 Memory thresholds
+       Memory controller implements memory thresholds using cgroups notification
+       API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+       (Shell-A) Create cgroup and run event listener
+       # mkdir /cgroup/A
+       # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+       (Shell-B) Add task to cgroup and try to allocate and free memory
+       # echo $$ >/cgroup/A/tasks
+       # a="$(dd if=/dev/zero bs=1M count=10)"
+       # a=
+
+       You will see a message from cgroup_event_listener every time you cross
+       the thresholds.
+
+       Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+       It's a good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-legacy/memory.txt b/Documentation/cgroup-legacy/memory.txt

new file mode 100644 (file)

index 0000000..ff71e16
--- /dev/null
+++ b/Documentation/cgroup-legacy/memory.txt
@@ -0,0 +1,876 @@
+Memory Resource Controller
+
+NOTE: This document is hopelessly outdated and it asks for a complete
+      rewrite. It still contains useful information so we are keeping it
+      here but make sure to check the current code if you need a deeper
+      understanding.
+
+NOTE: The Memory Resource Controller has generically been referred to as the
+      memory controller in this document. Do not confuse memory controller
+      used here with the memory controller that is used in hardware.
+
+(For editors)
+In this document:
+      When we mention a cgroup (cgroupfs's directory) with memory controller,
+      we call it "memory cgroup". When you see git-log and source code, you'll
+      see patch's title and function names tend to use "memcg".
+      In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+   Memory-hungry applications can be isolated and limited to a smaller
+   amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+   as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+   to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+   rest of the system to ensure that burning does not fail due to lack
+   of available memory.
+e. There are several other use cases; find one or use the controller just
+   for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basic functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+ tasks                          # attach a task(thread) and show list of threads
+ cgroup.procs                   # show list of processes
+ cgroup.event_control           # an interface for event_fd()
+ memory.usage_in_bytes          # show current usage for memory
+                                (See 5.5 for details)
+ memory.memsw.usage_in_bytes    # show current usage for memory+Swap
+                                (See 5.5 for details)
+ memory.limit_in_bytes          # set/show limit of memory usage
+ memory.memsw.limit_in_bytes    # set/show limit of memory+Swap usage
+ memory.failcnt                         # show the number of memory usage hits limits
+ memory.memsw.failcnt           # show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes      # show max memory usage recorded
+ memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes     # set/show soft limit of memory usage
+ memory.stat                    # show various statistics
+ memory.use_hierarchy           # set/show hierarchical account enabled
+ memory.force_empty             # trigger forced move charge to parent
+ memory.pressure_level          # set memory pressure notifications
+ memory.swappiness              # set/show swappiness parameter of vmscan
+                                (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate # set/show controls of moving charges
+ memory.oom_control             # set/show oom controls.
+ memory.numa_stat               # show the number of memory usage per numa node
+
+ memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes      # show current kernel memory allocation
+ memory.kmem.failcnt             # show the number of kernel memory usage hits limits
+ memory.kmem.max_usage_in_bytes  # show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes  # show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt            # show the number of tcp buf memory usage hits limits
+ memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
+
+1. History
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+
+               +--------------------+
+               |  mem_cgroup        |
+               |  (page_counter)    |
+               +--------------------+
+                /            ^      \
+               /             |       \
+           +---------------+  |        +---------------+
+           | mm_struct     |  |....    | mm_struct     |
+           |               |  |        |               |
+           +---------------+  |        +---------------+
+                              |
+                              + --------------+
+                                              |
+           +---------------+           +------+--------+
+           | page          +---------->  page_cgroup|
+           |               |           |               |
+           +---------------+           +---------------+
+
+             (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+   cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, following files are added.
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+* why 'memory+swap' rather than swap.
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+* What happens when a cgroup hits memory.memsw.limit_in_bytes
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can do swapout memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
+Note2: When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+
+   lock_page_cgroup()/unlock_page_cgroup() should not be called under
+   mapping->tree_lock.
+
+   Other lock order is following:
+   PG_locked.
+   mm->page_table_lock
+       zone->lru_lock
+         lock_page_cgroup.
+  In many cases, just lock_page_cgroup() is called.
+  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+  zone->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory won't be accounted at all until limit on a group is set. This
+allows for existing setups to continue working without disruption.  The limit
+cannot be set if the cgroup has children, or if there are already tasks in the
+cgroup. Attempting to set the limit under those conditions will return -EBUSY.
+When use_hierarchy == 1 and a group is accounted, its children will
+automatically be accounted regardless of their limit value.
+
+After a group is first limited, it will be kept being accounted until it
+is removed. The memory limitation itself, can of course be removed by writing
+-1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not
+limited.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+
+* stack pages: every process consumes some stack pages. By accounting into
+kernel memory, we prevent new processes from being created when the kernel
+memory usage is too high.
+
+* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
+of each kmem_cache is created the first time the cache is touched
+from inside the memcg. The creation is done lazily, so some objects can still be
+skipped while the cache is being created. All objects in a slab page should
+belong to the same memcg. This only fails to hold when a task is migrated to a
+different memcg during the page allocation by the cache.
+
+* sockets memory pressure: some sockets protocols have memory pressure
+thresholds. The Memory Controller allows them to be controlled individually
+per cgroup, instead of globally.
+
+* tcp memory pressure: sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+    U != 0, K = unlimited:
+    This is the standard memcg limitation mechanism already present before kmem
+    accounting. Kernel memory is completely ignored.
+
+    U != 0, K < U:
+    Kernel memory is a subset of the user memory. This setup is useful in
+    deployments where the total amount of memory per-cgroup is overcommitted.
+    Overcommitting kernel memory limits is definitely not recommended, since the
+    box can still run out of non-reclaimable memory.
+    In this case, the admin could set up K so that the sum of all groups is
+    never greater than the total memory, and freely set U at the cost of his
+    QoS.
+    WARNING: In the current implementation, memory reclaim will NOT be
+    triggered for a cgroup when it hits K while staying below U, which makes
+    this setup impractical.
+
+    U != 0, K >= U:
+    Since kmem charges will also be fed to the user counter, reclaim will be
+    triggered for the cgroup for both kinds of memory. This setup gives the
+    admin a unified view of memory, and it is also useful for people who just
+    want to track kernel memory usage.
+
+3. User Interface
+
+3.0. Configuration
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+# mount -t tmpfs none /sys/fs/cgroup
+# mkdir /sys/fs/cgroup/memory
+# mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it
+# mkdir /sys/fs/cgroup/memory/0
+# echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit:
+# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
+
+NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
+
+# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+4194304
+
+We can check the usage:
+# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel.
+
+# echo 1 > memory.limit_in_bytes
+# cat memory.limit_in_bytes
+4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. When measuring parallel page
+faults, a multi-process test may be better than a multi-thread test,
+because the latter has noise from shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information are not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces.
+
+5.1 force_empty
+  memory.force_empty interface is provided to make cgroup's memory usage empty.
+  When writing anything to this
+
+  # echo 0 > memory.force_empty
+
+  the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+  The typical use case for this interface is before calling rmdir().
+  Because rmdir() moves all pages to parent, some out-of-use page caches can be
+  moved to the parent. If you want to avoid that, force_empty will be useful.
+
+  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+  kernel pages will still be seen. This is not considered a failure and the
+  write will still return success. In this case, it is expected that
+  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+  About use_hierarchy, see Section 6.
+
+5.2 stat file
+
+memory.stat file includes following statistics
+
+# per-memory cgroup local status
+cache          - # of bytes of page cache memory.
+rss            - # of bytes of anonymous and swap cache memory (includes
+               transparent hugepages).
+rss_huge       - # of bytes of anonymous transparent hugepages.
+mapped_file    - # of bytes of mapped file (includes tmpfs/shmem)
+pgpgin         - # of charging events to the memory cgroup. The charging
+               event happens each time a page is accounted as either mapped
+               anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout                - # of uncharging events to the memory cgroup. The uncharging
+               event happens each time a page is unaccounted from the cgroup.
+swap           - # of bytes of swap usage
+dirty          - # of bytes that are waiting to get written back to the disk.
+writeback      - # of bytes of file/anon cache that are queued for syncing to
+               disk.
+inactive_anon  - # of bytes of anonymous and swap cache memory on inactive
+               LRU list.
+active_anon    - # of bytes of anonymous and swap cache memory on active
+               LRU list.
+inactive_file  - # of bytes of file-backed memory on inactive LRU list.
+active_file    - # of bytes of file-backed memory on active LRU list.
+unevictable    - # of bytes of memory that cannot be reclaimed (mlocked etc).
+
+# status considering hierarchy (see memory.use_hierarchy settings)
+
+hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
+                       under which the memory cgroup is
+hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
+                       hierarchy under which memory cgroup is.
+
+total_<counter>                - # hierarchical version of <counter>, which in
+                       addition to the cgroup's own value includes the
+                       sum of all hierarchical children's values of
+                       <counter>, i.e. total_cache
+
+# The following additional stats are dependent on CONFIG_DEBUG_VM.
+
+recent_rotated_anon    - VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file    - VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon    - VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file    - VM internal parameter. (see mm/vmscan.c)
+
+Memo:
+       recent_rotated means recent frequency of LRU rotation.
+       recent_scanned means recent # of scans to LRU.
+       These are shown for debugging; please see the code for their meanings.
+
+Note:
+       Only anonymous and swap cache memory is listed as part of 'rss' stat.
+       This should not be confused with the true 'resident set size' or the
+       amount of physical memory used by the cgroup.
+       'rss + mapped_file' will give you resident set size of cgroup.
+       (Note: file and shmem may be shared among other cgroups. In that case,
+        mapped_file is accounted only when the memory cgroup is owner of page
+        cache.)
+
+5.3 swappiness
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents from any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file.
+# echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+
+For efficiency, as other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzzy
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+
+This is similar to numa_maps but operates on a per-memcg basis.  This is
+useful for providing visibility into the numa locality information within
+a memcg since the pages are allowed to be allocated from any physical
+node.  One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is:
+
+total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
+hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy
+
+              root
+            /  |   \
+            /  |    \
+          a    b     c
+                     | \
+                     |  \
+                     d   e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e is accounted to its ancestors up until the root (i.e., c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
+
+# echo 1 > memory.use_hierarchy
+
+The feature can be disabled by
+
+# echo 0 > memory.use_hierarchy
+
+NOTE1: Enabling/disabling will fail if either the cgroup already has other
+       cgroups created below it, or if the parent cgroup has use_hierarchy
+       enabled.
+
+NOTE2: When panic_on_oom is set to "2", the whole system will panic in
+       case of an OOM event in any cgroup.
+
+7. Soft limits
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 MiB)
+
+# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use
+
+# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1: Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2: It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it:
+
+# echo (some positive value) > memory.move_charge_at_immigrate
+
+Note: Each bit of move_charge_at_immigrate has its own meaning about what type
+      of charges should be moved. See 8.2 for details.
+Note: Charges are moved only when you move mm->owner, in other words,
+      a leader of a thread group.
+Note: If we cannot find enough space for the task in the destination cgroup, we
+      try to make space by reclaiming memory. Task migration may fail if we
+      cannot make enough space.
+Note: It can take several seconds if you move a large amount of charges.
+
+And if you want to disable it again:
+
+# echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
+  bit | what type of charges would be moved ?
+ -----+------------------------------------------------------------------------
+   0  | A charge of an anonymous page (or swap of it) used by the target task.
+      | You must enable Swap Extension (see 2.4) to enable move of swap charges.
+ -----+------------------------------------------------------------------------
+   1  | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
+      | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
+      | anonymous pages, file pages (and swaps) in the range mmapped by the task
+      | will be moved even if the task hasn't done page fault, i.e. they might
+      | not be the task's "RSS", but other task's "RSS" that maps the same file.
+      | And mapcount of the page is ignored (the page can be moved even if
+      | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
+      | enable move of swap charges.
+
+8.3 TODO
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+  behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows registering multiple memory and memsw
+thresholds and getting notifications when usage crosses them.
+
+To register a threshold, an application must:
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+  cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable to both root and non-root cgroups.
+
+10. OOM Control
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (see cgroups.txt). It allows registering multiple OOM notification
+deliveries and getting a notification when OOM happens.
+
+To register a notifier, an application must:
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+   cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
+
+       #echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+To let them run again, you have to relax the memory cgroup's OOM status by
+       * enlarging the limit or reducing usage.
+To reduce usage,
+       * kill some tasks.
+       * move some tasks to other group with account migration.
+       * remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+       oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
+       under_oom        0 or 1 (if 1, the memory cgroup is under OOM, tasks may
+                                be stopped.)
+
+11. Memory Pressure
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as follows:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shut down unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be swapping, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to run out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+The events are propagated upward until the event is handled, i.e. the
+events are not pass-through. Here is what this means: for example you have
+three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
+and C, and suppose group C experiences some pressure. In this situation,
+only group C will receive the notification, i.e. groups A and B will not
+receive it. This is done to avoid excessive "broadcasting" of messages,
+which disturbs the system and which is especially bad if we are low on
+memory or thrashing. So, organize the cgroups wisely, or propagate the
+events manually (or, ask us to implement the pass-through events,
+explaining why you would need them.)
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string like "<event_fd> <fd of memory.pressure_level> <level>"
+  to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are not implemented.
+
+Test:
+
+   Here is a small script example that makes a new cgroup, sets up a
+   memory limit, sets up a notification in the cgroup and then makes child
+   cgroup experience a critical pressure:
+
+   # cd /sys/fs/cgroup/memory/
+   # mkdir foo
+   # cd foo
+   # cgroup_event_listener memory.pressure_level low &
+   # echo 8000000 > memory.limit_in_bytes
+   # echo 8000000 > memory.memsw.limit_in_bytes
+   # echo $$ > tasks
+   # dd if=/dev/zero | read x
+
+   (Expect a bunch of notifications, and eventually, the oom-killer will
+   trigger.)
+
+12. TODO
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+   not yet hit but the usage is getting closer
+
+Summary
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+   http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+   http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+   http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+   http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+   subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+   http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+   http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+    http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+    http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-legacy/net_cls.txt b/Documentation/cgroup-legacy/net_cls.txt

new file mode 100644 (file)

index 0000000..ec18234
--- /dev/null
+++ b/Documentation/cgroup-legacy/net_cls.txt
@@ -0,0 +1,39 @@
+Network classifier cgroup
+-------------------------
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example:
+mkdir /sys/fs/cgroup/net_cls
+mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+mkdir /sys/fs/cgroup/net_cls/0
+echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
+       - setting a 10:1 handle.
+
+cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+1048577
+
+configuring tc:
+tc qdisc add dev eth0 root handle 10: htb
+
+tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+ - creating traffic class 10:1
+
+tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example:
+iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-legacy/net_prio.txt b/Documentation/cgroup-legacy/net_prio.txt

new file mode 100644 (file)

index 0000000..a82cbd2
--- /dev/null
+++ b/Documentation/cgroup-legacy/net_prio.txt
@@ -0,0 +1,55 @@
+Network priority cgroup
+-------------------------
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications.
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option.  This, however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+   decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific:
+
+net_prio.prioidx
+This file is read-only, and is simply informative.  It contains a unique integer
+value that the kernel uses as an internal representation of this cgroup.
+
+net_prio.ifpriomap
+This file contains a map of the priorities assigned to traffic originating from
+processes in this group and egressing the system on various interfaces. It
+contains a list of tuples in the form <ifname priority>.  Contents of this file
+can be modified by echoing a string into the file using the same tuple format.
+For example:
+
+echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-legacy/pids.txt b/Documentation/cgroup-legacy/pids.txt

new file mode 100644 (file)

index 0000000..1a078b5
--- /dev/null
+++ b/Documentation/cgroup-legacy/pids.txt
@@ -0,0 +1,85 @@
+                                                  Process Number Controller
+                                                  =========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+Example
+-------
+
+First, we mount the pids controller:
+# mkdir -p /sys/fs/cgroup/pids
+# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it:
+# mkdir -p /sys/fs/cgroup/pids/parent/child
+# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail:
+
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's):
+
+# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.max
+max
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current):
+
+# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+#
diff --git a/Documentation/cgroup.txt b/Documentation/cgroup.txt

new file mode 100644 (file)

index 0000000..31d1f7b
--- /dev/null
+++ b/Documentation/cgroup.txt
@@ -0,0 +1,1293 @@
+
+Control Group v2
+
+October, 2015          Tejun Heo <tj@kernel.org>
+
+This is the authoritative documentation on the design, interface and
+conventions of cgroup v2.  It describes all userland-visible aspects
+of cgroup including core and specific controller behaviors.  All
+future changes must be reflected in this document.  Documentation for
+v1 is available under Documentation/cgroup-legacy/.
+
+CONTENTS
+
+1. Introduction
+  1-1. Terminology
+  1-2. What is cgroup?
+2. Basic Operations
+  2-1. Mounting
+  2-2. Organizing Processes
+  2-3. [Un]populated Notification
+  2-4. Controlling Controllers
+    2-4-1. Enabling and Disabling
+    2-4-2. Top-down Constraint
+    2-4-3. No Internal Process Constraint
+  2-5. Delegation
+    2-5-1. Model of Delegation
+    2-5-2. Delegation Containment
+  2-6. Guidelines
+    2-6-1. Organize Once and Control
+    2-6-2. Avoid Name Collisions
+3. Resource Distribution Models
+  3-1. Weights
+  3-2. Limits
+  3-3. Protections
+  3-4. Allocations
+4. Interface Files
+  4-1. Format
+  4-2. Conventions
+  4-3. Core Interface Files
+5. Controllers
+  5-1. CPU
+    5-1-1. CPU Interface Files
+  5-2. Memory
+    5-2-1. Memory Interface Files
+    5-2-2. Usage Guidelines
+    5-2-3. Memory Ownership
+  5-3. IO
+    5-3-1. IO Interface Files
+    5-3-2. Writeback
+P. Information on Kernel Programming
+  P-1. Filesystem Support for Writeback
+D. Deprecated v1 Core Features
+R. Issues with v1 and Rationales for v2
+  R-1. Multiple Hierarchies
+  R-2. Thread Granularity
+  R-3. Competition Between Inner Nodes and Threads
+  R-4. Other Interface Issues
+  R-5. Controller Issues and Remedies
+    R-5-1. Memory
+
+
+1. Introduction
+
+1-1. Terminology
+
+"cgroup" stands for "control group" and is never capitalized.  The
+singular form is used to designate the whole feature and also as a
+qualifier as in "cgroup controllers".  When explicitly referring to
+multiple individual control groups, the plural form "cgroups" is used.
+
+
+1-2. What is cgroup?
+
+cgroup is a mechanism to organize processes hierarchically and
+distribute system resources along the hierarchy in a controlled and
+configurable manner.
+
+cgroup is largely composed of two parts - the core and controllers.
+cgroup core is primarily responsible for hierarchically organizing
+processes.  A cgroup controller is usually responsible for
+distributing a specific type of system resource along the hierarchy
+although there are utility controllers which serve purposes other than
+resource distribution.
+
+cgroups form a tree structure and every process in the system belongs
+to one and only one cgroup.  All threads of a process belong to the
+same cgroup.  On creation, all processes are put in the cgroup that
+the parent process belongs to at the time.  A process can be migrated
+to another cgroup.  Migration of a process doesn't affect already
+existing descendant processes.
+
+Following certain structural constraints, controllers may be enabled or
+disabled selectively on a cgroup.  All controller behaviors are
+hierarchical - if a controller is enabled on a cgroup, it affects all
+processes which belong to the cgroups constituting the inclusive
+sub-hierarchy of the cgroup.  When a controller is enabled on a nested
+cgroup, it always restricts the resource distribution further.  The
+restrictions set closer to the root in the hierarchy can not be
+overridden from further away.
+
+
+2. Basic Operations
+
+2-1. Mounting
+
+Unlike v1, cgroup v2 has only a single hierarchy.  The cgroup v2
+hierarchy can be mounted with the following mount command.
+
+  # mount -t cgroup2 none $MOUNT_POINT
+
+cgroup2 filesystem has the magic number 0x63677270 ("cgrp").  All
+controllers which support v2 and are not bound to a v1 hierarchy are
+automatically bound to the v2 hierarchy and show up at the root.
+Controllers which are not in active use in the v2 hierarchy can be
+bound to other hierarchies.  This allows mixing v2 hierarchy with the
+legacy v1 multiple hierarchies in a fully backward compatible way.
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy.  Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the v2 hierarchy after the final umount of the previous hierarchy.
+Similarly, a controller should be fully disabled to be moved out of
+the unified hierarchy and it may take some time for the disabled
+controller to become available for other hierarchies; furthermore, due
+to inter-controller dependencies, other controllers may need to be
+disabled too.
+
+While useful for development and manual configurations, moving
+controllers dynamically between the v2 and other hierarchies is
+strongly discouraged for production use.  It is recommended to decide
+the hierarchies and controller associations before starting to use the
+controllers after system boot.
+
+
+2-2. Organizing Processes
+
+Initially, only the root cgroup exists to which all processes belong.
+A child cgroup can be created by creating a sub-directory.
+
+  # mkdir $CGROUP_NAME
+
+A given cgroup may have multiple child cgroups forming a tree
+structure.  Each cgroup has a read-writable interface file
+"cgroup.procs".  When read, it lists the PIDs of all processes which
+belong to the cgroup one-per-line.  The PIDs are not ordered and the
+same PID may show up more than once if the process got moved to
+another cgroup and then back or the PID got recycled while reading.
+
+A process can be migrated into a cgroup by writing its PID to the
+target cgroup's "cgroup.procs" file.  Only one process can be migrated
+on a single write(2) call.  If a process is composed of multiple
+threads, writing the PID of any thread migrates all threads of the
+process.
+
+When a process forks a child process, the new process is born into the
+cgroup that the forking process belongs to at the time of the
+operation.  After exit, a process stays associated with the cgroup
+that it belonged to at the time of exit until it's reaped; however, a
+zombie process does not appear in "cgroup.procs" and thus can't be
+moved to another cgroup.
+
+A cgroup which doesn't have any children or live processes can be
+destroyed by removing the directory.  Note that a cgroup which doesn't
+have any children and is associated only with zombie processes is
+considered empty and can be removed.
+
+  # rmdir $CGROUP_NAME
+
+"/proc/$PID/cgroup" lists a process's cgroup membership.  If legacy
+cgroup is in use in the system, this file may contain multiple lines,
+one for each hierarchy.  The entry for cgroup v2 is always in the
+format "0::$PATH".
+
+  # cat /proc/842/cgroup
+  ...
+  0::/test-cgroup/test-cgroup-nested
+
+If the process becomes a zombie and the cgroup it was associated with
+is removed subsequently, " (deleted)" is appended to the path.
+
+  # cat /proc/842/cgroup
+  ...
+  0::/test-cgroup/test-cgroup-nested (deleted)
+
+
+2-3. [Un]populated Notification
+
+Each non-root cgroup has a "cgroup.events" file which contains
+"populated" field indicating whether the cgroup's sub-hierarchy has
+live processes in it.  Its value is 0 if there is no live process in
+the cgroup and its descendants; otherwise, 1.  poll and [id]notify
+events are triggered when the value changes.  This can be used, for
+example, to start a clean-up operation after all processes of a given
+sub-hierarchy have exited.  The populated state updates and
+notifications are recursive.  Consider the following sub-hierarchy
+where the numbers in the parentheses represent the numbers of processes
+in each cgroup.
+
+  A(4) - B(0) - C(1)
+              \ D(0)
+
+A, B and C's "populated" fields would be 1 while D's 0.  After the one
+process in C exits, B and C's "populated" fields would flip to "0" and
+file modified events will be generated on the "cgroup.events" files of
+both cgroups.
+
+
+2-4. Controlling Controllers
+
+2-4-1. Enabling and Disabling
+
+Each cgroup has a "cgroup.controllers" file which lists all
+controllers available for the cgroup to enable.
+
+  # cat cgroup.controllers
+  cpu io memory
+
+No controller is enabled by default.  Controllers can be enabled and
+disabled by writing to the "cgroup.subtree_control" file.
+
+  # echo "+cpu +memory -io" > cgroup.subtree_control
+
+Only controllers which are listed in "cgroup.controllers" can be
+enabled.  When multiple operations are specified as above, either they
+all succeed or all fail.  If multiple operations on the same controller
+are specified, the last one is effective.
+
+Enabling a controller in a cgroup indicates that the distribution of
+the target resource across its immediate children will be controlled.
+Consider the following sub-hierarchy.  The enabled controllers are
+listed in parentheses.
+
+  A(cpu,memory) - B(memory) - C()
+                            \ D()
+
+As A has "cpu" and "memory" enabled, A will control the distribution
+of CPU cycles and memory to its children, in this case, B.  As B has
+"memory" enabled but not "cpu", C and D will compete freely on CPU
+cycles but their division of memory available to B will be controlled.
+
+As a controller regulates the distribution of the target resource to
+the cgroup's children, enabling it creates the controller's interface
+files in the child cgroups.  In the above example, enabling "cpu" on B
+would create the "cpu." prefixed controller interface files in C and
+D.  Likewise, disabling "memory" from B would remove the "memory."
+prefixed controller interface files from C and D.  This means that the
+controller interface files - anything which doesn't start with
+"cgroup." - are owned by the parent rather than the cgroup itself.
+
+
+2-4-2. Top-down Constraint
+
+Resources are distributed top-down and a cgroup can further distribute
+a resource only if the resource has been distributed to it from the
+parent.  This means that all non-root "cgroup.subtree_control" files
+can only contain controllers which are enabled in the parent's
+"cgroup.subtree_control" file.  A controller can be enabled only if
+the parent has the controller enabled and a controller can't be
+disabled if one or more children have it enabled.
+
+
+2-4-3. No Internal Process Constraint
+
+Non-root cgroups can only distribute resources to their children when
+they don't have any processes of their own.  In other words, only
+cgroups which don't contain any processes can have controllers enabled
+in their "cgroup.subtree_control" files.
+
+This guarantees that, when a controller is looking at the part of the
+hierarchy which has it enabled, processes are always only on the
+leaves.  This rules out situations where child cgroups compete against
+internal processes of the parent.
+
+The root cgroup is exempt from this restriction.  Root contains
+processes and anonymous resource consumption which can't be associated
+with any other cgroups and requires special treatment from most
+controllers.  How resource consumption in the root cgroup is governed
+is up to each controller.
+
+Note that the restriction doesn't get in the way if there is no
+enabled controller in the cgroup's "cgroup.subtree_control".  This is
+important as otherwise it wouldn't be possible to create children of a
+populated cgroup.  To control resource distribution of a cgroup, the
+cgroup must create children and transfer all its processes to the
+children before enabling controllers in its "cgroup.subtree_control"
+file.
+
+
+2-5. Delegation
+
+2-5-1. Model of Delegation
+
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user.  Note
+that resource control interface files in a given directory control the
+distribution of the parent's resources and thus must not be delegated
+along with the directory.
+
+Once delegated, the user can build sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it received from the parent.  The limits and other settings of all
+resource controllers are hierarchical and regardless of what happens
+in the delegated sub-hierarchy, nothing can escape the resource
+restrictions imposed by the parent.
+
+Currently, cgroup doesn't impose any restrictions on the number of
+cgroups in or nesting depth of a delegated sub-hierarchy; however,
+this may be limited explicitly in the future.
+
+
+2-5-2. Delegation Containment
+
+A delegated sub-hierarchy is contained in the sense that processes
+can't be moved into or out of the sub-hierarchy by the delegatee.  For
+a process with a non-root euid to migrate a target process into a
+cgroup by writing its PID to the "cgroup.procs" file, the following
+conditions must be met.
+
+- The writer's euid must match either uid or suid of the target process.
+
+- The writer must have write access to the "cgroup.procs" file.
+
+- The writer must have write access to the "cgroup.procs" file of the
+  common ancestor of the source and destination cgroups.
+
+The above three constraints ensure that while a delegatee may migrate
+processes around freely in the delegated sub-hierarchy it can't pull
+in from or push out to outside the sub-hierarchy.
+
+For example, let's assume cgroups C0 and C1 have been delegated to
+user U0 who created C00, C01 under C0 and C10 under C1 as follows and
+all processes under C0 and C1 belong to U0.
+
+  ~~~~~~~~~~~~~ - C0 - C00
+  ~ cgroup    ~      \ C01
+  ~ hierarchy ~
+  ~~~~~~~~~~~~~ - C1 - C10
+
+Let's also say U0 wants to write the PID of a process which is
+currently in C10 into "C00/cgroup.procs".  U0 has write access to the
+file and uid match on the process; however, the common ancestor of the
+source cgroup C10 and the destination cgroup C00 is above the points
+of delegation and U0 would not have write access to its "cgroup.procs"
+files and thus the write will be denied with -EACCES.
+
+
+2-6. Guidelines
+
+2-6-1. Organize Once and Control
+
+Migrating a process across cgroups is a relatively expensive operation
+and stateful resources such as memory are not moved together with the
+process.  This is an explicit design decision as there often exist
+inherent trade-offs between migration and various hot paths in terms
+of synchronization cost.
+
+As such, migrating processes across cgroups frequently as a means to
+apply different resource restrictions is discouraged.  A workload
+should be assigned to a cgroup according to the system's logical and
+resource structure once on start-up.  Dynamic adjustments to resource
+distribution can be made by changing controller configuration through
+the interface files.
+
+
+2-6-2. Avoid Name Collisions
+
+Interface files for a cgroup and its children cgroups occupy the same
+directory and it is possible to create children cgroups which collide
+with interface files.
+
+All cgroup core interface files are prefixed with "cgroup." and each
+controller's interface files are prefixed with the controller name and
+a dot.  A controller's name is composed of lower case letters and
+'_'s but never begins with an '_' so it can be used as the prefix
+character for collision avoidance.  Also, interface file names won't
+start or end with terms which are often used in categorizing workloads
+such as job, service, slice, unit or workload.
+
+cgroup doesn't do anything to prevent name collisions and it's the
+user's responsibility to avoid them.
+
+
+3. Resource Distribution Models
+
+cgroup controllers implement several resource distribution schemes
+depending on the resource type and expected use cases.  This section
+describes major schemes in use along with their expected behaviors.
+
+
+3-1. Weights
+
+A parent's resource is distributed by adding up the weights of all
+active children and giving each the fraction matching the ratio of its
+weight against the sum.  As only children which can make use of the
+resource at the moment participate in the distribution, this is
+work-conserving.  Due to the dynamic nature, this model is usually
+used for stateless resources.
+
+All weights are in the range [1, 10000] with the default at 100.  This
+allows symmetric multiplicative biases in both directions at fine
+enough granularity while staying in the intuitive range.
+
+As long as the weight is in range, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"cpu.weight" proportionally distributes CPU cycles to active children
+and is an example of this type.
+
+
+3-2. Limits
+
+A child can only consume up to the configured amount of the resource.
+Limits can be over-committed - the sum of the limits of children can
+exceed the amount of resource available to the parent.
+
+Limits are in the range [0, max] and default to "max", which is noop.
+
+As limits can be over-committed, all configuration combinations are
+valid and there is no reason to reject configuration changes or
+process migrations.
+
+"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
+on an IO device and is an example of this type.
+
+
+3-3. Protections
+
+A cgroup is protected to be allocated up to the configured amount of
+the resource if the usages of all its ancestors are under their
+protected levels.  Protections can be hard guarantees or best effort
+soft boundaries.  Protections can also be over-committed in which case
+only up to the amount available to the parent is protected among
+children.
+
+Protections are in the range [0, max] and default to 0, which is
+noop.
+
+As protections can be over-committed, all configuration combinations
+are valid and there is no reason to reject configuration changes or
+process migrations.
+
+"memory.low" implements best-effort memory protection and is an
+example of this type.
+
+
+3-4. Allocations
+
+A cgroup is exclusively allocated a certain amount of a finite
+resource.  Allocations can't be over-committed - the sum of the
+allocations of children can not exceed the amount of resource
+available to the parent.
+
+Allocations are in the range [0, max] and default to 0, which is no
+resource.
+
+As allocations can't be over-committed, some configuration
+combinations are invalid and should be rejected.  Also, if the
+resource is mandatory for execution of processes, process migrations
+may be rejected.
+
+"cpu.rt.max" hard-allocates realtime slices and is an example of this
+type.
+
+
+4. Interface Files
+
+4-1. Format
+
+All interface files should be in one of the following formats whenever
+possible.
+
+  New-line separated values
+  (when only one value can be written at once)
+
+       VAL0\n
+       VAL1\n
+       ...
+
+  Space separated values
+  (when read-only or multiple values can be written at once)
+
+       VAL0 VAL1 ...\n
+
+  Flat keyed
+
+       KEY0 VAL0\n
+       KEY1 VAL1\n
+       ...
+
+  Nested keyed
+
+       KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+       KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+       ...
+
+For a writable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+4-2. Conventions
+
+- Settings for a single feature should be contained in a single file.
+
+- The root cgroup should be exempt from resource control and thus
+  shouldn't have resource control interface files.  Also,
+  informational files on the root cgroup which end up showing global
+  information available elsewhere shouldn't exist.
+
+- If a controller implements weight based resource distribution, its
+  interface file should be named "weight" and have the range [1,
+  10000] with 100 as the default.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the interface files should be named "min" and "max"
+  respectively.  If a controller implements best effort resource
+  guarantee and/or limit, the interface files should be named "low"
+  and "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+
+- If a setting has a configurable default value and keyed specific
+  overrides, the default entry should be keyed with "default" and
+  appear as the first entry in the file.
+
+  The default value can be updated by writing either "default $VAL" or
+  "$VAL".
+
+  When writing to update a specific override, "default" can be used as
+  the value to indicate removal of the override.  Override entries
+  with "default" as the value must not appear when read.
+
+  For example, a setting which is keyed by major:minor device numbers
+  with integer values may look like the following.
+
+    # cat cgroup-example-interface-file
+    default 150
+    8:0 300
+
+  The default value can be updated by
+
+    # echo 125 > cgroup-example-interface-file
+
+  or
+
+    # echo "default 125" > cgroup-example-interface-file
+
+  An override can be set by
+
+    # echo "8:16 170" > cgroup-example-interface-file
+
+  and cleared by
+
+    # echo "8:0 default" > cgroup-example-interface-file
+    # cat cgroup-example-interface-file
+    default 125
+    8:16 170
+
+- For events which are not very high frequency, an interface file
+  "events" should be created which lists event key value pairs.
+  Whenever a notifiable event happens, file modified event should be
+  generated on the file.
+
+
+4-3. Core Interface Files
+
+All cgroup core files are prefixed with "cgroup."
+
+  cgroup.procs
+
+       A read-write new-line separated values file which exists on
+       all cgroups.
+
+       When read, it lists the PIDs of all processes which belong to
+       the cgroup one-per-line.  The PIDs are not ordered and the
+       same PID may show up more than once if the process got moved
+       to another cgroup and then back or the PID got recycled while
+       reading.
+
+       A PID can be written to migrate the process associated with
+       the PID to the cgroup.  The writer should match all of the
+       following conditions.
+
+       - Its euid is either root or must match either uid or suid of
+          the target process.
+
+       - It must have write access to the "cgroup.procs" file.
+
+       - It must have write access to the "cgroup.procs" file of the
+         common ancestor of the source and destination cgroups.
+
+       When delegating a sub-hierarchy, write access to this file
+       should be granted along with the containing directory.
+
+  cgroup.controllers
+
+       A read-only space separated values file which exists on all
+       cgroups.
+
+       It shows space separated list of all controllers available to
+       the cgroup.  The controllers are not ordered.
+
+  cgroup.subtree_control
+
+       A read-write space separated values file which exists on all
+       cgroups.  Starts out empty.
+
+       When read, it shows space separated list of the controllers
+       which are enabled to control resource distribution from the
+       cgroup to its children.
+
+       Space separated list of controllers prefixed with '+' or '-'
+       can be written to enable or disable controllers.  A controller
+       name prefixed with '+' enables the controller and '-'
+       disables.  If a controller appears more than once on the list,
+       the last one is effective.  When multiple enable and disable
+       operations are specified, either all succeed or all fail.
+
+  cgroup.events
+
+       A read-only flat-keyed file which exists on non-root cgroups.
+       The following entries are defined.  Unless specified
+       otherwise, a value change in this file generates a file
+       modified event.
+
+         populated
+
+               1 if the cgroup or its descendants contain any live
+               processes; otherwise, 0.
+
+
+5. Controllers
+
+5-1. CPU
+
+[NOTE: The interface for the cpu controller hasn't been merged yet]
+
+The "cpu" controller regulates distribution of CPU cycles.  This
+controller implements weight and absolute bandwidth limit models for
+normal scheduling policy and absolute bandwidth allocation model for
+realtime scheduling policy.
+
+
+5-1-1. CPU Interface Files
+
+All time durations are in microseconds.
+
+  cpu.stat
+
+       A read-only flat-keyed file which exists on non-root cgroups.
+
+       It reports the following six stats.
+
+         usage_usec
+         user_usec
+         system_usec
+         nr_periods
+         nr_throttled
+         throttled_usec
+
+  cpu.weight
+
+       A read-write single value file which exists on non-root
+       cgroups.  The default is "100".
+
+       The weight in the range [1, 10000].
+
+  cpu.max
+
+       A read-write two value file which exists on non-root cgroups.
+       The default is "max 100000".
+
+       The maximum bandwidth limit.  It's in the following format.
+
+         $MAX $PERIOD
+
+       which indicates that the group may consume up to $MAX in each
+       $PERIOD duration.  "max" for $MAX indicates no limit.  If only
+       one number is written, $MAX is updated.
+
+  cpu.rt.max
+
+  [NOTE: The semantics of this file is still under discussion and the
+   interface hasn't been merged yet]
+
+       A read-write two value file which exists on all cgroups.
+       The default is "0 100000".
+
+       The maximum realtime runtime allocation.  Over-committing
+       configurations are disallowed and process migrations are
+       rejected if not enough bandwidth is available.  It's in the
+       following format.
+
+         $MAX $PERIOD
+
+       which indicates that the group may consume up to $MAX in each
+       $PERIOD duration.  If only one number is written, $MAX is
+       updated.
+
+
+5-2. Memory
+
+The "memory" controller regulates distribution of memory.  Memory is
+stateful and implements both limit and protection models.  Due to the
+intertwining between memory usage and reclaim pressure and the
+stateful nature of memory, the distribution model is relatively
+complex.
+
+While not completely water-tight, all major memory usages by a given
+cgroup are tracked so that the total memory consumption can be
+accounted and controlled to a reasonable extent.  Currently, the
+following types of memory usages are tracked.
+
+- Userland memory - page cache and anonymous memory.
+
+- Kernel data structures such as dentries and inodes.
+
+- TCP socket buffers.
+
+The above list may expand in the future for better coverage.
+
+
+5-2-1. Memory Interface Files
+
+All memory amounts are in bytes.  If a value which is not aligned to
+PAGE_SIZE is written, the value may be rounded up to the closest
+PAGE_SIZE multiple when read back.
+
+  memory.current
+
+       A read-only single value file which exists on non-root
+       cgroups.
+
+       The total amount of memory currently being used by the cgroup
+       and its descendants.
+
+  memory.low
+
+       A read-write single value file which exists on non-root
+       cgroups.  The default is "0".
+
+       Best-effort memory protection.  If the memory usages of a
+       cgroup and all its ancestors are below their low boundaries,
+       the cgroup's memory won't be reclaimed unless memory can be
+       reclaimed from unprotected cgroups.
+
+       Putting more memory than generally available under this
+       protection is discouraged.
+
+  memory.high
+
+       A read-write single value file which exists on non-root
+       cgroups.  The default is "max".
+
+       Memory usage throttle limit.  This is the main mechanism to
+       control memory usage of a cgroup.  If a cgroup's usage goes
+       over the high boundary, the processes of the cgroup are
+       throttled and put under heavy reclaim pressure.
+
+       Going over the high limit never invokes the OOM killer and
+       under extreme conditions the limit may be breached.
+
+  memory.max
+
+       A read-write single value file which exists on non-root
+       cgroups.  The default is "max".
+
+       Memory usage hard limit.  This is the final protection
+       mechanism.  If a cgroup's memory usage reaches this limit and
+       can't be reduced, the OOM killer is invoked in the cgroup.
+       Under certain circumstances, the usage may go over the limit
+       temporarily.
+
+       This is the ultimate protection mechanism.  As long as the
+       high limit is used and monitored properly, this limit's
+       utility is limited to providing the final safety net.
+
+  memory.events
+
+       A read-only flat-keyed file which exists on non-root cgroups.
+       The following entries are defined.  Unless specified
+       otherwise, a value change in this file generates a file
+       modified event.
+
+         low
+
+               The number of times the cgroup is reclaimed due to
+               high memory pressure even though its usage is under
+               the low boundary.  This usually indicates that the low
+               boundary is over-committed.
+
+         high
+
+               The number of times processes of the cgroup are
+               throttled and routed to perform direct memory reclaim
+               because the high memory boundary was exceeded.  For a
+               cgroup whose memory usage is capped by the high limit
+               rather than global memory pressure, this event's
+               occurrences are expected.
+
+         max
+
+               The number of times the cgroup's memory usage was
+               about to go over the max boundary.  If direct reclaim
+               fails to bring it down, the OOM killer is invoked.
+
+         oom
+
+               The number of times the OOM killer has been invoked in
+               the cgroup.  This may not exactly match the number of
+               processes killed but should generally be close.
+
+
+5-2-2. General Usage
+
+"memory.high" is the main mechanism to control memory usage.
+Over-committing on high limit (sum of high limits > available memory)
+and letting global memory pressure distribute memory according to
+usage is a viable strategy.
+
+Because breach of the high limit doesn't trigger the OOM killer but
+throttles the offending cgroup, a management agent has ample
+opportunities to monitor and take appropriate actions such as granting
+more memory or terminating the workload.
+
+Determining whether a cgroup has enough memory is not trivial as
+memory usage doesn't indicate whether the workload can benefit from
+more memory.  For example, a workload which writes data received from
+network to a file can use all available memory but can also perform
+just as well with a small amount of memory.  A measure of memory
+pressure - how much the workload is being impacted due to lack of
+memory - is necessary to determine whether a workload needs more
+memory; unfortunately, memory pressure monitoring mechanism isn't
+implemented yet.
+
+
+5-2-3. Memory Ownership
+
+A memory area is charged to the cgroup which instantiated it and stays
+charged to the cgroup until the area is released.  Migrating a process
+to a different cgroup doesn't move the memory usages that it
+instantiated while in the previous cgroup to the new cgroup.
+
+A memory area may be used by processes belonging to different cgroups.
+To which cgroup the area will be charged is non-deterministic; however,
+over time, the memory area is likely to end up in a cgroup which has
+enough memory allowance to avoid high reclaim pressure.
+
+If a cgroup sweeps a considerable amount of memory which is expected
+to be accessed repeatedly by other cgroups, it may make sense to use
+POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
+belonging to the affected files to ensure correct memory ownership.
+
+
+5-3. IO
+
+The "io" controller regulates the distribution of IO resources.  This
+controller implements both weight based and absolute bandwidth or IOPS
+limit distribution; however, weight based distribution is available
+only if cfq-iosched is in use and neither scheme is available for
+blk-mq devices.
+
+
+5-3-1. IO Interface Files
+
+  io.stat
+
+       A read-only nested-keyed file which exists on non-root
+       cgroups.
+
+       Lines are keyed by $MAJ:$MIN device numbers and not ordered.
+       The following nested keys are defined.
+
+         rbytes        Bytes read
+         wbytes        Bytes written
+         rios          Number of read IOs
+         wios          Number of write IOs
+
+       An example read output follows.
+
+         8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
+         8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+
+  io.weight
+
+       A read-write flat-keyed file which exists on non-root cgroups.
+       The default is "default 100".
+
+       The first line is the default weight applied to devices
+       without specific override.  The rest are overrides keyed by
+       $MAJ:$MIN device numbers and not ordered.  The weights are in
+       the range [1, 10000] and specify the relative amount of IO time
+       the cgroup can use in relation to its siblings.
+
+       The default weight can be updated by writing either "default
+       $WEIGHT" or simply "$WEIGHT".  Overrides can be set by writing
+       "$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
+
+       An example read output follows.
+
+         default 100
+         8:16 200
+         8:0 50
+
+  io.max
+
+       A read-write nested-keyed file which exists on non-root
+       cgroups.
+
+       BPS and IOPS based IO limit.  Lines are keyed by $MAJ:$MIN
+       device numbers and not ordered.  The following nested keys are
+       defined.
+
+         rbps          Max read bytes per second
+         wbps          Max write bytes per second
+         riops         Max read IO operations per second
+         wiops         Max write IO operations per second
+
+       When writing, any number of nested key-value pairs can be
+       specified in any order.  "max" can be specified as the value
+       to remove a specific limit.  If the same key is specified
+       multiple times, the outcome is undefined.
+
+       BPS and IOPS are measured in each IO direction and IOs are
+       delayed if limit is reached.  Temporary bursts are allowed.
+
+       Setting read limit at 2M BPS and write at 120 IOPS for 8:16.
+
+         echo "8:16 rbps=2097152 wiops=120" > io.max
+
+       Reading returns the following.
+
+         8:16 rbps=2097152 wbps=max riops=max wiops=120
+
+       Write IOPS limit can be removed by writing the following.
+
+         echo "8:16 wiops=max" > io.max
+
+       Reading now returns the following.
+
+         8:16 rbps=2097152 wbps=max riops=max wiops=max
+
+
+5-3-2. Writeback
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism.  Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+The io controller, in conjunction with the memory controller,
+implements control of page cache writeback IOs.  The memory controller
+defines the memory domain that dirty memory ratio is calculated and
+maintained for and the io controller defines the io domain which
+writes out dirty pages for the memory domain.  Both system-wide and
+per-cgroup dirty memory states are examined and the more restrictive
+of the two is enforced.
+
+cgroup writeback requires explicit support from the underlying
+filesystem.  Currently, cgroup writeback is implemented on ext2, ext4
+and btrfs.  On other filesystems, all writeback IOs are attributed to
+the root cgroup.
+
+There are inherent differences in memory and writeback management
+which affect how cgroup ownership is tracked.  Memory is tracked per
+page while writeback per inode.  For the purpose of writeback, an
+inode is assigned to a cgroup and all IO requests to write dirty pages
+from the inode are attributed to that cgroup.
+
+As cgroup ownership for memory is tracked per page, there can be pages
+which are associated with different cgroups than the one the inode is
+associated with.  These are called foreign pages.  The writeback
+constantly keeps track of foreign pages and, if a particular foreign
+cgroup becomes the majority over a certain period of time, switches
+the ownership of the inode to that cgroup.
+
+While this model is enough for most use cases where a given inode is
+mostly dirtied by a single cgroup even when the main writing cgroup
+changes over time, use cases where multiple cgroups write to a single
+inode simultaneously are not supported well.  In such circumstances, a
+significant portion of IOs are likely to be attributed incorrectly.
+As memory controller assigns page ownership on the first use and
+doesn't update it until the page is released, even if writeback
+strictly follows page ownership, multiple cgroups dirtying overlapping
+areas wouldn't work as expected.  It's recommended to avoid such usage
+patterns.
+
+The sysctl knobs which affect writeback behavior are applied to cgroup
+writeback as follows.
+
+  vm.dirty_background_ratio
+  vm.dirty_ratio
+
+       These ratios apply the same to cgroup writeback with the
+       amount of available memory capped by limits imposed by the
+       memory controller and system-wide clean memory.
+
+  vm.dirty_background_bytes
+  vm.dirty_bytes
+
+       For cgroup writeback, this is calculated into ratio against
+       total available memory and applied the same way as
+       vm.dirty[_background]_ratio.
+
+
+P. Information on Kernel Programming
+
+This section contains kernel programming information in the areas
+where interacting with cgroup is necessary.  cgroup core and
+controllers are not covered.
+
+
+P-1. Filesystem Support for Writeback
+
+A filesystem can support cgroup writeback by updating
+address_space_operations->writepage[s]() to annotate bio's using the
+following two functions.
+
+  wbc_init_bio(@wbc, @bio)
+
+       Should be called for each bio carrying writeback data and
+       associates the bio with the inode's owner cgroup.  Can be
+       called anytime between bio allocation and submission.
+
+  wbc_account_io(@wbc, @page, @bytes)
+
+       Should be called for each data segment being written out.
+       While this function doesn't care exactly when it's called
+       during the writeback session, it's the easiest and most
+       natural to call it as data segments are added to a bio.
+
+With writeback bio's annotated, cgroup support can be enabled per
+super_block by setting SB_I_CGROUPWB in ->s_iflags.  This allows for
+selective disabling of cgroup writeback support which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible.
+
+wbc_init_bio() binds the specified bio to its cgroup.  Depending on
+the configuration, the bio may be executed at a lower priority and if
+the writeback session is holding shared resources, e.g. a journal
+entry, may lead to priority inversion.  There is no one easy solution
+for the problem.  Filesystems can try to work around specific problem
+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+directly.
+
+
+D. Deprecated v1 Core Features
+
+- Multiple hierarchies including named ones are not supported.
+
+- All mount options and remounting are not supported.
+
+- The "tasks" file is removed and "cgroup.procs" is not sorted.
+
+- "cgroup.clone_children" is removed.
+
+- /proc/cgroups is meaningless for v2.  Use "cgroup.controllers" file
+  at the root instead.
+
+
+R. Issues with v1 and Rationales for v2
+
+R-1. Multiple Hierarchies
+
+cgroup v1 allowed an arbitrary number of hierarchies and each
+hierarchy could host any number of controllers.  While this seemed to
+provide a high level of flexibility, it wasn't useful in practice.
+
+For example, as there is only one instance of each controller, utility
+type controllers such as freezer which can be useful in all
+hierarchies could only be used in one.  The issue is exacerbated by
+the fact that controllers couldn't be moved to another hierarchy once
+hierarchies were populated.  Another issue was that all controllers
+bound to a hierarchy were forced to have exactly the same view of the
+hierarchy.  It wasn't possible to vary the granularity depending on
+the specific controller.
+
+In practice, these issues heavily limited which controllers could be
+put on the same hierarchy and most configurations resorted to putting
+each controller on its own hierarchy.  Only closely related ones, such
+as the cpu and cpuacct controllers, made sense to be put on the same
+hierarchy.  This often meant that userland ended up managing multiple
+similar hierarchies repeating the same steps on each hierarchy
+whenever a hierarchy management operation was necessary.
+
+Furthermore, support for multiple hierarchies came at a steep cost.
+It greatly complicated cgroup core implementation but more importantly
+the support for multiple hierarchies restricted how cgroup could be
+used in general and what controllers were able to do.
+
+There was no limit on how many hierarchies there might be, which meant
+that a thread's cgroup membership couldn't be described in finite
+length.  The key might contain any number of entries and was unlimited
+in length, which made it highly awkward to manipulate and led to
+addition of controllers which existed only to identify membership,
+which in turn exacerbated the original problem of proliferating number
+of hierarchies.
+
+Also, as a controller couldn't have any expectation regarding the
+topologies of hierarchies other controllers might be on, each
+controller had to assume that all other controllers were attached to
+completely orthogonal hierarchies.  This made it impossible, or at
+least very cumbersome, for controllers to cooperate with each other.
+
+In most use cases, putting controllers on hierarchies which are
+completely orthogonal to each other isn't necessary.  What usually is
+called for is the ability to have differing levels of granularity
+depending on the specific controller.  In other words, hierarchy may
+be collapsed from leaf towards root when viewed from specific
+controllers.  For example, a given configuration might not care about
+how memory is distributed beyond a certain level while still wanting
+to control how CPU cycles are distributed.
+
+
+R-2. Thread Granularity
+
+cgroup v1 allowed threads of a process to belong to different cgroups.
+This didn't make sense for some controllers and those controllers
+ended up implementing different ways to ignore such situations but
+much more importantly it blurred the line between API exposed to
+individual applications and system management interface.
+
+Generally, in-process knowledge is available only to the process
+itself; thus, unlike service-level organization of processes,
+categorizing threads of a process requires active participation from
+the application which owns the target process.
+
+cgroup v1 had an ambiguously defined delegation model which got abused
+in combination with thread granularity.  cgroups were delegated to
+individual applications so that they can create and manage their own
+sub-hierarchies and control resource distributions along them.  This
+effectively raised cgroup to the status of a syscall-like API exposed
+to lay programs.
+
+First of all, cgroup has a fundamentally inadequate interface to be
+exposed this way.  For a process to access its own knobs, it has to
+extract the path on the target hierarchy from /proc/self/cgroup,
+construct the path by appending the name of the knob to the path, open
+and then read and/or write to it.  This is not only extremely clunky
+and unusual but also inherently racy.  There is no conventional way to
+define transaction across the required steps and nothing can guarantee
+that the process would actually be operating on its own sub-hierarchy.
+
+cgroup controllers implemented a number of knobs which would never be
+accepted as public APIs because they were just adding control knobs to
+system-management pseudo filesystem.  cgroup ended up with interface
+knobs which were not properly abstracted or refined and directly
+revealed kernel internal details.  These knobs got exposed to
+individual applications through the ill-defined delegation mechanism
+effectively abusing cgroup as a shortcut to implementing public APIs
+without going through the required scrutiny.
+
+This was painful for both userland and kernel.  Userland ended up with
+misbehaving and poorly abstracted interfaces and kernel ended up
+exposing and being locked into constructs inadvertently.
+
+
+R-3. Competition Between Inner Nodes and Threads
+
+cgroup v1 allowed threads to be in any cgroups which created an
+interesting problem where threads belonging to a parent cgroup and its
+children cgroups competed for resources.  This was nasty as two
+different types of entities competed and there was no obvious way to
+settle it.  Different controllers did different things.
+
+The cpu controller considered threads and cgroups as equivalents and
+mapped nice levels to cgroup weights.  This worked for some cases but
+fell flat when children wanted to be allocated specific ratios of CPU
+cycles and the number of internal threads fluctuated - the ratios
+constantly changed as the number of competing entities fluctuated.
+There also were other issues.  The mapping from nice level to weight
+wasn't obvious or universal, and there were various other knobs which
+simply weren't available for threads.
+
+The io controller implicitly created a hidden leaf node for each
+cgroup to host the threads.  The hidden leaf had its own copies of all
+the knobs with "leaf_" prefixed.  While this allowed equivalent
+control over internal threads, it was with serious drawbacks.  It
+always added an extra layer of nesting which wouldn't be necessary
+otherwise, made the interface messy and significantly complicated the
+implementation.
+
+The memory controller didn't have a way to control what happened
+between internal tasks and child cgroups and the behavior was not
+clearly defined.  There were attempts to add ad-hoc behaviors and
+knobs to tailor the behavior to specific workloads which would have
+led to problems extremely difficult to resolve in the long term.
+
+Multiple controllers struggled with internal tasks and came up with
+different ways to deal with it; unfortunately, all the approaches were
+severely flawed and, furthermore, the widely different behaviors
+made cgroup as a whole highly inconsistent.
+
+This clearly is a problem which needs to be addressed from cgroup core
+in a uniform way.
+
+
+R-4. Other Interface Issues
+
+cgroup v1 grew without oversight and developed a large number of
+idiosyncrasies and inconsistencies.  One issue on the cgroup core side
+was how an empty cgroup was notified - a userland helper binary was
+forked and executed for each event.  The event delivery wasn't
+recursive or delegatable.  The limitations of the mechanism also led
+to in-kernel event delivery filtering mechanism further complicating
+the interface.
+
+Controller interfaces were problematic too.  An extreme example is
+controllers completely ignoring hierarchical organization and treating
+all cgroups as if they were all located directly under the root
+cgroup.  Some controllers exposed a large amount of inconsistent
+implementation details to userland.
+
+There also was no consistency across controllers.  When a new cgroup
+was created, some controllers defaulted to not imposing extra
+restrictions while others disallowed any resource usage until
+explicitly configured.  Configuration knobs for the same type of
+control used widely differing naming schemes and formats.  Statistics
+and information knobs were named arbitrarily and used different
+formats and units even in the same controller.
+
+cgroup v2 establishes common conventions where appropriate and updates
+controllers so that they expose minimal and consistent interfaces.
+
+
+R-5. Controller Issues and Remedies
+
+R-5-1. Memory
+
+The original lower boundary, the soft limit, is defined as a limit
+that is per default unset.  As a result, the set of cgroups that
+global reclaim prefers is opt-in, rather than opt-out.  The costs for
+optimizing these mostly negative lookups are so high that the
+implementation, despite its enormous size, does not even provide the
+basic desirable behavior.  First off, the soft limit has no
+hierarchical meaning.  All configured groups are organized in a global
+rbtree and treated like equal peers, regardless of where they are located
+in the hierarchy.  This makes subtree delegation impossible.  Second,
+the soft limit reclaim pass is so aggressive that it not just
+introduces high allocation latencies into the system, but also impacts
+system performance due to overreclaim, to the point where the feature
+becomes self-defeating.
+
+The memory.low boundary on the other hand is a top-down allocated
+reserve.  A cgroup enjoys reclaim protection when it and all its
+ancestors are below their low boundaries, which makes delegation of
+subtrees possible.  Secondly, new cgroups have no reserve per default
+and in the common case most cgroups are eligible for the preferred
+reclaim pass.  This allows the new low boundary to be efficiently
+implemented with just a minor addition to the generic reclaim code,
+without the need for out-of-band data structures and reclaim passes.
+Because the generic reclaim code considers all cgroups except for the
+ones running low in the preferred first reclaim pass, overreclaim of
+individual groups is eliminated as well, resulting in much better
+overall workload performance.
+
+The original high boundary, the hard limit, is defined as a strict
+limit that can not budge, even if the OOM killer has to be called.
+But this generally goes against the goal of making the most out of the
+available memory.  The memory consumption of workloads varies during
+runtime, and that requires users to overcommit.  But doing that with a
+strict upper limit requires either a fairly accurate prediction of the
+working set size or adding slack to the limit.  Since working set size
+estimation is hard and error prone, and getting it wrong results in
+OOM kills, most users tend to err on the side of a looser limit and
+end up wasting precious resources.
+
+The memory.high boundary on the other hand can be set much more
+conservatively.  When hit, it throttles allocations by forcing them
+into direct reclaim to work off the excess, but it never invokes the
+OOM killer.  As a result, a high boundary that is chosen too
+aggressively will not terminate the processes, but instead it will
+lead to gradual performance degradation.  The user can monitor this
+and make corrections until the minimal memory footprint that still
+gives acceptable performance is found.
+
+In extreme cases, with many concurrent allocations and a complete
+breakdown of reclaim progress within the group, the high boundary can
+be exceeded.  But even then it's mostly better to satisfy the
+allocation from the slack available in other groups or the rest of the
+system than killing the group.  Otherwise, memory.max is there to
+limit this type of spillover and ultimately contain buggy or even
+malicious applications.
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX

deleted file mode 100644 (file)

index 3f5a40f..0000000
--- a/Documentation/cgroups/00-INDEX
+++ /dev/null
@@ -1,30 +0,0 @@
-00-INDEX
-       - this file
-blkio-controller.txt
-       - Description for Block IO Controller, implementation and usage details.
-cgroups.txt
-       - Control Groups definition, implementation details, examples and API.
-cpuacct.txt
-       - CPU Accounting Controller; account CPU usage for groups of tasks.
-cpusets.txt
-       - documents the cpusets feature; assign CPUs and Mem to a set of tasks.
-devices.txt
-       - Device Whitelist Controller; description, interface and security.
-freezer-subsystem.txt
-       - checkpointing; rationale to not use signals, interface.
-hugetlb.txt
-       - HugeTLB Controller implementation and usage details.
-memcg_test.txt
-       - Memory Resource Controller; implementation details.
-memory.txt
-       - Memory Resource Controller; design, accounting, interface, testing.
-net_cls.txt
-       - Network classifier cgroups details and usages.
-net_prio.txt
-       - Network priority cgroups details and usages.
-pids.txt
-       - Process number cgroups details and usages.
-resource_counter.txt
-       - Resource Counter API.
-unified-hierarchy.txt
-       - Description of the new/next cgroup interface.
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt

deleted file mode 100644 (file)

index 52fa9f3..0000000
--- a/Documentation/cgroups/blkio-controller.txt
+++ /dev/null
@@ -1,455 +0,0 @@
-                               Block IO Controller
-                               ===================
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-Currently two IO control policies are implemented. First one is proportional
-weight time based division of disk policy. It is implemented in CFQ. Hence
-this policy takes effect only on leaf nodes when CFQ is being used. The second
-one is throttling policy which can be used to specify upper IO rate limits
-on devices. This policy is implemented in generic block layer and can be
-used on leaf nodes as well as higher level logical devices like device mapper.
-
-HOWTO
-=====
-Proportional Weight division of bandwidth
------------------------------------------
-You can do a very simple testing of running two dd threads in two different
-cgroups. Here is what you can do.
-
-- Enable Block IO controller
-       CONFIG_BLK_CGROUP=y
-
-- Enable group scheduling in CFQ
-       CONFIG_CFQ_GROUP_IOSCHED=y
-
-- Compile and boot into kernel and mount IO controller (blkio); see
-  cgroups.txt, Why are cgroups needed?.
-
-       mount -t tmpfs cgroup_root /sys/fs/cgroup
-       mkdir /sys/fs/cgroup/blkio
-       mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Create two cgroups
-       mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
-
-- Set weights of group test1 and test2
-       echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
-       echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
-
-- Create two same size files (say 512MB each) on same disk (file1, file2) and
-  launch two dd threads in different cgroup to read those files.
-
-       sync
-       echo 3 > /proc/sys/vm/drop_caches
-
-       dd if=/mnt/sdb/zerofile1 of=/dev/null &
-       echo $! > /sys/fs/cgroup/blkio/test1/tasks
-       cat /sys/fs/cgroup/blkio/test1/tasks
-
-       dd if=/mnt/sdb/zerofile2 of=/dev/null &
-       echo $! > /sys/fs/cgroup/blkio/test2/tasks
-       cat /sys/fs/cgroup/blkio/test2/tasks
-
-- At macro level, first dd should finish first. To get more precise data, keep
-  on looking at (with the help of script), at blkio.disk_time and
-  blkio.disk_sectors files of both test1 and test2 groups. This will tell how
-  much disk time (in milliseconds), each group got and how many sectors each
-  group dispatched to the disk. We provide fairness in terms of disk time, so
-  ideally blkio.disk_time of cgroups should be in proportion to the weight.
-
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller
-       CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer
-       CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
-        mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
-  for policy is "<major>:<minor>  <bytes_per_second>".
-
-        echo "8:16  1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
-  Above will put a limit of 1MB/second on reads happening for root group
-  on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
-
-               # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
-               # iflag=direct
-        1024+0 records in
-        1024+0 records out
-        4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Both CFQ and throttling implement hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-Suppose somebody created a hierarchy as follows.
-
-                       root
-                       /  \
-                    test1 test2
-                       |
-                    test3
-
-CFQ by default and throttling with "sane_behavior" will handle the
-hierarchy correctly.  For details on CFQ hierarchy support, refer to
-Documentation/block/cfq-iosched.txt.  For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following.
-
-                               pivot
-                            /  /   \  \
-                       root  test1 test2  test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
-       - Block IO controller.
-
-CONFIG_DEBUG_BLK_CGROUP
-       - Debug help. Right now some additional stats files show up in cgroup
-         if this option is enabled.
-
-CONFIG_CFQ_GROUP_IOSCHED
-       - Enables group scheduling in CFQ. Currently only 1 level of group
-         creation is allowed.
-
-CONFIG_BLK_DEV_THROTTLING
-       - Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
-       - Specifies per cgroup weight. This is default weight of the group
-         on all the devices until and unless overridden by per device rule.
-         (See blkio.weight_device).
-         Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
-       - One can specify per cgroup per device rules using this interface.
-         These rules override the default value of group weight as specified
-         by blkio.weight.
-
-         Following is the format.
-
-         # echo dev_maj:dev_minor weight > blkio.weight_device
-         Configure weight=300 on /dev/sdb (8:16) in this cgroup
-         # echo 8:16 300 > blkio.weight_device
-         # cat blkio.weight_device
-         dev     weight
-         8:16    300
-
-         Configure weight=500 on /dev/sda (8:0) in this cgroup
-         # echo 8:0 500 > blkio.weight_device
-         # cat blkio.weight_device
-         dev     weight
-         8:0     500
-         8:16    300
-
-         Remove specific weight for /dev/sda in this cgroup
-         # echo 8:0 0 > blkio.weight_device
-         # cat blkio.weight_device
-         dev     weight
-         8:16    300
-
-- blkio.leaf_weight[_device]
-       - Equivalents of blkio.weight[_device] for the purpose of
-          deciding how much weight tasks in the given cgroup have while
-          competing with the cgroup's child cgroups. For details,
-          please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
-       - disk time allocated to cgroup per device in milliseconds. First
-         two fields specify the major and minor number of the device and
-         third field specifies the disk time allocated to group in
-         milliseconds.
-
-- blkio.sectors
-       - number of sectors transferred to/from disk by the group. First
-         two fields specify the major and minor number of the device and
-         third field specifies the number of sectors transferred by the
-         group to/from the device.
-
-- blkio.io_service_bytes
-       - Number of bytes transferred to/from the disk by the group. These
-         are further divided by the type of operation - read or write, sync
-         or async. First two fields specify the major and minor number of the
-         device, third field specifies the operation type and the fourth field
-         specifies the number of bytes.
-
-- blkio.io_serviced
-       - Number of IOs (bio) issued to the disk by the group. These
-         are further divided by the type of operation - read or write, sync
-         or async. First two fields specify the major and minor number of the
-         device, third field specifies the operation type and the fourth field
-         specifies the number of IOs.
-
-- blkio.io_service_time
-       - Total amount of time between request dispatch and request completion
-         for the IOs done by this cgroup. This is in nanoseconds to make it
-         meaningful for flash devices too. For devices with queue depth of 1,
-         this time represents the actual service time. When queue_depth > 1,
-         that is no longer true as requests may be served out of order. This
-         may cause the service time for a given IO to include the service time
-         of multiple IOs when served out of order which may result in total
-         io_service_time > actual time elapsed. This time is further divided by
-         the type of operation - read or write, sync or async. First two fields
-         specify the major and minor number of the device, third field
-         specifies the operation type and the fourth field specifies the
-         io_service_time in ns.
-
-- blkio.io_wait_time
-       - Total amount of time the IOs for this cgroup spent waiting in the
-         scheduler queues for service. This can be greater than the total time
-         elapsed since it is cumulative io_wait_time for all IOs. It is not a
-         measure of total time the cgroup spent waiting but rather a measure of
-         the wait_time for its individual IOs. For devices with queue_depth > 1
-         this metric does not include the time spent waiting for service once
-         the IO is dispatched to the device but till it actually gets serviced
-         (there might be a time lag here due to re-ordering of requests by the
-         device). This is in nanoseconds to make it meaningful for flash
-         devices too. This time is further divided by the type of operation -
-         read or write, sync or async. First two fields specify the major and
-         minor number of the device, third field specifies the operation type
-         and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
-       - Total number of bios/requests merged into requests belonging to this
-         cgroup. This is further divided by the type of operation - read or
-         write, sync or async.
-
-- blkio.io_queued
-       - Total number of requests queued up at any given instant for this
-         cgroup. This is further divided by the type of operation - read or
-         write, sync or async.
-
-- blkio.avg_queue_size
-       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-         The average queue size for this cgroup over the entire time of this
-         cgroup's existence. Queue size samples are taken each time one of the
-         queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
-       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-         This is the amount of time the cgroup had to wait since it became busy
-         (i.e., went from 0 to 1 request queued) to get a timeslice for one of
-         its queues. This is different from the io_wait_time which is the
-         cumulative total of the amount of time spent by each IO in that cgroup
-         waiting in the scheduler queue. This is in nanoseconds. If this is
-         read when the cgroup is in a waiting (for timeslice) state, the stat
-         will only report the group_wait_time accumulated till the last time it
-         got a timeslice and will not include the current delta.
-
-- blkio.empty_time
-       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-         This is the amount of time a cgroup spends without any pending
-         requests when not being served, i.e., it does not include any time
-         spent idling for one of the queues of the cgroup. This is in
-         nanoseconds. If this is read when the cgroup is in an empty state,
-         the stat will only report the empty_time accumulated till the last
-         time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
-       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
-         This is the amount of time spent by the IO scheduler idling for a
-         given cgroup in anticipation of a better request than the existing ones
-         from other queues/cgroups. This is in nanoseconds. If this is read
-         when the cgroup is in an idling state, the stat will only report the
-         idle_time accumulated till the last idle period and will not include
-         the current delta.
-
-- blkio.dequeue
-       - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
-         gives the statistics about how many times a group was dequeued
-         from service tree of the device. First two fields specify the major
-         and minor number of the device and third field specifies the number
-         of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
-       - Recursive version of various stats. These files show the
-          same information as their non-recursive counterparts but
-          include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
-       - Specifies upper limit on READ rate from the device. IO rate is
-         specified in bytes per second. Rules are per device. Following is
-         the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
-       - Specifies upper limit on WRITE rate to the device. IO rate is
-         specified in bytes per second. Rules are per device. Following is
-         the format.
-
-  echo "<major>:<minor>  <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
-       - Specifies upper limit on READ rate from the device. IO rate is
-         specified in IO per second. Rules are per device. Following is
-         the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
-       - Specifies upper limit on WRITE rate to the device. IO rate is
-         specified in io per second. Rules are per device. Following is
-         the format.
-
-  echo "<major>:<minor>  <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
-      subjected to both the constraints.
-
-- blkio.throttle.io_serviced
-       - Number of IOs (bio) issued to the disk by the group. These
-         are further divided by the type of operation - read or write, sync
-         or async. First two fields specify the major and minor number of the
-         device, third field specifies the operation type and the fourth field
-         specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
-       - Number of bytes transferred to/from the disk by the group. These
-         are further divided by the type of operation - read or write, sync
-         or async. First two fields specify the major and minor number of the
-         device, third field specifies the operation type and the fourth field
-         specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
-       - Writing an int to this file will result in resetting all the stats
-         for that cgroup.
-
-CFQ sysfs tunable
-=================
-/sys/block/<disk>/queue/iosched/slice_idle
-------------------------------------------
-On faster hardware CFQ can be slow, especially with sequential workload.
-This happens because CFQ idles on a single queue and single queue might not
-drive deeper request queue depths to keep the storage busy. In such scenarios
-one can try setting slice_idle=0 and that would switch CFQ to IOPS
-(IO operations per second) mode on NCQ supporting hardware.
-
-That means CFQ will not idle between cfq queues of a cfq group and hence be
-able to drive higher queue depth and achieve better throughput. That also
-means that cfq provides fairness among groups in terms of IOPS and not in
-terms of disk time.
-
-/sys/block/<disk>/queue/iosched/group_idle
-------------------------------------------
-If one disables idling on individual cfq queues and cfq service trees by
-setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
-on the group in an attempt to provide fairness among groups.
-
-By default group_idle is same as slice_idle and does not do anything if
-slice_idle is enabled.
-
-One can experience an overall throughput drop if you have created multiple
-groups and put applications in those groups which are not driving enough
-IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
-on individual groups and throughput should improve.
-
-Writeback
-=========
-
-Page cache is dirtied through buffered writes and shared mmaps and
-written asynchronously to the backing filesystem by the writeback
-mechanism.  Writeback sits between the memory and IO domains and
-regulates the proportion of dirty memory by balancing dirtying and
-write IOs.
-
-On traditional cgroup hierarchies, relationships between different
-controllers cannot be established making it impossible for writeback
-to operate accounting for cgroup resource restrictions and all
-writeback IOs are attributed to the root cgroup.
-
-If both the blkio and memory controllers are used on the v2 hierarchy
-and the filesystem supports cgroup writeback, writeback operations
-correctly follow the resource restrictions imposed by both memory and
-blkio controllers.
-
-Writeback examines both system-wide and per-cgroup dirty memory status
-and enforces the more restrictive of the two.  Also, writeback control
-parameters which are absolute values - vm.dirty_bytes and
-vm.dirty_background_bytes - are distributed across cgroups according
-to their current writeback bandwidth.
-
-There's a peculiarity stemming from the discrepancy in ownership
-granularity between memory controller and writeback.  While memory
-controller tracks ownership per page, writeback operates on inode
-basis.  cgroup writeback bridges the gap by tracking ownership by
-inode but migrating ownership if too many foreign pages, pages which
-don't match the current inode ownership, have been encountered while
-writing back the inode.
-
-This is a conscious design choice as writeback operations are
-inherently tied to inodes making strictly following page ownership
-complicated and inefficient.  The only use case which suffers from
-this compromise is multiple cgroups concurrently dirtying disjoint
-regions of the same inode, which is an unlikely use case and decided
-to be unsupported.  Note that as memory controller assigns page
-ownership on the first use and doesn't update it until the page is
-released, even if cgroup writeback strictly follows page ownership,
-multiple cgroups dirtying overlapping areas wouldn't work as expected.
-In general, write-sharing an inode across multiple cgroups is not well
-supported.
-
-Filesystem support for cgroup writeback
----------------------------------------
-
-A filesystem can make writeback IOs cgroup-aware by updating
-address_space_operations->writepage[s]() to annotate bio's using the
-following two functions.
-
-* wbc_init_bio(@wbc, @bio)
-
-  Should be called for each bio carrying writeback data and associates
-  the bio with the inode's owner cgroup.  Can be called anytime
-  between bio allocation and submission.
-
-* wbc_account_io(@wbc, @page, @bytes)
-
-  Should be called for each data segment being written out.  While
-  this function doesn't care exactly when it's called during the
-  writeback session, it's the easiest and most natural to call it as
-  data segments are added to a bio.
-
-With writeback bio's annotated, cgroup support can be enabled per
-super_block by setting MS_CGROUPWB in ->s_flags.  This allows for
-selective disabling of cgroup writeback support which is helpful when
-certain filesystem features, e.g. journaled data mode, are
-incompatible.
-
-wbc_init_bio() binds the specified bio to its cgroup.  Depending on
-the configuration, the bio may be executed at a lower priority and if
-the writeback session is holding shared resources, e.g. a journal
-entry, may lead to priority inversion.  There is no one easy solution
-for the problem.  Filesystems can try to work around specific problem
-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
-directly.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt

deleted file mode 100644 (file)

index c6256ae..0000000
--- a/Documentation/cgroups/cgroups.txt
+++ /dev/null
@@ -1,682 +0,0 @@
-                               CGROUPS
-                               -------
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroups/cpusets.txt
-
-Original copyright statements from cpusets.txt:
-Portions Copyright (C) 2004 BULL SA.
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <clameter@sgi.com>
-
-CONTENTS:
-=========
-
-1. Control Groups
-  1.1 What are cgroups ?
-  1.2 Why are cgroups needed ?
-  1.3 How are cgroups implemented ?
-  1.4 What does notify_on_release do ?
-  1.5 What does clone_children do ?
-  1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Attaching processes
-  2.3 Mounting hierarchies by name
-3. Kernel API
-  3.1 Overview
-  3.2 Synchronization
-  3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy.  Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines:
-
-       CPU :          "Top cpuset"
-                       /       \
-               CPUSet1         CPUSet2
-                  |               |
-               (Professors)    (Students)
-
-               In addition (system tasks) are attached to topcpuset (so
-               that they can run anywhere) with a limit of 20%
-
-       Memory : Professors (50%), Students (30%), system (20%)
-
-       Disk : Professors (50%), Students (30%), system (20%)
-
-       Network : WWW browsing (20%), Network File System (60%), others (20%)
-                               / \
-               Professors (15%)  students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
-
-    # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class.  This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :))  OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of:
-
-       # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
-       (after some time)
-       # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
-   css_set.
-
- - A css_set contains a set of reference-counted pointers to
-   cgroup_subsys_state objects, one for each cgroup subsystem
-   registered in the system. There is no direct link from a task to
-   the cgroup of which it's a member in each hierarchy, but this
-   can be determined by following pointers through the
-   cgroup_subsys_state objects. This is because accessing the
-   subsystem state is something that's expected to happen frequently
-   and in performance-critical code, whereas operations that require a
-   task's actual cgroup assignments (in particular, moving between
-   cgroups) are less common. A linked list runs through the cg_list
-   field of each task_struct using the css_set, anchored at
-   css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
-   manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
-   css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel.  When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options.  By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup.  This list
-   is not guaranteed to be sorted.  Writing a thread ID into this file
-   moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup.  This list is
-   not guaranteed to be sorted or free of duplicate TGIDs, and userspace
-   should sort/uniquify the list if this property is required.
-   Writing a thread group ID into this file moves all threads in that
-   group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
-   exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command.  The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroup's
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks.  A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup.  This enables automatic
-removal of abandoned cgroups.  The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0).  The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
-    /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will set up a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
-
-  mount -t tmpfs cgroup_root /sys/fs/cgroup
-  mkdir /sys/fs/cgroup/cpuset
-  mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cgroup Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first.  For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?' you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
-
-# mount -t tmpfs cgroup_root /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommended
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
-  xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
-
-Now you want to do something with this cgroup.
-# cd my_cgroup
-
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-       ...
-# /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0:
-
-# echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy.  This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems.  Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a hierarchy a name.
-
-The name of the hierarchy appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroup.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
-  entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
-  no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
-  at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_subsys
-
-If a subsystem can be compiled as a module, it should also have in its
-module initcall a call to cgroup_load_subsys(), and in its exitcall a
-call to cgroup_unload_subsys(). It should also set its_subsys.module =
-THIS_MODULE in its .c file.
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or an
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-int css_online(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called after @cgrp has successfully completed all allocations and been made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-void css_offline(struct cgroup *cgrp);
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-void css_free(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's css_alloc() method has been called for the new cgroup).
-
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
-  - it's guaranteed that all are from the same thread group
-  - @tset contains all tasks from the thread group whether or not
-    they're switching cgroups
-  - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-void css_reset(struct cgroup_subsys_state *css)
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state.  This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it.  cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback; otherwise it is not
-necessary. This will be called only for subsystems whose can_attach() operations
-have succeeded. The parameters are identical to can_attach().
-
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-void fork(struct task_struct *task)
-
-Called when a task is forked into a cgroup.
-
-void exit(struct task_struct *task)
-
-Called during task exit.
-
-void free(struct task_struct *task)
-
-Called when the task_struct is freed.
-
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files.  The current supported types are:
-       - Trusted (XATTR_TRUSTED)
-       - Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum.  This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cgroup file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first one on the line really gets attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE PID.
-
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt

deleted file mode 100644 (file)

index 9d73cc0..0000000
--- a/Documentation/cgroups/cpuacct.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-CPU Accounting Controller
--------------------------
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
-  This is because percpu_counter_read() on 32bit systems isn't safe
-  against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
-  due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt

deleted file mode 100644 (file)

index fdf7dff..0000000
--- a/Documentation/cgroups/cpusets.txt
+++ /dev/null
@@ -1,839 +0,0 @@
-                               CPUSETS
-                               -------
-
-Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <clameter@sgi.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-CONTENTS:
-=========
-
-1. Cpusets
-  1.1 What are cpusets ?
-  1.2 Why are cpusets needed ?
-  1.3 How are cpusets implemented ?
-  1.4 What are exclusive cpusets ?
-  1.5 What is memory_pressure ?
-  1.6 What is memory spread ?
-  1.7 What is sched_load_balance ?
-  1.8 What is sched_relax_domain_level ?
-  1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
-  2.1 Basic Usage
-  2.2 Adding/removing cpus
-  2.3 Setting flags
-  2.4 Attaching processes
-3. Questions
-4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.   In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset.  They form a nested
-hierarchy visible in a virtual file system.  These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroups/cgroups.txt.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset.  The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
-    * Web Servers running multiple instances of the same web application,
-    * Servers running different applications (for instance, a web server
-      and a database), or
-    * NUMA systems running large HPC applications with demanding
-      performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs' pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets.  It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extend these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
-   kernel.
- - Each task in the system is attached to a cpuset, via a pointer
-   in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
-   allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
-   those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the system's CPUs and Memory
-   Nodes.
- - For any cpuset, one can define child cpusets containing a subset
-   of the parent's CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
-   browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
-   cpuset (except direct ancestors and descendants) may contain
-   any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
-   allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
-   the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
-   Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel.  No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
-
-  Cpus_allowed:   ffffffff,ffffffff,ffffffff,ffffffff
-  Cpus_allowed_list:      0-127
-  Mems_allowed:   ffffffff,ffffffff
-  Mems_allowed_list:      0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpuset's nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag:  is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command.  The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpuset's directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset.  A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parent's.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps an
-exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only.  The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users.  All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space.  This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset.  To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate at which the tasks in a cpuset are attempting to free up
-in-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure.  It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==> Unless this feature is enabled by writing "1" to the special file
-    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
-    code of __alloc_pages() for this metric reduces to simply noticing
-    that the cpuset_memory_pressure_enabled flag is zero.  So only
-    systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
-    Because this meter is per-cpuset, rather than per-task or mm,
-    the system load imposed by a batch scheduler monitoring this
-    metric is sharply reduced on large systems, because a scan of
-    the tasklist can be avoided on each set of queries.
-
-    Because this meter is a running average, instead of an accumulating
-    counter, a batch scheduler can detect memory pressure with a
-    single read, instead of having to read and accumulate results
-    for a period of time.
-
-    Because this meter is per-cpuset rather than per-task or mm,
-    the batch scheduler can obtain the key information, memory
-    pressure in a cpuset, with a single read, rather than having to
-    query and accumulate results over all the (dynamically changing)
-    set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures.  They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings.  If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files.  By default they contain "0", meaning that the feature is off
-for that cpuset.  If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset.  The page allocation calls for the page cache
-are modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple.  It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the job's cpuset in order to fit.  Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the job's cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks.  If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced.  So the scheduler
-has support to partition the system's CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
- 1) On large systems, load balancing across many CPUs is expensive.
-    If the system is managed using cpusets to place independent jobs
-    on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
-    system overhead on those CPUs, including avoiding task load
-    balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpuset's allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwise pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets.  Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest.  Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding.  So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both.  We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration.  If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above.  In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.)  When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can.  It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
-   and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, the scheduler tries to move the task to an idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and is idle,
-then the scheduler migrates task B to CPU Y so that task B can start on
-CPU Y without waiting for task A on CPU X.
-
-And if a CPU runs out of tasks in its runqueue, the CPU tries to pull
-extra tasks from other busy CPUs to help them before it goes
-idle.
-
-Of course it takes some searching cost to find movable tasks and/or
-idle CPUs, so the scheduler might not search all CPUs in the domain
-every time.  In fact, on some architectures, the searching ranges on
-events are limited to the same socket or node where the CPU is located,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X.  Even if CPU Z
-is idle while CPU X and its siblings are busy, the scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As a result, task B on CPU X needs to wait for task A or wait for load balance
-on the next tick.  For some applications in special situations, waiting
-1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like.  This file takes an int value which
-indicates the size of the searching range in levels, ideally as follows;
-otherwise it holds the initial value -1, which indicates the cpuset has no request.
-
-  -1  : no request. use system default or follow request of others.
-   0  : no search.
-   1  : search siblings (hyperthreads in a core).
-   2  : search cores in a package.
-   3  : search cpus in a node [= system wide on non-NUMA system]
-   4  : search nodes in a chunk of node [on NUMA system]
-   5  : search system wide [on NUMA system]
-
-The system default is architecture dependent.  The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affects the sched domain that the cpuset
-belongs to.  Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' has no effect since
-there is no sched domain belonging to the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used.  Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
- - The migration costs between each cpu can be assumed considerably
-   small (for you) due to your special application's behavior or
-   special hardware support for CPU cache etc.
- - The searching cost doesn't have an impact (for you) or you can make
-   the searching cost small enough by managing cpusets to be compact etc.
- - Low latency is required even if it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpuset's memory placement.  If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset.  If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change).  If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately.  Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately.  If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpuset's memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'cpuset.mems'.
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above.  If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus.  But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching.  In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs.  When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well.  In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above.  GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails.  If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it.  It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
-    the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
-    /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
-
-  mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
-  cd /sys/fs/cgroup/cpuset
-  mkdir Charlie
-  cd Charlie
-  /bin/echo 2-3 > cpuset.cpus
-  /bin/echo 1 > cpuset.mems
-  /bin/echo $$ > tasks
-  sh
-  # The subshell 'sh' is now running in cpuset Charlie
-  # The next line should display '/Charlie'
-  cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
- - via the cpuset file system directly, using the various cd, mkdir, echo,
-   cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
-   (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
-   (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset.  The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
-
-Now you want to do something with this cpuset.
-# cd my_cpuset
-
-In this directory you can find several files:
-# ls
-cgroup.clone_children  cpuset.memory_pressure
-cgroup.event_control   cpuset.memory_spread_page
-cgroup.procs           cpuset.memory_spread_slab
-cpuset.cpu_exclusive   cpuset.mems
-cpuset.cpus            cpuset.sched_load_balance
-cpuset.mem_exclusive   cpuset.sched_relax_domain_level
-cpuset.mem_hardwall    notify_on_release
-cpuset.memory_migrate  tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties.  By writing to these files you can manipulate
-the cpuset.
-
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
-
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command
-
-mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to
-
-mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
-
-# /bin/echo 1-4 > cpuset.cpus          -> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpuset.cpus      -> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
-
-# /bin/echo 1-4,6 > cpuset.cpus        -> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs:
-
-# /bin/echo "" > cpuset.cpus           -> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple:
-
-# /bin/echo 1 > cpuset.cpu_exclusive   -> set flag 'cpuset.cpu_exclusive'
-# /bin/echo 0 > cpuset.cpu_exclusive   -> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
-       ...
-# /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
-   errors. If you use it in the cpuset file system, you won't be
-   able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
-   put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt

deleted file mode 100644 (file)

index 3c1095c..0000000
--- a/Documentation/cgroups/devices.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-Device Whitelist Controller
-
-1. Description:
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files.  A device cgroup associates a device access
-whitelist with each cgroup.  A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block).  'all' means it applies
-to all types and all major and minor numbers.  Major and minor are
-either an integer or * for all.  Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'.  A child device
-cgroup gets a copy of the parent.  Administrators can then remove
-devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-
-An entry is added using devices.allow, and removed using
-devices.deny.  For instance
-
-       echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null.  Doing
-
-       echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing
-
-       echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-
-Any task can move itself between cgroups.  This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this.  We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD.  We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one.  Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup.  (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent.  Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated.  In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example:
-      A
-     / \
-        B
-
-    group        behavior      exceptions
-    A            allow         "b 8:* rwm", "c 116:1 rw"
-    B            deny          "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A:
-       # echo "c 116:* r" > A/devices.deny
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed:
-
-    group        whitelist entries                        denied devices
-    A            all                                      "b 8:* rwm", "c 116:* rw"
-    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated:
-      A
-     / \
-        B
-
-    group        whitelist entries                        denied devices
-    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-when adding "c *:3 rwm":
-       # echo "c *:3 rwm" >A/devices.allow
-
-the result:
-    group        whitelist entries                        denied devices
-    A            "c *:3 rwm", "c 1:5 r"                   all the rest
-    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
-
-but now it'll be possible to add new entries to B:
-       # echo "c 2:3 rwm" >B/devices.allow
-       # echo "c 50:3 r" >B/devices.allow
-or even
-       # echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroup has children.
-
-4.1 Hierarchy (internal implementation)
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions.  The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation.  Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt

deleted file mode 100644 (file)

index e831cb2..0000000
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The cgroup freezer is useful to batch job management systems which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells:
-
-       $ echo $$
-       16644
-       $ bash
-       $ echo $$
-       16690
-
-       From a second, unrelated bash shell:
-       $ kill -SIGSTOP 16690
-       $ kill -SIGCONT 16690
-
-       <at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
-  When read, returns the effective state of the cgroup - "THAWED",
-  "FREEZING" or "FROZEN". This is the combined self and parent-states.
-  If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
-  FREEZING cgroup transitions into FROZEN state when all tasks
-  belonging to the cgroup and its descendants become frozen. Note that
-  a cgroup reverts to FREEZING from FROZEN after a new task is added
-  to the cgroup or one of its descendant cgroups until the new task is
-  frozen.
-
-  When written, sets the self-state of the cgroup. Two values are
-  allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
-  if not already freezing, enters FREEZING state along with all its
-  descendant cgroups.
-
-  If THAWED is written, the self-state of the cgroup is changed to
-  THAWED.  Note that the effective state may not change to THAWED if
-  the parent-state is still freezing. If a cgroup's effective state
-  becomes THAWED, all its descendants which are freezing because of
-  the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
-  Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
-  This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
-  Shows the parent-state.  0 if none of the cgroup's ancestors is
-  frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage :
-
-   # mkdir /sys/fs/cgroup/freezer
-   # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
-   # mkdir /sys/fs/cgroup/freezer/0
-   # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem :
-
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-to freeze all tasks in the container :
-
-   # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FREEZING
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   FROZEN
-
-to unfreeze all tasks in the container :
-
-   # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
-   # cat /sys/fs/cgroup/freezer/0/freezer.state
-   THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt

deleted file mode 100644 (file)

index 106245c..0000000
--- a/Documentation/cgroups/hugetlb.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-HugeTLB Controller
--------------------
-
-The HugeTLB controller allows limiting the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that
-the application will get a SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how many
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files
-
- hugetlb.<hugepagesize>.limit_in_bytes     # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb  usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes     # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt                   # show the number of allocation failures due to HugeTLB limit
-
-For a system supporting two hugepage sizes (16M and 16G) the control
-files include:
-
-hugetlb.16GB.limit_in_bytes
-hugetlb.16GB.max_usage_in_bytes
-hugetlb.16GB.usage_in_bytes
-hugetlb.16GB.failcnt
-hugetlb.16MB.limit_in_bytes
-hugetlb.16MB.max_usage_in_bytes
-hugetlb.16MB.usage_in_bytes
-hugetlb.16MB.failcnt
diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt

deleted file mode 100644 (file)

index 8870b02..0000000
--- a/Documentation/cgroups/memcg_test.txt
+++ /dev/null
@@ -1,280 +0,0 @@
-Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2010/2
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on the API should be in Documentation/cgroups/memory.txt.
-
-0. How to record usage ?
-   2 objects are used.
-
-   page_cgroup ....an object per page.
-       Allocated at boot or memory hotplug. Freed at memory hot removal.
-
-   swap_cgroup ... an entry per swp_entry.
-       Allocated at swapon(). Freed at swapoff().
-
-   The page_cgroup has USED bit and double count against a page_cgroup never
-   occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-
-   a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
-       mem_cgroup_try_charge()
-
-2. Uncharge
-  a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
-       mem_cgroup_uncharge()
-         Called when a page's refcount goes down to 0.
-
-       mem_cgroup_uncharge_swap()
-         Called when swp_entry's refcnt goes down to 0. A charge against swap
-         disappears.
-
-3. charge-commit-cancel
-       Memcg pages are charged in two steps:
-               mem_cgroup_try_charge()
-               mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
-       At try_charge(), there are no flags to say "this page is charged".
-       at this point, usage += PAGE_SIZE.
-
-       At commit(), the page is associated with the memcg.
-
-       At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
-       Anonymous page is newly allocated at
-                 - page fault into MAP_ANONYMOUS mapping.
-                 - Copy-On-Write.
-
-       4.1 Swap-in.
-       At swap-in, the page is taken from swap-cache. There are 2 cases.
-
-       (a) If the SwapCache is newly allocated and read, it has no charges.
-       (b) If the SwapCache has been mapped by processes, it has been
-           charged already.
-
-       4.2 Swap-out.
-       At swap-out, typical state transition is below.
-
-       (a) add to swap cache. (marked as SwapCache)
-           swp_entry's refcnt += 1.
-       (b) fully unmapped.
-           swp_entry's refcnt += # of ptes.
-       (c) write back to swap.
-       (d) delete from swap cache. (remove from SwapCache)
-           swp_entry's refcnt -= 1.
-
-
-       Finally, at task exit,
-       (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
-       Page Cache is charged at
-       - add_to_page_cache_locked().
-
-       The logic is very clear. (About migration, see below)
-       Note: __remove_from_page_cache() is called by remove_from_page_cache()
-       and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
-       The best way to understand shmem's page state transition is to read
-       mm/shmem.c.
-       But brief explanation of the behavior of memcg around shmem will be
-       helpful to understand the logic.
-
-       Shmem's page (just leaf page, not direct/indirect block) can be on
-               - radix-tree of shmem's inode.
-               - SwapCache.
-               - Both on radix-tree and SwapCache. This happens at swap-in
-                 and swap-out,
-
-       It's charged when...
-       - A new page is added to shmem's radix-tree.
-       - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-
-       mem_cgroup_migrate()
-
-8. LRU
-        Each memcg has its own private LRU. Now, its handling is under global
-       VM's control (means that it's handled under global zone->lru_lock).
-       Almost all routines around memcg's LRU is called by global LRU's
-       list management functions under zone->lru_lock().
-
-       A special function is mem_cgroup_isolate_pages(). This scans
-       memcg's private LRU and call __isolate_lru_page() to extract a page
-       from LRU.
-       (By __isolate_lru_page(), the page is removed from both of global and
-        private LRU.)
-
-
-9. Typical Tests.
-
- Tests for racy cases.
-
- 9.1 Small limit to memcg.
-       When you do test to do racy case, it's good test to set memcg's limit
-       to be very small rather than GB. Many races found in the test under
-       xKB or xxMB limits.
-       (Memory behavior under GB and Memory behavior under MB shows very
-        different situation.)
-
- 9.2 Shmem
-       Historically, memcg's shmem handling was poor and we saw some amount
-       of troubles here. This is because shmem is page-cache but can be
-       SwapCache. Test with shmem/tmpfs is always good test.
-
- 9.3 Migration
-       For NUMA, migration is another special case. For an easy test, cpuset
-       is useful. Following is a sample script to do migration.
-
-       mount -t cgroup -o cpuset none /opt/cpuset
-
-       mkdir /opt/cpuset/01
-       echo 1 > /opt/cpuset/01/cpuset.cpus
-       echo 0 > /opt/cpuset/01/cpuset.mems
-       echo 1 > /opt/cpuset/01/cpuset.memory_migrate
-       mkdir /opt/cpuset/02
-       echo 1 > /opt/cpuset/02/cpuset.cpus
-       echo 1 > /opt/cpuset/02/cpuset.mems
-       echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
-       In the above setup, when you move a task from 01 to 02, page migration
-       from node 0 to node 1 will occur. Following is a script to migrate all
-       tasks under a cpuset.
-       --
-       move_task()
-       {
-       for pid in $1
-        do
-                /bin/echo $pid >$2/tasks 2>/dev/null
-               echo -n $pid
-               echo -n " "
-        done
-       echo END
-       }
-
-       G1_TASK=`cat ${G1}/tasks`
-       G2_TASK=`cat ${G2}/tasks`
-       move_task "${G1_TASK}" ${G2} &
-       --
- 9.4 Memory hotplug.
-       memory hotplug test is one of good test.
-       to offline memory, do following.
-       # echo offline > /sys/devices/system/memory/memoryXXX/state
-       (XXX is the place of memory)
-       This is an easy way to test page migration, too.
-
- 9.5 mkdir/rmdir
-       When using hierarchy, mkdir/rmdir test should be done.
-       Use tests like the following.
-
-       echo 1 >/opt/cgroup/01/memory/use_hierarchy
-       mkdir /opt/cgroup/01/child_a
-       mkdir /opt/cgroup/01/child_b
-
-       set limit to 01.
-       add limit to 01/child_b
-       run jobs under child_a and child_b
-
-       create/delete following groups at random while jobs are running.
-       /opt/cgroup/01/child_a/child_aa
-       /opt/cgroup/01/child_b/child_bb
-       /opt/cgroup/01/child_c
-
-       running new jobs in new group is also good.
-
- 9.6 Mount with other subsystems.
-       Mounting with other subsystems is a good test because there is a
-       race and lock dependency with other cgroup subsystems.
-
-       example)
-       # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
-       and do task move, mkdir, rmdir etc...under this.
-
- 9.7 swapoff.
-       Besides management of swap being one of the complicated parts of memcg,
-       the call path of swap-in at swapoff is not the same as the usual swap-in
-       path. It's worth testing explicitly.
-
-       For example, test like following is good.
-       (Shell-A)
-       # mount -t cgroup none /cgroup -o memory
-       # mkdir /cgroup/test
-       # echo 40M > /cgroup/test/memory.limit_in_bytes
-       # echo 0 > /cgroup/test/tasks
-       Run malloc(100M) program under this. You'll see 60M of swaps.
-       (Shell-B)
-       # move all tasks in /cgroup/test to /cgroup
-       # /sbin/swapoff -a
-       # rmdir /cgroup/test
-       # kill malloc task.
-
-       Of course, tmpfs v.s. swapoff test should be tested, too.
-
- 9.8 OOM-Killer
-       Out-of-memory caused by memcg's limit will kill tasks under
-       the memcg. When hierarchy is used, a task under hierarchy
-       will be killed by the kernel.
-       In this case, panic_on_oom shouldn't be invoked and tasks
-       in other groups shouldn't be killed.
-
-       It's not difficult to cause OOM under memcg as following.
-       Case A) when you can swapoff
-       #swapoff -a
-       #echo 50M > /memory.limit_in_bytes
-       run 51M of malloc
-
-       Case B) when you use mem+swap limitation.
-       #echo 50M > memory.limit_in_bytes
-       #echo 50M > memory.memsw.limit_in_bytes
-       run 51M of malloc
-
- 9.9 Move charges at task migration
-       Charges associated with a task can be moved along with task migration.
-
-       (Shell-A)
-       #mkdir /cgroup/A
-       #echo $$ >/cgroup/A/tasks
-       run some programs which uses some amount of memory in /cgroup/A.
-
-       (Shell-B)
-       #mkdir /cgroup/B
-       #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
-       #echo "pid of the program running in group A" >/cgroup/B/tasks
-
-       You can see charges have been moved by reading *.usage_in_bytes or
-       memory.stat of both A and B.
-       See 8.2 of Documentation/cgroups/memory.txt to see what value should be
-       written to move_charge_at_immigrate.
-
- 9.10 Memory thresholds
-       Memory controller implements memory thresholds using cgroups notification
-       API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
-       (Shell-A) Create cgroup and run event listener
-       # mkdir /cgroup/A
-       # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
-       (Shell-B) Add task to cgroup and try to allocate and free memory
-       # echo $$ >/cgroup/A/tasks
-       # a="$(dd if=/dev/zero bs=1M count=10)"
-       # a=
-
-       You will see message from cgroup_event_listener every time you cross
-       the thresholds.
-
-       Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
-       It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt

deleted file mode 100644 (file)

index ff71e16..0000000
--- a/Documentation/cgroups/memory.txt
+++ /dev/null
@@ -1,876 +0,0 @@
-Memory Resource Controller
-
-NOTE: This document is hopelessly outdated and it asks for a complete
-      rewrite. It still contains useful information, so we are keeping it
-      here but make sure to check the current code if you need a deeper
-      understanding.
-
-NOTE: The Memory Resource Controller has generically been referred to as the
-      memory controller in this document. Do not confuse memory controller
-      used here with the memory controller that is used in hardware.
-
-(For editors)
-In this document:
-      When we mention a cgroup (cgroupfs's directory) with memory controller,
-      we call it "memory cgroup". When you see git-log and source code, you'll
-      see patch's title and function names tend to use "memcg".
-      In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
-   Memory-hungry applications can be isolated and limited to a smaller
-   amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
-   as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
-   to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
-   rest of the system to ensure that burning does not fail due to lack
-   of available memory.
-e. There are several other use cases; find one or use the controller just
-   for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basic functionality. (See Section 2.7)
-
-Brief summary of control files.
-
- tasks                          # attach a task(thread) and show list of threads
- cgroup.procs                   # show list of processes
- cgroup.event_control           # an interface for event_fd()
- memory.usage_in_bytes          # show current usage for memory
-                                (See 5.5 for details)
- memory.memsw.usage_in_bytes    # show current usage for memory+Swap
-                                (See 5.5 for details)
- memory.limit_in_bytes          # set/show limit of memory usage
- memory.memsw.limit_in_bytes    # set/show limit of memory+Swap usage
- memory.failcnt                         # show the number of memory usage hits limits
- memory.memsw.failcnt           # show the number of memory+Swap hits limits
- memory.max_usage_in_bytes      # show max memory usage recorded
- memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
- memory.soft_limit_in_bytes     # set/show soft limit of memory usage
- memory.stat                    # show various statistics
- memory.use_hierarchy           # set/show hierarchical account enabled
- memory.force_empty             # trigger forced move charge to parent
- memory.pressure_level          # set memory pressure notifications
- memory.swappiness              # set/show swappiness parameter of vmscan
-                                (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate # set/show controls of moving charges
- memory.oom_control             # set/show oom controls.
- memory.numa_stat               # show the number of memory usage per numa node
-
- memory.kmem.limit_in_bytes      # set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes      # show current kernel memory allocation
- memory.kmem.failcnt             # show the number of kernel memory usage hits limits
- memory.kmem.max_usage_in_bytes  # show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes  # show current tcp buf memory allocation
- memory.kmem.tcp.failcnt            # show the number of tcp buf memory usage hits limits
- memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
-
-1. History
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
-
-               +--------------------+
-               |  mem_cgroup        |
-               |  (page_counter)    |
-               +--------------------+
-                /            ^      \
-               /             |       \
-           +---------------+  |        +---------------+
-           | mm_struct     |  |....    | mm_struct     |
-           |               |  |        |               |
-           +---------------+  |        +---------------+
-                              |
-                              + --------------+
-                                              |
-           +---------------+           +------+--------+
-           | page          +---------->  page_cgroup|
-           |               |           |               |
-           +---------------+           +---------------+
-
-             (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
-   cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-* why 'memory+swap' rather than swap.
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-* What happens when a cgroup hits memory.memsw.limit_in_bytes
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE: Reclaim does not work for the root cgroup, since we cannot set any
-limits on the root cgroup.
-
-Note2: When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
-
-   lock_page_cgroup()/unlock_page_cgroup() should not be called under
-   mapping->tree_lock.
-
-   Other lock order is following:
-   PG_locked.
-   mm->page_table_lock
-       zone->lru_lock
-         lock_page_cgroup.
-  In many cases, just lock_page_cgroup() is called.
-  per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  zone->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory won't be accounted at all until limit on a group is set. This
-allows for existing setups to continue working without disruption.  The limit
-cannot be set if the cgroup has children, or if there are already tasks in the
-cgroup. Attempting to set the limit under those conditions will return -EBUSY.
-When use_hierarchy == 1 and a group is accounted, its children will
-automatically be accounted regardless of their limit value.
-
-After a group is first limited, it will be kept being accounted until it
-is removed. The memory limitation itself, can of course be removed by writing
--1 to memory.kmem.limit_in_bytes. In this case, kmem will be accounted, but not
-limited.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
-
-* stack pages: every process consumes some stack pages. By accounting into
-kernel memory, we prevent new processes from being created when the kernel
-memory usage is too high.
-
-* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
-of each kmem_cache is created when the cache is touched for the first time
-from inside the memcg. The creation is done lazily, so some objects can still be
-skipped while the cache is being created. All objects in a slab page should
-belong to the same memcg. This only fails to hold when a task is migrated to a
-different memcg during the page allocation by the cache.
-
-* sockets memory pressure: some sockets protocols have memory pressure
-thresholds. The Memory Controller allows them to be controlled individually
-per cgroup, instead of globally.
-
-* tcp memory pressure: sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
-    U != 0, K = unlimited:
-    This is the standard memcg limitation mechanism already present before kmem
-    accounting. Kernel memory is completely ignored.
-
-    U != 0, K < U:
-    Kernel memory is a subset of the user memory. This setup is useful in
-    deployments where the total amount of memory per-cgroup is overcommitted.
-    Overcommitting kernel memory limits is definitely not recommended, since the
-    box can still run out of non-reclaimable memory.
-    In this case, the admin could set up K so that the sum of all groups is
-    never greater than the total memory, and freely set U at the cost of his
-    QoS.
-    WARNING: In the current implementation, memory reclaim will NOT be
-    triggered for a cgroup when it hits K while staying below U, which makes
-    this setup impractical.
-
-    U != 0, K >= U:
-    Since kmem charges will also be fed to the user counter, reclaim will be
-    triggered for the cgroup for both kinds of memory. This setup gives the
-    admin a unified view of memory, and it is also useful for people who just
-    want to track kernel memory usage.
-
-3. User Interface
-
-3.0. Configuration
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
-# mount -t tmpfs none /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/memory
-# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it
-# mkdir /sys/fs/cgroup/memory/0
-# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit:
-# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
-
-NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
-NOTE: We cannot set limits on the root cgroup any more.
-
-# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-4194304
-
-We can check the usage:
-# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel.
-
-# echo 1 > memory.limit_in_bytes
-# cat memory.limit_in_bytes
-4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. At measuring parallel
-page fault test, multi-process test may be better than multi-thread
-test because it has noise of shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information are not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces.
-
-5.1 force_empty
-  memory.force_empty interface is provided to make cgroup's memory usage empty.
-  When writing anything to this file, for example
-
-  # echo 0 > memory.force_empty
-
-  the cgroup will be reclaimed and as many pages reclaimed as possible.
-
-  The typical use case for this interface is before calling rmdir().
-  Because rmdir() moves all pages to parent, some out-of-use page caches can be
-  moved to the parent. If you want to avoid that, force_empty will be useful.
-
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
-  About use_hierarchy, see Section 6.
-
-5.2 stat file
-
-memory.stat file includes following statistics
-
-# per-memory cgroup local status
-cache          - # of bytes of page cache memory.
-rss            - # of bytes of anonymous and swap cache memory (includes
-               transparent hugepages).
-rss_huge       - # of bytes of anonymous transparent hugepages.
-mapped_file    - # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin         - # of charging events to the memory cgroup. The charging
-               event happens each time a page is accounted as either mapped
-               anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout                - # of uncharging events to the memory cgroup. The uncharging
-               event happens each time a page is unaccounted from the cgroup.
-swap           - # of bytes of swap usage
-dirty          - # of bytes that are waiting to get written back to the disk.
-writeback      - # of bytes of file/anon cache that are queued for syncing to
-               disk.
-inactive_anon  - # of bytes of anonymous and swap cache memory on inactive
-               LRU list.
-active_anon    - # of bytes of anonymous and swap cache memory on active
-               LRU list.
-inactive_file  - # of bytes of file-backed memory on inactive LRU list.
-active_file    - # of bytes of file-backed memory on active LRU list.
-unevictable    - # of bytes of memory that cannot be reclaimed (mlocked etc).
-
-# status considering hierarchy (see memory.use_hierarchy settings)
-
-hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
-                       under which the memory cgroup is
-hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
-                       hierarchy under which memory cgroup is.
-
-total_<counter>                - # hierarchical version of <counter>, which in
-                       addition to the cgroup's own value includes the
-                       sum of all hierarchical children's values of
-                       <counter>, i.e. total_cache
-
-# The following additional stats are dependent on CONFIG_DEBUG_VM.
-
-recent_rotated_anon    - VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file    - VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon    - VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file    - VM internal parameter. (see mm/vmscan.c)
-
-Memo:
-       recent_rotated means recent frequency of LRU rotation.
-       recent_scanned means recent # of scans to LRU.
-       showing for better debug please see the code for meanings.
-
-Note:
-       Only anonymous and swap cache memory is listed as part of 'rss' stat.
-       This should not be confused with the true 'resident set size' or the
-       amount of physical memory used by the cgroup.
-       'rss + mapped_file' will give you resident set size of cgroup.
-       (Note: file and shmem may be shared among other cgroups. In that case,
-        mapped_file is accounted only when the memory cgroup is owner of page
-        cache.)
-
-5.3 swappiness
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file.
-# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzzy
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
-
-This is similar to numa_maps but operates on a per-memcg basis.  This is
-useful for providing visibility into the numa locality information within
-a memcg since the pages are allowed to be allocated from any physical
-node.  One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is:
-
-total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
-hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy
-
-              root
-            /  |   \
-            /  |    \
-          a    b     c
-                     | \
-                     |  \
-                     d   e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
-
-# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by
-
-# echo 0 > memory.use_hierarchy
-
-NOTE1: Enabling/disabling will fail if either the cgroup already has other
-       cgroups created below it, or if the parent cgroup has use_hierarchy
-       enabled.
-
-NOTE2: When panic_on_oom is set to "2", the whole system will panic in
-       case of an OOM event in any cgroup.
-
-7. Soft limits
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)
-
-# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use
-
-# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1: Soft limits take effect over a long period of time, since they involve
-       reclaiming memory for balancing between memory cgroups
-NOTE2: It is recommended to set the soft limit always below the hard limit,
-       otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it:
-
-# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note: Each bit of move_charge_at_immigrate has its own meaning about what type
-      of charges should be moved. See 8.2 for details.
-Note: Charges are moved only when you move mm->owner, in other words,
-      a leader of a thread group.
-Note: If we cannot find enough space for the task in the destination cgroup, we
-      try to make space by reclaiming memory. Task migration may fail if we
-      cannot make enough space.
-Note: It can take several seconds if you move charges much.
-
-And if you want to disable it again:
-
-# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
-  bit | what type of charges would be moved ?
- -----+------------------------------------------------------------------------
-   0  | A charge of an anonymous page (or swap of it) used by the target task.
-      | You must enable Swap Extension (see 2.4) to enable move of swap charges.
- -----+------------------------------------------------------------------------
-   1  | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
-      | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
-      | anonymous pages, file pages (and swaps) in the range mmapped by the task
-      | will be moved even if the task hasn't done page fault, i.e. they might
-      | not be the task's "RSS", but other task's "RSS" that maps the same file.
-      | And mapcount of the page is ignored (the page can be moved even if
-      | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
-      | enable move of swap charges.
-
-8.3 TODO
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
-  behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows an application to register multiple memory
-and memsw thresholds and to get notified when usage crosses them.
-
-To register a threshold, an application must:
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
-  cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows an application to register multiple OOM
-notification deliveries and to get notified when OOM happens.
-
-To register a notifier, an application must:
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
-   cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
-       #echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
-       * enlarge limit or reduce usage.
-To reduce usage,
-       * kill some tasks.
-       * move some tasks to other group with account migration.
-       * remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
-       oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
-       under_oom        0 or 1 (if 1, the memory cgroup is under OOM, tasks may
-                                be stopped.)
-
-11. Memory Pressure
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as following:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to run out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-The events are propagated upward until the event is handled, i.e. the
-events are not pass-through. Here is what this means: for example you have
-three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
-and C, and suppose group C experiences some pressure. In this situation,
-only group C will receive the notification, i.e. groups A and B will not
-receive it. This is done to avoid excessive "broadcasting" of messages,
-which disturbs the system and which is especially bad if we are low on
-memory or thrashing. So, organize the cgroups wisely, or propagate the
-events manually (or, ask us to implement the pass-through events,
-explaining why would you need them.)
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string like "<event_fd> <fd of memory.pressure_level> <level>"
-  to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are not implemented.
-
-Test:
-
-   Here is a small script example that makes a new cgroup, sets up a
-   memory limit, sets up a notification in the cgroup and then makes child
-   cgroup experience a critical pressure:
-
-   # cd /sys/fs/cgroup/memory/
-   # mkdir foo
-   # cd foo
-   # cgroup_event_listener memory.pressure_level low &
-   # echo 8000000 > memory.limit_in_bytes
-   # echo 8000000 > memory.memsw.limit_in_bytes
-   # echo $$ > tasks
-   # dd if=/dev/zero | read x
-
-   (Expect a bunch of notifications, and eventually, the oom-killer will
-   trigger.)
-
-12. TODO
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
-   not yet hit but the usage is getting closer
-
-Summary
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
-   http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
-   http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
-   http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
-   http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
-   subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
-   http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
-   http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
-    http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
-    http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
-    http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroups/net_cls.txt b/Documentation/cgroups/net_cls.txt

deleted file mode 100644 (file)

index ec18234..0000000
--- a/Documentation/cgroups/net_cls.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Network classifier cgroup
--------------------------
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example:
-mkdir /sys/fs/cgroup/net_cls
-mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-mkdir /sys/fs/cgroup/net_cls/0
-echo 0x100001 >  /sys/fs/cgroup/net_cls/0/net_cls.classid
-       - setting a 10:1 handle.
-
-cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-1048577
-
-configuring tc:
-tc qdisc add dev eth0 root handle 10: htb
-
-tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
- - creating traffic class 10:1
-
-tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example:
-iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt

deleted file mode 100644 (file)

index a82cbd2..0000000
--- a/Documentation/cgroups/net_prio.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Network priority cgroup
--------------------------
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option.  This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
-   decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'.  This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-This file is read-only, and is simply informative.  It contains a unique integer
-value that the kernel uses as an internal representation of this cgroup.
-
-net_prio.ifpriomap
-This file contains a map of the priorities assigned to traffic originating from
-processes in this group and egressing the system on various interfaces. It
-contains a list of tuples in the form <ifname priority>.  Contents of this file
-can be modified by echoing a string into the file using the same tuple format.
-for example:
-
-echo "eth0 5" > /sys/fs/cgroup/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt

deleted file mode 100644 (file)

index 1a078b5..0000000
--- a/Documentation/cgroups/pids.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-                                                  Process Number Controller
-                                                  =========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-Example
--------
-
-First, we mount the pids controller:
-# mkdir -p /sys/fs/cgroup/pids
-# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it:
-# mkdir -p /sys/fs/cgroup/pids/parent/child
-# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail:
-
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's):
-
-# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.max
-max
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current):
-
-# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-#
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt

deleted file mode 100644 (file)

index 781b1d4..0000000
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ /dev/null
@@ -1,647 +0,0 @@
-
-Cgroup unified hierarchy
-
-April, 2014            Tejun Heo <tj@kernel.org>
-
-This document describes the changes made by unified hierarchy and
-their rationales.  It will eventually be merged into the main cgroup
-documentation.
-
-CONTENTS
-
-1. Background
-2. Basic Operation
-  2-1. Mounting
-  2-2. cgroup.subtree_control
-  2-3. cgroup.controllers
-3. Structural Constraints
-  3-1. Top-down
-  3-2. No internal tasks
-4. Delegation
-  4-1. Model of delegation
-  4-2. Common ancestor rule
-5. Other Changes
-  5-1. [Un]populated Notification
-  5-2. Other Core Changes
-  5-3. Controller File Conventions
-    5-3-1. Format
-    5-3-2. Control Knobs
-  5-4. Per-Controller Changes
-    5-4-1. io
-    5-4-2. cpuset
-    5-4-3. memory
-6. Planned Changes
-  6-1. CAP for resource control
-
-
-1. Background
-
-cgroup allows an arbitrary number of hierarchies and each hierarchy
-can host any number of controllers.  While this seems to provide a
-high level of flexibility, it isn't quite useful in practice.
-
-For example, as there is only one instance of each controller, utility
-type controllers such as freezer which can be useful in all
-hierarchies can only be used in one.  The issue is exacerbated by the
-fact that controllers can't be moved around once hierarchies are
-populated.  Another issue is that all controllers bound to a hierarchy
-are forced to have exactly the same view of the hierarchy.  It isn't
-possible to vary the granularity depending on the specific controller.
-
-In practice, these issues heavily limit which controllers can be put
-on the same hierarchy and most configurations resort to putting each
-controller on its own hierarchy.  Only closely related ones, such as
-the cpu and cpuacct controllers, make sense to put on the same
-hierarchy.  This often means that userland ends up managing multiple
-similar hierarchies repeating the same steps on each hierarchy
-whenever a hierarchy management operation is necessary.
-
-Unfortunately, support for multiple hierarchies comes at a steep cost.
-Internal implementation in cgroup core proper is dazzlingly
-complicated but more importantly the support for multiple hierarchies
-restricts how cgroup is used in general and what controllers can do.
-
-There's no limit on how many hierarchies there may be, which means
-that a task's cgroup membership can't be described in finite length.
-The key may contain any varying number of entries and is unlimited in
-length, which makes it highly awkward to handle and leads to addition
-of controllers which exist only to identify membership, which in turn
-exacerbates the original problem.
-
-Also, as a controller can't have any expectation regarding what shape
-of hierarchies other controllers would be on, each controller has to
-assume that all other controllers are operating on completely
-orthogonal hierarchies.  This makes it impossible, or at least very
-cumbersome, for controllers to cooperate with each other.
-
-In most use cases, putting controllers on hierarchies which are
-completely orthogonal to each other isn't necessary.  What usually is
-called for is the ability to have differing levels of granularity
-depending on the specific controller.  In other words, hierarchy may
-be collapsed from leaf towards root when viewed from specific
-controllers.  For example, a given configuration might not care about
-how memory is distributed beyond a certain level while still wanting
-to control how CPU cycles are distributed.
-
-Unified hierarchy is the next version of cgroup interface.  It aims to
-address the aforementioned issues by having more structure while
-retaining enough flexibility for most use cases.  Various other
-general and controller-specific interface issues are also addressed in
-the process.
-
-
-2. Basic Operation
-
-2-1. Mounting
-
-Currently, unified hierarchy can be mounted with the following mount
-command.  Note that this is still under development and scheduled to
-change soon.
-
- mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
-
-All controllers which support the unified hierarchy and are not bound
-to other hierarchies are automatically bound to unified hierarchy and
-show up at the root of it.  Controllers which are enabled only in the
-root of unified hierarchy can be bound to other hierarchies.  This
-allows mixing unified hierarchy with the traditional multiple
-hierarchies in a fully backward compatible way.
-
-A controller can be moved across hierarchies only after the controller
-is no longer referenced in its current hierarchy.  Because per-cgroup
-controller states are destroyed asynchronously and controllers may
-have lingering references, a controller may not show up immediately on
-the unified hierarchy after the final umount of the previous
-hierarchy.  Similarly, a controller should be fully disabled to be
-moved out of the unified hierarchy and it may take some time for the
-disabled controller to become available for other hierarchies;
-furthermore, due to dependencies among controllers, other controllers
-may need to be disabled too.
-
-While useful for development and manual configurations, dynamically
-moving controllers between the unified and other hierarchies is
-strongly discouraged for production use.  It is recommended to decide
-the hierarchies and controller associations before starting using the
-controllers.
-
-
-2-2. cgroup.subtree_control
-
-All cgroups on unified hierarchy have a "cgroup.subtree_control" file
-which governs which controllers are enabled on the children of the
-cgroup.  Let's assume a hierarchy like the following.
-
-  root - A - B - C
-               \ D
-
-root's "cgroup.subtree_control" file determines which controllers are
-enabled on A.  A's on B.  B's on C and D.  This coincides with the
-fact that controllers on the immediate sub-level are used to
-distribute the resources of the parent.  In fact, it's natural to
-assume that resource control knobs of a child belong to its parent.
-Enabling a controller in a "cgroup.subtree_control" file declares that
-distribution of the respective resources of the cgroup will be
-controlled.  Note that this means that controller enable states are
-shared among siblings.
-
-When read, the file contains a space-separated list of currently
-enabled controllers.  A write to the file should contain a
-space-separated list of controllers with '+' or '-' prefixed (without
-the quotes).  Controllers prefixed with '+' are enabled and '-'
-disabled.  If a controller is listed multiple times, the last entry
-wins.  The specific operations are executed atomically - either all
-succeed or fail.
-
-
-2-3. cgroup.controllers
-
-Read-only "cgroup.controllers" file contains a space-separated list of
-controllers which can be enabled in the cgroup's
-"cgroup.subtree_control" file.
-
-In the root cgroup, this lists controllers which are not bound to
-other hierarchies and the content changes as controllers are bound to
-and unbound from other hierarchies.
-
-In non-root cgroups, the content of this file equals that of the
-parent's "cgroup.subtree_control" file as only controllers enabled
-from the parent can be used in its children.
-
-
-3. Structural Constraints
-
-3-1. Top-down
-
-As it doesn't make sense to nest control of an uncontrolled resource,
-all non-root "cgroup.subtree_control" files can only contain
-controllers which are enabled in the parent's "cgroup.subtree_control"
-file.  A controller can be enabled only if the parent has the
-controller enabled and a controller can't be disabled if one or more
-children have it enabled.
-
-
-3-2. No internal tasks
-
-One long-standing issue that cgroup faces is the competition between
-tasks belonging to the parent cgroup and its children cgroups.  This
-is inherently nasty as two different types of entities compete and
-there is no agreed-upon obvious way to handle it.  Different
-controllers are doing different things.
-
-The cpu controller considers tasks and cgroups as equivalents and maps
-nice levels to cgroup weights.  This works for some cases but falls
-flat when children should be allocated specific ratios of CPU cycles
-and the number of internal tasks fluctuates - the ratios constantly
-change as the number of competing entities fluctuates.  There also are
-other issues.  The mapping from nice level to weight isn't obvious or
-universal, and there are various other knobs which simply aren't
-available for tasks.
-
-The io controller implicitly creates a hidden leaf node for each
-cgroup to host the tasks.  The hidden leaf has its own copies of all
-the knobs with "leaf_" prefixed.  While this allows equivalent control
-over internal tasks, it's with serious drawbacks.  It always adds an
-extra layer of nesting which may not be necessary, makes the interface
-messy and significantly complicates the implementation.
-
-The memory controller currently doesn't have a way to control what
-happens between internal tasks and child cgroups and the behavior is
-not clearly defined.  There have been attempts to add ad-hoc behaviors
-and knobs to tailor the behavior to specific workloads.  Continuing
-this direction will lead to problems which will be extremely difficult
-to resolve in the long term.
-
-Multiple controllers struggle with internal tasks and came up with
-different ways to deal with it; unfortunately, all the approaches in
-use now are severely flawed and, furthermore, the widely different
-behaviors make cgroup as a whole highly inconsistent.
-
-It is clear that this is something which needs to be addressed from
-cgroup core proper in a uniform way so that controllers don't need to
-worry about it and cgroup as a whole shows a consistent and logical
-behavior.  To achieve that, unified hierarchy enforces the following
-structural constraint:
-
- Except for the root, only cgroups which don't contain any task may
- have controllers enabled in their "cgroup.subtree_control" files.
-
-Combined with other properties, this guarantees that, when a
-controller is looking at the part of the hierarchy which has it
-enabled, tasks are always only on the leaves.  This rules out
-situations where child cgroups compete against internal tasks of the
-parent.
-
-There are two things to note.  Firstly, the root cgroup is exempt from
-the restriction.  Root contains tasks and anonymous resource
-consumption which can't be associated with any other cgroup and
-requires special treatment from most controllers.  How resource
-consumption in the root cgroup is governed is up to each controller.
-
-Secondly, the restriction doesn't take effect if there is no enabled
-controller in the cgroup's "cgroup.subtree_control" file.  This is
-important as otherwise it wouldn't be possible to create children of a
-populated cgroup.  To control resource distribution of a cgroup, the
-cgroup must create children and transfer all its tasks to the children
-before enabling controllers in its "cgroup.subtree_control" file.
-
-
-4. Delegation
-
-4-1. Model of delegation
-
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" file to the user.  Note
-that the resource control knobs in a given directory concern the
-resources of the parent and thus must not be delegated along with the
-directory.
-
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it got from the parent.  The limits and other settings of all resource
-controllers are hierarchical and regardless of what happens in the
-delegated sub-hierarchy, nothing can escape the resource restrictions
-imposed by the parent.
-
-Currently, cgroup doesn't impose any restrictions on the number of
-cgroups in or nesting depth of a delegated sub-hierarchy; however,
-this may in the future be limited explicitly.
-
-
-4-2. Common ancestor rule
-
-On the unified hierarchy, to write to a "cgroup.procs" file, in
-addition to the usual write permission to the file and uid match, the
-writer must also have write access to the "cgroup.procs" file of the
-common ancestor of the source and destination cgroups.  This prevents
-delegatees from smuggling processes across disjoint sub-hierarchies.
-
-Let's say cgroups C0 and C1 have been delegated to user U0 who created
-C00, C01 under C0 and C10 under C1 as follows.
-
- ~~~~~~~~~~~~~ - C0 - C00
- ~ cgroup    ~      \ C01
- ~ hierarchy ~
- ~~~~~~~~~~~~~ - C1 - C10
-
-C0 and C1 are separate entities in terms of resource distribution
-regardless of their relative positions in the hierarchy.  The
-resources the processes under C0 are entitled to are controlled by
-C0's ancestors and may be completely different from C1.  It's clear
-that the intention of delegating C0 to U0 is allowing U0 to organize
-the processes under C0 and further control the distribution of C0's
-resources.
-
-On traditional hierarchies, if a task has write access to "tasks" or
-"cgroup.procs" file of a cgroup and its uid agrees with the target, it
-can move the target to the cgroup.  In the above example, U0 will not
-only be able to move processes in each sub-hierarchy but also across
-the two sub-hierarchies, effectively allowing it to violate the
-organizational and resource restrictions implied by the hierarchical
-structure above C0 and C1.
-
-On the unified hierarchy, let's say U0 wants to write the pid of a
-process which has a matching uid and is currently in C10 into
-"C00/cgroup.procs".  U0 obviously has write access to the file and
-migration permission on the process; however, the common ancestor of
-the source cgroup C10 and the destination cgroup C00 is above the
-points of delegation and U0 would not have write access to its
-"cgroup.procs" and thus be denied with -EACCES.
-
-
-5. Other Changes
-
-5-1. [Un]populated Notification
-
-cgroup users often need a way to determine when a cgroup's
-subhierarchy becomes empty so that it can be cleaned up.  cgroup
-currently provides release_agent for it; unfortunately, this mechanism
-is riddled with issues.
-
-- It delivers events by forking and execing a userland binary
-  specified as the release_agent.  This is a long deprecated method of
-  notification delivery.  It's extremely heavy, slow and cumbersome to
-  integrate with larger infrastructure.
-
-- There is a single monitoring point at the root.  There's no way to
-  delegate management of a subtree.
-
-- The event isn't recursive.  It triggers when a cgroup doesn't have
-  any tasks or child cgroups.  Events for internal nodes trigger only
-  after all children are removed.  This again makes it impossible to
-  delegate management of a subtree.
-
-- Events are filtered from the kernel side.  A "notify_on_release"
-  file is used to subscribe to or suppress release events.  This is
-  unnecessarily complicated and probably done this way because event
-  delivery itself was expensive.
-
-Unified hierarchy implements "populated" field in "cgroup.events"
-interface file which can be used to monitor whether the cgroup's
-subhierarchy has tasks in it or not.  Its value is 0 if there is no
-task in the cgroup and its descendants; otherwise, 1.  poll and
-[id]notify events are triggered when the value changes.
-
-This is significantly lighter and simpler and trivially allows
-delegating management of subhierarchy - subhierarchy monitoring can
-block further propagation simply by putting itself or another process
-in the subhierarchy and monitor events that it's interested in from
-there without interfering with monitoring higher in the tree.
-
-In unified hierarchy, the release_agent mechanism is no longer
-supported and the interface files "release_agent" and
-"notify_on_release" do not exist.
-
-
-5-2. Other Core Changes
-
-- None of the mount options is allowed.
-
-- remount is disallowed.
-
-- rename(2) is disallowed.
-
-- The "tasks" file is removed.  Everything should be at process
-  granularity.  Use the "cgroup.procs" file instead.
-
-- The "cgroup.procs" file is not sorted.  pids will be unique unless
-  they got recycled in-between reads.
-
-- The "cgroup.clone_children" file is removed.
-
-- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
-  to before exiting.  If the cgroup is removed before the zombie is
-  reaped, " (deleted)" is appended to the path.
-
-
-5-3. Controller File Conventions
-
-5-3-1. Format
-
-In general, all controller files should be in one of the following
-formats whenever possible.
-
-- Values only files
-
-  VAL0 VAL1...\n
-
-- Flat keyed files
-
-  KEY0 VAL0\n
-  KEY1 VAL1\n
-  ...
-
-- Nested keyed files
-
-  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
-  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
-  ...
-
-For a writeable file, the format for writing should generally match
-reading; however, controllers may allow omitting later fields or
-implement restricted shortcuts for most common use cases.
-
-For both flat and nested keyed files, only the values for a single key
-can be written at a time.  For nested keyed files, the sub key pairs
-may be specified in any order and not all pairs have to be specified.
-
-
-5-3-2. Control Knobs
-
-- Settings for a single feature should generally be implemented in a
-  single file.
-
-- In general, the root cgroup should be exempt from resource control
-  and thus shouldn't have resource control knobs.
-
-- If a controller implements ratio based resource distribution, the
-  control knob should be named "weight" and have the range [1, 10000]
-  and 100 should be the default value.  The values are chosen to allow
-  enough and symmetric bias in both directions while keeping it
-  intuitive (the default is 100%).
-
-- If a controller implements an absolute resource guarantee and/or
-  limit, the control knobs should be named "min" and "max"
-  respectively.  If a controller implements best effort resource
-  guarantee and/or limit, the control knobs should be named "low" and
-  "high" respectively.
-
-  In the above four control files, the special token "max" should be
-  used to represent upward infinity for both reading and writing.
-
-- If a setting has configurable default value and specific overrides,
-  the default settings should be keyed with "default" and appear as
-  the first entry in the file.  Specific entries can use "default" as
-  its value to indicate inheritance of the default value.
-
-- For events which are not very high frequency, an interface file
-  "events" should be created which lists event key value pairs.
-  Whenever a notifiable event happens, file modified event should be
-  generated on the file.
-
-
-5-4. Per-Controller Changes
-
-5-4-1. io
-
-- blkio is renamed to io.  The interface is overhauled anyway.  The
-  new name is more in line with the other two major controllers, cpu
-  and memory, and better suited given that it may be used for cgroup
-  writeback without involving block layer.
-
-- Everything including stat is always hierarchical making separate
-  recursive stat files pointless and, as no internal node can have
-  tasks, leaf weights are meaningless.  The operation model is
-  simplified and the interface is overhauled accordingly.
-
-  io.stat
-
-       The stat file.  The reported stats are from the point where
-       bio's are issued to request_queue.  The stats are counted
-       independent of which policies are enabled.  Each line in the
-       file follows the following format.  More fields may later be
-       added at the end.
-
-         $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
-
-  io.weight
-
-       The weight setting, currently only available and effective if
-       cfq-iosched is in use for the target device.  The weight is
-       between 1 and 10000 and defaults to 100.  The first line
-       always contains the default weight in the following format to
-       use when per-device setting is missing.
-
-         default $WEIGHT
-
-       Subsequent lines list per-device weights of the following
-       format.
-
-         $MAJ:$MIN $WEIGHT
-
-       Writing "$WEIGHT" or "default $WEIGHT" changes the default
-       setting.  Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
-       while "$MAJ:$MIN default" clears it.
-
-       This file is available only on non-root cgroups.
-
-  io.max
-
-       The maximum bandwidth and/or iops setting, only available if
-       blk-throttle is enabled.  The file is of the following format.
-
-         $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
-
-       ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
-       read/write IOs per second.  "max" indicates no limit.  Writing
-       to the file follows the same format but the individual
-       settings may be omitted or specified in any order.
-
-       This file is available only on non-root cgroups.
-
-
-5-4-2. cpuset
-
-- Tasks are kept in empty cpusets after hotplug and take on the masks
-  of the nearest non-empty ancestor, instead of being moved to it.
-
-- A task can be moved into an empty cpuset, and again it takes on the
-  masks of the nearest non-empty ancestor.
-
-
-5-4-3. memory
-
-- use_hierarchy is on by default and the cgroup file for the flag is
-  not created.
-
-- The original lower boundary, the soft limit, is defined as a limit
-  that is per default unset.  As a result, the set of cgroups that
-  global reclaim prefers is opt-in, rather than opt-out.  The costs
-  for optimizing these mostly negative lookups are so high that the
-  implementation, despite its enormous size, does not even provide the
-  basic desirable behavior.  First off, the soft limit has no
-  hierarchical meaning.  All configured groups are organized in a
-  global rbtree and treated like equal peers, regardless where they
-  are located in the hierarchy.  This makes subtree delegation
-  impossible.  Second, the soft limit reclaim pass is so aggressive
-  that it not just introduces high allocation latencies into the
-  system, but also impacts system performance due to overreclaim, to
-  the point where the feature becomes self-defeating.
-
-  The memory.low boundary on the other hand is a top-down allocated
-  reserve.  A cgroup enjoys reclaim protection when it and all its
-  ancestors are below their low boundaries, which makes delegation of
-  subtrees possible.  Secondly, new cgroups have no reserve per
-  default and in the common case most cgroups are eligible for the
-  preferred reclaim pass.  This allows the new low boundary to be
-  efficiently implemented with just a minor addition to the generic
-  reclaim code, without the need for out-of-band data structures and
-  reclaim passes.  Because the generic reclaim code considers all
-  cgroups except for the ones running low in the preferred first
-  reclaim pass, overreclaim of individual groups is eliminated as
-  well, resulting in much better overall workload performance.
-
-- The original high boundary, the hard limit, is defined as a strict
-  limit that can not budge, even if the OOM killer has to be called.
-  But this generally goes against the goal of making the most out of
-  the available memory.  The memory consumption of workloads varies
-  during runtime, and that requires users to overcommit.  But doing
-  that with a strict upper limit requires either a fairly accurate
-  prediction of the working set size or adding slack to the limit.
-  Since working set size estimation is hard and error prone, and
-  getting it wrong results in OOM kills, most users tend to err on the
-  side of a looser limit and end up wasting precious resources.
-
-  The memory.high boundary on the other hand can be set much more
-  conservatively.  When hit, it throttles allocations by forcing them
-  into direct reclaim to work off the excess, but it never invokes the
-  OOM killer.  As a result, a high boundary that is chosen too
-  aggressively will not terminate the processes, but instead it will
-  lead to gradual performance degradation.  The user can monitor this
-  and make corrections until the minimal memory footprint that still
-  gives acceptable performance is found.
-
-  In extreme cases, with many concurrent allocations and a complete
-  breakdown of reclaim progress within the group, the high boundary
-  can be exceeded.  But even then it's mostly better to satisfy the
-  allocation from the slack available in other groups or the rest of
-  the system than killing the group.  Otherwise, memory.max is there
-  to limit this type of spillover and ultimately contain buggy or even
-  malicious applications.
-
-- The original control file names are unwieldy and inconsistent in
-  many different ways.  For example, the upper boundary hit count is
-  exported in the memory.failcnt file, but an OOM event count has to
-  be manually counted by listening to memory.oom_control events, and
-  lower boundary / soft limit events have to be counted by first
-  setting a threshold for that value and then counting those events.
-  Also, usage and limit files encode their units in the filename.
-  That makes the filenames very long, even though this is not
-  information that a user needs to be reminded of every time they type
-  out those names.
-
-  To address these naming issues, as well as to signal clearly that
-  the new interface carries a new configuration model, the naming
-  conventions in it necessarily differ from the old interface.
-
-- The original limit files indicate the state of an unset limit with a
-  Very High Number, and a configured limit can be unset by echoing -1
-  into those files.  But that very high number is implementation and
-  architecture dependent and not very descriptive.  And while -1 can
-  be understood as an underflow into the highest possible value, -2 or
-  -10M etc. do not work, so it's not consistent.
-
-  memory.low, memory.high, and memory.max will use the string "max" to
-  indicate and set the highest possible value.
-
-6. Planned Changes
-
-6-1. CAP for resource control
-
-Unified hierarchy will require one of the capabilities(7), which is
-yet to be decided, for all resource control related knobs.  Process
-organization operations - creation of sub-cgroups and migration of
-processes in sub-hierarchies may be delegated by changing the
-ownership and/or permissions on the cgroup directory and
-"cgroup.procs" interface file; however, all operations which affect
-resource control - writes to a "cgroup.subtree_control" file or any
-controller-specific knobs - will require an explicit CAP privilege.
-
-This, in part, is to prevent the cgroup interface from being
-inadvertently promoted to programmable API used by non-privileged
-binaries.  cgroup exposes various aspects of the system in ways which
-aren't properly abstracted for direct consumption by regular programs.
-This is an administration interface much closer to sysctl knobs than
-system calls.  Even the basic access model, being filesystem path
-based, isn't suitable for direct consumption.  There's no way to
-access "my cgroup" in a race-free way or make multiple operations
-atomic against migration to another cgroup.
-
-Another aspect is that, for better or for worse, the cgroup interface
-goes through far less scrutiny than regular interfaces for
-unprivileged userland.  The upside is that cgroup is able to expose
-useful features which may not be suitable for general consumption in a
-reasonable time frame.  It provides a relatively short path between
-internal details and userland-visible interface.  Of course, this
-shortcut comes with high risk.  We go through what we go through for
-general kernel APIs for good reasons.  It may end up leaking internal
-details in a way which can exert significant pain by locking the
-kernel into a contract that can't be maintained in a reasonable
-manner.
-
-Also, due to the specific nature, cgroup and its controllers don't
-tend to attract attention from a wide scope of developers.  cgroup's
-short history is already fraught with severely mis-designed
-interfaces, unnecessary commitments to and exposing of internal
-details, broken and dangerous implementations of various features.
-
-Keeping cgroup as an administration interface is both advantageous for
-its role and imperative given its nature.  Some of the cgroup features
-may make sense for unprivileged access.  If deemed justified, those
-must be further abstracted and implemented as a different interface,
-be it a system call or process-private filesystem, and survive through
-the scrutiny that any interface for general consumption is required to
-go through.
-
-Requiring CAP is not a complete solution but should serve as a
-significant deterrent against spraying cgroup usages in non-privileged
-programs.
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt

index 0cb44dc21f97ca7cfde5056ce7d44941d3439546..601256fe8c0dd99d2df3ff5a77b2cee23a852e5d 100644 (file)
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -45,21 +45,10 @@ Devices supporting OPPs must set their "operating-points-v2" property with
  phandle to an OPP table in their DT node. The OPP core will use this phandle to
  find the operating points for the device.
  
-Devices may want to choose OPP tables at runtime and so can provide a list of
-phandles here. But only *one* of them should be chosen at runtime. This must be
-accompanied by a corresponding "operating-points-names" property, to uniquely
-identify the OPP tables.
-
  If required, this can be extended for SoC vendor specific bindings. Such bindings
  should be documented as Documentation/devicetree/bindings/power/<vendor>-opp.txt
  and should have a compatible description like: "operating-points-v2-<vendor>".
  
-Optional properties:
-- operating-points-names: Names of OPP tables (required if multiple OPP
-  tables are present), to uniquely identify them. The same list must be present
-  for all the CPUs which are sharing clock/voltage rails and hence the OPP
-  tables.
-
  * OPP Table Node
  
  This describes the OPPs belonging to a device. This node can have following
@@ -100,6 +89,14 @@ Optional properties:
    Entries for multiple regulators must be present in the same order as
    regulators are specified in device's DT node.
  
+- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to
+  the above opp-microvolt property, but allows multiple voltage ranges to be
+  provided for the same OPP. At runtime, the platform can pick a <name> and
+  matching opp-microvolt-<name> property will be enabled for all OPPs. If the
+  platform doesn't pick a specific <name> or the <name> doesn't match with any
+  opp-microvolt-<name> properties, then opp-microvolt property shall be used, if
+  present.
+
  - opp-microamp: The maximum current drawn by the device in microamperes
    considering system specific parameters (such as transients, process, aging,
    maximum operating temperature range etc.) as necessary. This may be used to
@@ -112,6 +109,9 @@ Optional properties:
    for few regulators, then this should be marked as zero for them. If it isn't
    required for any regulator, then this property need not be present.
  
+- opp-microamp-<name>: Named opp-microamp property. Similar to
+  opp-microvolt-<name> property, but for microamp instead.
+
  - clock-latency-ns: Specifies the maximum possible transition latency (in
    nanoseconds) for switching to this OPP from any other OPP.
  
@@ -123,6 +123,26 @@ Optional properties:
  - opp-suspend: Marks the OPP to be used during device suspend. Only one OPP in
    the table should have this.
  
+- opp-supported-hw: This enables us to select only a subset of OPPs from the
+  larger OPP table, based on what version of the hardware we are running on. We
+  still can't have multiple nodes with the same opp-hz value in OPP table.
+
+  It's a user-defined array containing a hierarchy of hardware version numbers
+  supported by the OPP. For example, for a platform with a hierarchy of three
+  levels of versions (A, B and C), this field should look like <X Y Z>, where X
+  corresponds to version hierarchy A, Y corresponds to version hierarchy B and Z
+  corresponds to version hierarchy C.
+
+  Each level of hierarchy is represented by a 32 bit value, and so there can be
+  only 32 different supported versions per hierarchy, i.e. 1 bit per version. A
+  value of 0xFFFFFFFF will enable the OPP for all versions for that hierarchy
+  level. A value of 0x00000000 will disable the OPP completely, so it should
+  never be used.
+
+  If 32 values aren't sufficient for a version hierarchy, then that version
+  hierarchy can be contained in multiple 32 bit values, e.g. <X Y Z1 Z2> in the
+  above example, where Z1 & Z2 refer to the version hierarchy Z.
+
  - status: Marks the node enabled/disabled.
  
  Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
@@ -157,20 +177,20 @@ Example 1: Single cluster Dual-core ARM cortex A9, switch DVFS states together.
                 compatible = "operating-points-v2";
                 opp-shared;
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000 975000 985000>;
                         opp-microamp = <70000>;
                         clock-latency-ns = <300000>;
                         opp-suspend;
                 };
-               opp01 {
+               opp@1100000000 {
                         opp-hz = /bits/ 64 <1100000000>;
                         opp-microvolt = <980000 1000000 1010000>;
                         opp-microamp = <80000>;
                         clock-latency-ns = <310000>;
                 };
-               opp02 {
+               opp@1200000000 {
                         opp-hz = /bits/ 64 <1200000000>;
                         opp-microvolt = <1025000>;
                         clock-latency-ns = <290000>;
@@ -236,20 +256,20 @@ independently.
                  * independently.
                  */
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000 975000 985000>;
                         opp-microamp = <70000>;
                         clock-latency-ns = <300000>;
                         opp-suspend;
                 };
-               opp01 {
+               opp@1100000000 {
                         opp-hz = /bits/ 64 <1100000000>;
                         opp-microvolt = <980000 1000000 1010000>;
                         opp-microamp = <80000>;
                         clock-latency-ns = <310000>;
                 };
-               opp02 {
+               opp@1200000000 {
                         opp-hz = /bits/ 64 <1200000000>;
                         opp-microvolt = <1025000>;
                         opp-microamp = <90000;
@@ -312,20 +332,20 @@ DVFS state together.
                 compatible = "operating-points-v2";
                 opp-shared;
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000 975000 985000>;
                         opp-microamp = <70000>;
                         clock-latency-ns = <300000>;
                         opp-suspend;
                 };
-               opp01 {
+               opp@1100000000 {
                         opp-hz = /bits/ 64 <1100000000>;
                         opp-microvolt = <980000 1000000 1010000>;
                         opp-microamp = <80000>;
                         clock-latency-ns = <310000>;
                 };
-               opp02 {
+               opp@1200000000 {
                         opp-hz = /bits/ 64 <1200000000>;
                         opp-microvolt = <1025000>;
                         opp-microamp = <90000>;
@@ -338,20 +358,20 @@ DVFS state together.
                 compatible = "operating-points-v2";
                 opp-shared;
  
-               opp10 {
+               opp@1300000000 {
                         opp-hz = /bits/ 64 <1300000000>;
                         opp-microvolt = <1045000 1050000 1055000>;
                         opp-microamp = <95000>;
                         clock-latency-ns = <400000>;
                         opp-suspend;
                 };
-               opp11 {
+               opp@1400000000 {
                         opp-hz = /bits/ 64 <1400000000>;
                         opp-microvolt = <1075000>;
                         opp-microamp = <100000>;
                         clock-latency-ns = <400000>;
                 };
-               opp12 {
+               opp@1500000000 {
                         opp-hz = /bits/ 64 <1500000000>;
                         opp-microvolt = <1010000 1100000 1110000>;
                         opp-microamp = <95000>;
@@ -378,7 +398,7 @@ Example 4: Handling multiple regulators
                 compatible = "operating-points-v2";
                 opp-shared;
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000>, /* Supply 0 */
                                         <960000>, /* Supply 1 */
@@ -391,7 +411,7 @@ Example 4: Handling multiple regulators
  
                 /* OR */
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000 975000 985000>, /* Supply 0 */
                                         <960000 965000 975000>, /* Supply 1 */
@@ -404,7 +424,7 @@ Example 4: Handling multiple regulators
  
                 /* OR */
  
-               opp00 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
                         opp-microvolt = <970000 975000 985000>, /* Supply 0 */
                                         <960000 965000 975000>, /* Supply 1 */
@@ -417,7 +437,8 @@ Example 4: Handling multiple regulators
         };
  };
  
-Example 5: Multiple OPP tables
+Example 5: opp-supported-hw
+(example: three level hierarchy of versions: cuts, substrate and process)
  
  / {
         cpus {
@@ -426,40 +447,73 @@ Example 5: Multiple OPP tables
                         ...
  
                         cpu-supply = <&cpu_supply>
-                       operating-points-v2 = <&cpu0_opp_table_slow>, <&cpu0_opp_table_fast>;
-                       operating-points-names = "slow", "fast";
+                       operating-points-v2 = <&cpu0_opp_table_slow>;
                 };
         };
  
-       cpu0_opp_table_slow: opp_table_slow {
+       cpu0_opp_table_slow: opp_table {
                 compatible = "operating-points-v2";
                 status = "okay";
                 opp-shared;
  
-               opp00 {
+               opp@600000000 {
+                       /*
+                        * Supports all substrate and process versions for 0xF
+                        * cuts, i.e. only first four cuts.
+                        */
+                       opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF>;
                         opp-hz = /bits/ 64 <600000000>;
+                       opp-microvolt = <900000 915000 925000>;
                         ...
                 };
  
-               opp01 {
+               opp@800000000 {
+                       /*
+                        * Supports:
+                        * - cuts: only one, 6th cut (represented by 6th bit).
+                        * - substrate: supports 16 different substrate versions
+                        * - process: supports 9 different process versions
+                        */
+                       opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0>;
                         opp-hz = /bits/ 64 <800000000>;
+                       opp-microvolt = <900000 915000 925000>;
                         ...
                 };
         };
+};
+
+Example 6: opp-microvolt-<name>, opp-microamp-<name>:
+(example: device with two possible microvolt ranges: slow and fast)
  
-       cpu0_opp_table_fast: opp_table_fast {
+/ {
+       cpus {
+               cpu@0 {
+                       compatible = "arm,cortex-a7";
+                       ...
+
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+       };
+
+       cpu0_opp_table: opp_table0 {
                 compatible = "operating-points-v2";
-               status = "okay";
                 opp-shared;
  
-               opp10 {
+               opp@1000000000 {
                         opp-hz = /bits/ 64 <1000000000>;
-                       ...
+                       opp-microvolt-slow = <900000 915000 925000>;
+                       opp-microvolt-fast = <970000 975000 985000>;
+                       opp-microamp-slow =  <70000>;
+                       opp-microamp-fast =  <71000>;
                 };
  
-               opp11 {
-                       opp-hz = /bits/ 64 <1100000000>;
-                       ...
+               opp@1200000000 {
+                       opp-hz = /bits/ 64 <1200000000>;
+                       opp-microvolt-slow = <900000 915000 925000>, /* Supply vcc0 */
+                                             <910000 925000 935000>; /* Supply vcc1 */
+                       opp-microvolt-fast = <970000 975000 985000>, /* Supply vcc0 */
+                                            <960000 965000 975000>; /* Supply vcc1 */
+                       opp-microamp =  <70000>; /* Will be used for both slow/fast */
                 };
         };
  };
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt

index e63316239938164be150d0a9d73dac94fa9b4b92..4199ffecc0ff06bde4b26fa5f09d1fd0a9a08601 100644 (file)
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -9,7 +9,7 @@
      |       alpha: |  ..  |
      |         arc: | TODO |
      |         arm: |  ok  |
-    |       arm64: |  ..  |
+    |       arm64: |  ok  |
      |       avr32: | TODO |
      |    blackfin: | TODO |
      |         c6x: | TODO |
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt

index af6816bccb439d76a7f80d43f97f4a95e7d604c2..df1d1f3c9af290aa6ffaa1584f71ba6025e54c4e 100644 (file)
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -9,7 +9,7 @@
      |       alpha: | TODO |
      |         arc: | TODO |
      |         arm: | TODO |
-    |       arm64: | TODO |
+    |       arm64: |  ok  |
      |       avr32: | TODO |
      |    blackfin: | TODO |
      |         c6x: | TODO |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index c360f80c347383aa1465507bb8cd90a118a25277..5ee92cc9e57801cd2a61832f819dff88d784c67a 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3416,6 +3416,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
  
         ro              [KNL] Mount root device read-only on boot
  
+       rodata=         [KNL]
+               on      Mark read-only kernel memory as read-only (default).
+               off     Leave read-only kernel memory writable for debugging.
+
         root=           [KNL] Root filesystem
                         See name_to_dev_t comment in init/do_mounts.c.
  
diff --git a/Documentation/trace/coresight.txt b/Documentation/trace/coresight.txt

index 0a5c3290e7324f09ab01dac65f6825d22e35ea09..a33c88cd5d1d62f3fb4e377090d3d75615242b55 100644 (file)
--- a/Documentation/trace/coresight.txt
+++ b/Documentation/trace/coresight.txt
@@ -190,8 +190,8 @@ expected to be accessed and controlled using those entries.
  Last but not least, "struct module *owner" is expected to be set to reflect
  the information carried in "THIS_MODULE".
  
-How to use
-----------
+How to use the tracer modules
+-----------------------------
  
  Before trace collection can start, a coresight sink needs to be identify.
  There is no limit on the amount of sinks (nor sources) that can be enabled at
@@ -297,3 +297,36 @@ Info                                    Tracing enabled
  Instruction     13570831        0x8026B584      E28DD00C        false   ADD      sp,sp,#0xc
  Instruction     0       0x8026B588      E8BD8000        true    LDM      sp!,{pc}
  Timestamp                                       Timestamp: 17107041535
+
+How to use the STM module
+-------------------------
+
+Using the System Trace Macrocell module is the same as using the tracers - the
+only difference is that clients are driving the trace capture rather than the
+program flow through the code.
+
+As with any other CoreSight component, specifics about the STM tracer can be
+found in sysfs with more information on each entry being found in [1]:
+
+root@genericarmv8:~# ls /sys/bus/coresight/devices/20100000.stm
+enable_source   hwevent_select  port_enable     subsystem       uevent
+hwevent_enable  mgmt            port_select     traceid
+root@genericarmv8:~#
+
+Like any other source, a sink needs to be identified and the STM enabled before
+being used:
+
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20010000.etf/enable_sink
+root@genericarmv8:~# echo 1 > /sys/bus/coresight/devices/20100000.stm/enable_source
+
+From there user space applications can request and use channels using the devfs
+interface provided for that purpose by the generic STM API:
+
+root@genericarmv8:~# ls -l /dev/20100000.stm
+crw-------    1 root     root       10,  61 Jan  3 18:11 /dev/20100000.stm
+root@genericarmv8:~#
+
+Details on how to use the generic STM API can be found here [2].
+
+[1]. Documentation/ABI/testing/sysfs-bus-coresight-devices-stm
+[2]. Documentation/trace/stm.txt
diff --git a/MAINTAINERS b/MAINTAINERS

index ab65bbecb159279675cf5bb63e64f1feabf13f44..7875f7b71546b1cdead3caad96a077fe2abd7c64 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1007,6 +1007,10 @@ F:       drivers/hwtracing/coresight/*
  F:     Documentation/trace/coresight.txt
  F:     Documentation/devicetree/bindings/arm/coresight.txt
  F:     Documentation/ABI/testing/sysfs-bus-coresight-devices-*
+F:     tools/perf/arch/arm/util/pmu.c
+F:     tools/perf/arch/arm/util/auxtrace.c
+F:     tools/perf/arch/arm/util/cs_etm.c
+F:     tools/perf/arch/arm/util/cs_etm.h
  
  ARM/CORGI MACHINE SUPPORT
  M:     Richard Purdie <rpurdie@rpsys.net>
@@ -9356,6 +9360,7 @@ F:        drivers/mmc/host/dw_mmc*
  SYSTEM TRACE MODULE CLASS
  M:     Alexander Shishkin <alexander.shishkin@linux.intel.com>
  S:     Maintained
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/ash/stm.git
  F:     Documentation/trace/stm.txt
  F:     drivers/hwtracing/stm/
  F:     include/linux/stm.h
diff --git a/arch/Kconfig b/arch/Kconfig

index 4e949e58b1928363232abac3a69a25413e90652e..d4d9845530f1df6cb70117a1eca282d31fb4acbb 100644 (file)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -423,6 +423,15 @@ config CC_STACKPROTECTOR_STRONG
  
  endchoice
  
+config HAVE_ARCH_WITHIN_STACK_FRAMES
+       bool
+       help
+         An architecture should select this if it can walk the kernel stack
+         frames to determine if an object is part of either the arguments
+         or local variables (i.e. that it excludes saved return addresses,
+         and similar) by implementing an inline arch_within_stack_frames(),
+         which is used by CONFIG_HARDENED_USERCOPY.
+
  config HAVE_CONTEXT_TRACKING
         bool
         help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig

index 34e1569a11ee322a0a020bdfe0c9b801b6b58c12..9049ac023bee94dd6458b3bf2a4bebcbe9d5f593 100644 (file)
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -35,8 +35,10 @@ config ARM
         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
         select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32
         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32
+       select HAVE_ARCH_HARDENED_USERCOPY
         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
         select HAVE_ARCH_TRACEHOOK
+       select HAVE_ARM_SMCCC if CPU_V7
         select HAVE_BPF_JIT
         select HAVE_CC_STACKPROTECTOR
         select HAVE_CONTEXT_TRACKING
@@ -1422,8 +1424,7 @@ config BIG_LITTLE
  
  config BL_SWITCHER
         bool "big.LITTLE switcher support"
-       depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
-       select ARM_CPU_SUSPEND
+       depends on BIG_LITTLE && MCPM && HOTPLUG_CPU && ARM_GIC
         select CPU_PM
         help
           The big.LITTLE "switcher" provides the core functionality to
@@ -1481,7 +1482,7 @@ config HOTPLUG_CPU
  
  config ARM_PSCI
         bool "Support for the ARM Power State Coordination Interface (PSCI)"
-       depends on CPU_V7
+       depends on HAVE_ARM_SMCCC
         select ARM_PSCI_FW
         help
           Say Y here if you want Linux to communicate with system firmware
@@ -2140,7 +2141,8 @@ config ARCH_SUSPEND_POSSIBLE
         def_bool y
  
  config ARM_CPU_SUSPEND
-       def_bool PM_SLEEP
+       def_bool PM_SLEEP || BL_SWITCHER || ARM_PSCI_FW
+       depends on ARCH_SUSPEND_POSSIBLE
  
  config ARCH_HIBERNATION_POSSIBLE
         bool
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h

index d5525bfc7e3e61879d278ae08b446e185c206982..9156fc303afd8d278671c6d4d277d866fdd2ad7b 100644 (file)
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
  #endif
  
  #ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
  #else
diff --git a/arch/arm/include/asm/cpuidle.h b/arch/arm/include/asm/cpuidle.h

index 0f842492490280b9d0b50fd1b25935a51b5a4c9d..3848259bebf85786d39d4212d74f2e1646b98a44 100644 (file)
--- a/arch/arm/include/asm/cpuidle.h
+++ b/arch/arm/include/asm/cpuidle.h
@@ -30,7 +30,7 @@ static inline int arm_cpuidle_simple_enter(struct cpuidle_device *dev,
  struct device_node;
  
  struct cpuidle_ops {
-       int (*suspend)(int cpu, unsigned long arg);
+       int (*suspend)(unsigned long arg);
         int (*init)(struct device_node *, int cpu);
  };
  
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h

index dc641ddf0784304434635080b5732ee215ab588f..e22089fb44dc86b7ed2fdb175bc6ec7b47ee4001 100644 (file)
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -19,6 +19,7 @@
  #ifndef __ARM_KVM_ARM_H__
  #define __ARM_KVM_ARM_H__
  
+#include <linux/const.h>
  #include <linux/types.h>
  
  /* Hyp Configuration Register (HCR) bits */
@@ -132,10 +133,9 @@
   * space.
   */
  #define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE  (1ULL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK  (KVM_PHYS_SIZE - 1ULL)
-#define PTRS_PER_S2_PGD        (1ULL << (KVM_PHYS_SHIFT - 30))
-#define S2_PGD_ORDER   get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
+#define KVM_PHYS_SIZE  (_AC(1, ULL) << KVM_PHYS_SHIFT)
+#define KVM_PHYS_MASK  (KVM_PHYS_SIZE - _AC(1, ULL))
+#define PTRS_PER_S2_PGD        (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
  
  /* Virtualization Translation Control Register (VTCR) bits */
  #define VTCR_SH0       (3 << 12)
@@ -162,17 +162,17 @@
  #define VTTBR_X                (5 - KVM_T0SZ)
  #endif
  #define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
-#define VTTBR_BADDR_MASK  (((1LLU << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
-#define VTTBR_VMID_SHIFT  (48LLU)
-#define VTTBR_VMID_MASK          (0xffLLU << VTTBR_VMID_SHIFT)
+#define VTTBR_BADDR_MASK  (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
+#define VTTBR_VMID_SHIFT  _AC(48, ULL)
+#define VTTBR_VMID_MASK(size)  (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
  
  /* Hyp Syndrome Register (HSR) bits */
  #define HSR_EC_SHIFT   (26)
-#define HSR_EC         (0x3fU << HSR_EC_SHIFT)
-#define HSR_IL         (1U << 25)
+#define HSR_EC         (_AC(0x3f, UL) << HSR_EC_SHIFT)
+#define HSR_IL         (_AC(1, UL) << 25)
  #define HSR_ISS                (HSR_IL - 1)
  #define HSR_ISV_SHIFT  (24)
-#define HSR_ISV                (1U << HSR_ISV_SHIFT)
+#define HSR_ISV                (_AC(1, UL) << HSR_ISV_SHIFT)
  #define HSR_SRT_SHIFT  (16)
  #define HSR_SRT_MASK   (0xf << HSR_SRT_SHIFT)
  #define HSR_FSC                (0x3f)
@@ -180,9 +180,9 @@
  #define HSR_SSE                (1 << 21)
  #define HSR_WNR                (1 << 6)
  #define HSR_CV_SHIFT   (24)
-#define HSR_CV         (1U << HSR_CV_SHIFT)
+#define HSR_CV         (_AC(1, UL) << HSR_CV_SHIFT)
  #define HSR_COND_SHIFT (20)
-#define HSR_COND       (0xfU << HSR_COND_SHIFT)
+#define HSR_COND       (_AC(0xf, UL) << HSR_COND_SHIFT)
  
  #define FSC_FAULT      (0x04)
  #define FSC_ACCESS     (0x08)
@@ -210,13 +210,13 @@
  #define HSR_EC_DABT    (0x24)
  #define HSR_EC_DABT_HYP        (0x25)
  
-#define HSR_WFI_IS_WFE         (1U << 0)
+#define HSR_WFI_IS_WFE         (_AC(1, UL) << 0)
  
-#define HSR_HVC_IMM_MASK       ((1UL << 16) - 1)
+#define HSR_HVC_IMM_MASK       ((_AC(1, UL) << 16) - 1)
  
-#define HSR_DABT_S1PTW         (1U << 7)
-#define HSR_DABT_CM            (1U << 8)
-#define HSR_DABT_EA            (1U << 9)
+#define HSR_DABT_S1PTW         (_AC(1, UL) << 7)
+#define HSR_DABT_CM            (_AC(1, UL) << 8)
+#define HSR_DABT_EA            (_AC(1, UL) << 9)
  
  #define kvm_arm_exception_type \
         {0, "RESET" },          \
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h

index 194c91b610ffecfd4071da89d16b923c614bf68d..c35c349da06983b5eee05bc8bca52e526ed1bc52 100644 (file)
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -79,6 +79,8 @@
  #define rr_lo_hi(a1, a2) a1, a2
  #endif
  
+#define kvm_ksym_ref(kva)      (kva)
+
  #ifndef __ASSEMBLY__
  struct kvm;
  struct kvm_vcpu;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h

index 6692982c9b575db476bc12a37e2d69b27c00d9fb..bedaf65c0ff96fa3f7d13938f59ed1faa9c94033 100644 (file)
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -214,6 +214,19 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
         kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
  }
  
+static inline void __cpu_init_stage2(void)
+{
+}
+
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+                                       phys_addr_t phys_idmap_start)
+{
+       /*
+        * TODO
+        * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
+        */
+}
+
  static inline int kvm_arch_dev_ioctl_check_extension(long ext)
  {
         return 0;
@@ -226,7 +239,6 @@ void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
  
  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
  
-static inline void kvm_arch_hardware_disable(void) {}
  static inline void kvm_arch_hardware_unsetup(void) {}
  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h

index 405aa18833073b88b52103363b54d534d9c69263..c7ba9a42e85720dcbf78dae2ef3698f8e7b0c851 100644 (file)
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -66,6 +66,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
  phys_addr_t kvm_mmu_get_httbr(void);
  phys_addr_t kvm_mmu_get_boot_httbr(void);
  phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
  int kvm_mmu_init(void);
  void kvm_clear_hyp_idmap(void);
  
@@ -279,6 +280,11 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
                                        pgd_t *merged_hyp_pgd,
                                        unsigned long hyp_idmap_start) { }
  
+static inline unsigned int kvm_get_vmid_bits(void)
+{
+       return 8;
+}
+
  #endif /* !__ASSEMBLY__ */
  
  #endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h

index 51622ba7c4a66a91494ecc49be4520fd4b14df88..d3c0c23703b6f90cd2863323462dfbb8da9f13e6 100644 (file)
--- a/arch/arm/include/asm/ptrace.h
+++ b/arch/arm/include/asm/ptrace.h
@@ -121,7 +121,6 @@ extern unsigned long profile_pc(struct pt_regs *regs);
  #define MAX_REG_OFFSET (offsetof(struct pt_regs, ARM_ORIG_r0))
  
  extern int regs_query_register_offset(const char *name);
-extern const char *regs_query_register_name(unsigned int offset);
  extern bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr);
  extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                                unsigned int n);
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h

index 35c9db857ebe9c7d53715ec42518a6e9fbe1dc6e..7fb59199c6bbbebc03fafbff1c8083aa5892de91 100644 (file)
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
  static inline unsigned long __must_check
  __copy_from_user(void *to, const void __user *from, unsigned long n)
  {
-       unsigned int __ua_flags = uaccess_save_and_enable();
+       unsigned int __ua_flags;
+
+       check_object_size(to, n, false);
+       __ua_flags = uaccess_save_and_enable();
         n = arm_copy_from_user(to, from, n);
         uaccess_restore(__ua_flags);
         return n;
@@ -511,11 +514,15 @@ static inline unsigned long __must_check
  __copy_to_user(void __user *to, const void *from, unsigned long n)
  {
  #ifndef CONFIG_UACCESS_WITH_MEMCPY
-       unsigned int __ua_flags = uaccess_save_and_enable();
+       unsigned int __ua_flags;
+
+       check_object_size(from, n, true);
+       __ua_flags = uaccess_save_and_enable();
         n = arm_copy_to_user(to, from, n);
         uaccess_restore(__ua_flags);
         return n;
  #else
+       check_object_size(from, n, true);
         return arm_copy_to_user(to, from, n);
  #endif
  }
diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h

index 4371f45c578401c7f233e565dbb5fd36d9d59d52..d4ceaf5f299b8d03d6e638a3ea5f72fb5b60a93e 100644 (file)
--- a/arch/arm/include/asm/virt.h
+++ b/arch/arm/include/asm/virt.h
@@ -74,6 +74,15 @@ static inline bool is_hyp_mode_mismatched(void)
  {
         return !!(__boot_cpu_mode & BOOT_CPU_MODE_MISMATCH);
  }
+
+static inline bool is_kernel_in_hyp_mode(void)
+{
+       return false;
+}
+
+/* The section containing the hypervisor text */
+extern char __hyp_text_start[];
+extern char __hyp_text_end[];
  #endif
  
  #endif /* __ASSEMBLY__ */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile

index af9e59bf3831b9fd648d095c2070209b1e97677d..80856def246518e05015591f73d3bd0b4993bf8b 100644 (file)
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -88,8 +88,9 @@ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
  
  obj-$(CONFIG_ARM_VIRT_EXT)     += hyp-stub.o
  ifeq ($(CONFIG_ARM_PSCI),y)
-obj-y                          += psci-call.o
  obj-$(CONFIG_SMP)              += psci_smp.o
  endif
  
+obj-$(CONFIG_HAVE_ARM_SMCCC)   += smccc-call.o
+
  extra-y := $(head-y) vmlinux.lds
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c

index f89811fb9a55f3a490c3633ef99ef52745c58129..7e45f69a0ddc9d0ef3b04c145195cf1b4ad7d84b 100644 (file)
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -16,6 +16,7 @@
  #include <linux/syscalls.h>
  #include <linux/uaccess.h>
  #include <linux/io.h>
+#include <linux/arm-smccc.h>
  
  #include <asm/checksum.h>
  #include <asm/ftrace.h>
@@ -175,3 +176,8 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
  EXPORT_SYMBOL(__pv_phys_pfn_offset);
  EXPORT_SYMBOL(__pv_offset);
  #endif
+
+#ifdef CONFIG_HAVE_ARM_SMCCC
+EXPORT_SYMBOL(arm_smccc_smc);
+EXPORT_SYMBOL(arm_smccc_hvc);
+#endif
diff --git a/arch/arm/kernel/cpuidle.c b/arch/arm/kernel/cpuidle.c

index 318da33465f413f8c207eb2dd9b3766e53ab5ad3..703926e7007b4e0000b689006722780a02588e04 100644 (file)
--- a/arch/arm/kernel/cpuidle.c
+++ b/arch/arm/kernel/cpuidle.c
@@ -56,7 +56,7 @@ int arm_cpuidle_suspend(int index)
         int cpu = smp_processor_id();
  
         if (cpuidle_ops[cpu].suspend)
-               ret = cpuidle_ops[cpu].suspend(cpu, index);
+               ret = cpuidle_ops[cpu].suspend(index);
  
         return ret;
  }
diff --git a/arch/arm/kernel/psci-call.S b/arch/arm/kernel/psci-call.S

deleted file mode 100644 (file)

index a78e9e1..0000000
--- a/arch/arm/kernel/psci-call.S
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2015 ARM Limited
- *
- * Author: Mark Rutland <mark.rutland@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/opcodes-sec.h>
-#include <asm/opcodes-virt.h>
-
-/* int __invoke_psci_fn_hvc(u32 function_id, u32 arg0, u32 arg1, u32 arg2) */
-ENTRY(__invoke_psci_fn_hvc)
-       __HVC(0)
-       bx      lr
-ENDPROC(__invoke_psci_fn_hvc)
-
-/* int __invoke_psci_fn_smc(u32 function_id, u32 arg0, u32 arg1, u32 arg2) */
-ENTRY(__invoke_psci_fn_smc)
-       __SMC(0)
-       bx      lr
-ENDPROC(__invoke_psci_fn_smc)
diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S

new file mode 100644 (file)

index 0000000..2e48b67
--- /dev/null
+++ b/arch/arm/kernel/smccc-call.S
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/linkage.h>
+
+#include <asm/opcodes-sec.h>
+#include <asm/opcodes-virt.h>
+#include <asm/unwind.h>
+
+       /*
+        * Wrap c macros in asm macros to delay expansion until after the
+        * SMCCC asm macro is expanded.
+        */
+       .macro SMCCC_SMC
+       __SMC(0)
+       .endm
+
+       .macro SMCCC_HVC
+       __HVC(0)
+       .endm
+
+       .macro SMCCC instr
+UNWIND(        .fnstart)
+       mov     r12, sp
+       push    {r4-r7}
+UNWIND(        .save   {r4-r7})
+       ldm     r12, {r4-r7}
+       \instr
+       pop     {r4-r7}
+       ldr     r12, [sp, #(4 * 4)]
+       stm     r12, {r0-r3}
+       bx      lr
+UNWIND(        .fnend)
+       .endm
+
+/*
+ * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
+ *               unsigned long a3, unsigned long a4, unsigned long a5,
+ *               unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_smc)
+       SMCCC SMCCC_SMC
+ENDPROC(arm_smccc_smc)
+
+/*
+ * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
+ *               unsigned long a3, unsigned long a4, unsigned long a5,
+ *               unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_hvc)
+       SMCCC SMCCC_HVC
+ENDPROC(arm_smccc_hvc)
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c

index d7bef2144760c3f02fd5ba0efb3fafe68dd7b79c..4cddf20cdb824394d5368bb5cd10a4eadff2d06c 100644 (file)
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -16,7 +16,6 @@
   * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
   */
  
-#include <linux/cpu.h>
  #include <linux/cpu_pm.h>
  #include <linux/errno.h>
  #include <linux/err.h>
@@ -44,6 +43,7 @@
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_coproc.h>
  #include <asm/kvm_psci.h>
+#include <asm/sections.h>
  
  #ifdef REQUIRES_VIRT
  __asm__(".arch_extension       virt");
@@ -58,9 +58,14 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
  
  /* The VMID used in the VTTBR */
  static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u8 kvm_next_vmid;
+static u32 kvm_next_vmid;
+static unsigned int kvm_vmid_bits __read_mostly;
  static DEFINE_SPINLOCK(kvm_vmid_lock);
  
+static bool vgic_present;
+
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+
  static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
  {
         BUG_ON(preemptible());
@@ -85,11 +90,6 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
         return &kvm_arm_running_vcpu;
  }
  
-int kvm_arch_hardware_enable(void)
-{
-       return 0;
-}
-
  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  {
         return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -132,7 +132,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
         kvm->arch.vmid_gen = 0;
  
         /* The maximum number of VCPUs is limited by the host's GIC model */
-       kvm->arch.max_vcpus = kvm_vgic_get_max_vcpus();
+       kvm->arch.max_vcpus = vgic_present ?
+                               kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
  
         return ret;
  out_free_stage2_pgd:
@@ -170,6 +171,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         int r;
         switch (ext) {
         case KVM_CAP_IRQCHIP:
+               r = vgic_present;
+               break;
         case KVM_CAP_IOEVENTFD:
         case KVM_CAP_DEVICE_CTRL:
         case KVM_CAP_USER_MEMORY:
@@ -431,11 +434,12 @@ static void update_vttbr(struct kvm *kvm)
         kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
         kvm->arch.vmid = kvm_next_vmid;
         kvm_next_vmid++;
+       kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
  
         /* update vttbr to be used with the new vmid */
         pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
         BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
-       vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
+       vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
         kvm->arch.vttbr = pgd_phys | vmid;
  
         spin_unlock(&kvm_vmid_lock);
@@ -911,6 +915,8 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
  
         switch (dev_id) {
         case KVM_ARM_DEVICE_VGIC_V2:
+               if (!vgic_present)
+                       return -ENXIO;
                 return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
         default:
                 return -ENODEV;
@@ -925,6 +931,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
  
         switch (ioctl) {
         case KVM_CREATE_IRQCHIP: {
+               if (!vgic_present)
+                       return -ENXIO;
                 return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
         }
         case KVM_ARM_SET_DEVICE_ADDR: {
@@ -967,43 +975,99 @@ static void cpu_init_hyp_mode(void *dummy)
         pgd_ptr = kvm_mmu_get_httbr();
         stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
         hyp_stack_ptr = stack_page + PAGE_SIZE;
-       vector_ptr = (unsigned long)__kvm_hyp_vector;
+       vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
  
         __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_stage2();
  
         kvm_arm_init_debug();
  }
  
-static int hyp_init_cpu_notify(struct notifier_block *self,
-                              unsigned long action, void *cpu)
+static void cpu_hyp_reinit(void)
  {
-       switch (action) {
-       case CPU_STARTING:
-       case CPU_STARTING_FROZEN:
+       if (is_kernel_in_hyp_mode()) {
+               /*
+                * __cpu_init_stage2() is safe to call even if the PM
+                * event was cancelled before the CPU was reset.
+                */
+               __cpu_init_stage2();
+       } else {
                 if (__hyp_get_vectors() == hyp_default_vectors)
                         cpu_init_hyp_mode(NULL);
-               break;
         }
+}
+
+static void cpu_hyp_reset(void)
+{
+       phys_addr_t boot_pgd_ptr;
+       phys_addr_t phys_idmap_start;
+
+       if (!is_kernel_in_hyp_mode()) {
+               boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+               phys_idmap_start = kvm_get_idmap_start();
+
+               __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
+       }
+}
  
-       return NOTIFY_OK;
+static void _kvm_arch_hardware_enable(void *discard)
+{
+       if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+               cpu_hyp_reinit();
+               __this_cpu_write(kvm_arm_hardware_enabled, 1);
+       }
  }
  
-static struct notifier_block hyp_init_cpu_nb = {
-       .notifier_call = hyp_init_cpu_notify,
-};
+int kvm_arch_hardware_enable(void)
+{
+       _kvm_arch_hardware_enable(NULL);
+       return 0;
+}
+
+static void _kvm_arch_hardware_disable(void *discard)
+{
+       if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+               cpu_hyp_reset();
+               __this_cpu_write(kvm_arm_hardware_enabled, 0);
+       }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+       _kvm_arch_hardware_disable(NULL);
+}
  
  #ifdef CONFIG_CPU_PM
  static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
                                     unsigned long cmd,
                                     void *v)
  {
-       if (cmd == CPU_PM_EXIT &&
-           __hyp_get_vectors() == hyp_default_vectors) {
-               cpu_init_hyp_mode(NULL);
+       /*
+        * kvm_arm_hardware_enabled is left with its old value over
+        * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+        * re-enable hyp.
+        */
+       switch (cmd) {
+       case CPU_PM_ENTER:
+               if (__this_cpu_read(kvm_arm_hardware_enabled))
+                       /*
+                        * don't update kvm_arm_hardware_enabled here
+                        * so that the hardware will be re-enabled
+                        * when we resume. See below.
+                        */
+                       cpu_hyp_reset();
+
+               return NOTIFY_OK;
+       case CPU_PM_EXIT:
+               if (__this_cpu_read(kvm_arm_hardware_enabled))
+                       /* The hardware was enabled before suspend. */
+                       cpu_hyp_reinit();
+
                 return NOTIFY_OK;
-       }
  
-       return NOTIFY_DONE;
+       default:
+               return NOTIFY_DONE;
+       }
  }
  
  static struct notifier_block hyp_init_cpu_pm_nb = {
@@ -1020,6 +1084,91 @@ static inline void hyp_cpu_pm_init(void)
  }
  #endif
  
+static void teardown_common_resources(void)
+{
+       free_percpu(kvm_host_cpu_state);
+}
+
+static int init_common_resources(void)
+{
+       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+       if (!kvm_host_cpu_state) {
+               kvm_err("Cannot allocate host CPU state\n");
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int init_subsystems(void)
+{
+       int err = 0;
+
+       /*
+        * Enable hardware so that subsystem initialisation can access EL2.
+        */
+       on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+
+       /*
+        * Register CPU low-power notifier
+        */
+       hyp_cpu_pm_init();
+
+       /*
+        * Init HYP view of VGIC
+        */
+       err = kvm_vgic_hyp_init();
+       switch (err) {
+       case 0:
+               vgic_present = true;
+               break;
+       case -ENODEV:
+       case -ENXIO:
+               vgic_present = false;
+               err = 0;
+               break;
+       default:
+               goto out;
+       }
+
+       /*
+        * Init HYP architected timer support
+        */
+       err = kvm_timer_hyp_init();
+       if (err)
+               goto out;
+
+       kvm_perf_init();
+       kvm_coproc_table_init();
+
+out:
+       on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+       return err;
+}
+
+static void teardown_hyp_mode(void)
+{
+       int cpu;
+
+       if (is_kernel_in_hyp_mode())
+               return;
+
+       free_hyp_pgds();
+       for_each_possible_cpu(cpu)
+               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+static int init_vhe_mode(void)
+{
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
+       kvm_info("VHE mode initialized successfully\n");
+       return 0;
+}
+
  /**
   * Inits Hyp-mode on all online CPUs
   */
@@ -1050,7 +1199,7 @@ static int init_hyp_mode(void)
                 stack_page = __get_free_page(GFP_KERNEL);
                 if (!stack_page) {
                         err = -ENOMEM;
-                       goto out_free_stack_pages;
+                       goto out_err;
                 }
  
                 per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
@@ -1059,10 +1208,18 @@ static int init_hyp_mode(void)
         /*
          * Map the Hyp-code called directly from the host
          */
-       err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
+       err = create_hyp_mappings(kvm_ksym_ref(__kvm_hyp_code_start),
+                                 kvm_ksym_ref(__kvm_hyp_code_end));
         if (err) {
                 kvm_err("Cannot map world-switch code\n");
-               goto out_free_mappings;
+               goto out_err;
+       }
+
+       err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
+                                 kvm_ksym_ref(__end_rodata));
+       if (err) {
+               kvm_err("Cannot map rodata section\n");
+               goto out_err;
         }
  
         /*
@@ -1074,20 +1231,10 @@ static int init_hyp_mode(void)
  
                 if (err) {
                         kvm_err("Cannot map hyp stack\n");
-                       goto out_free_mappings;
+                       goto out_err;
                 }
         }
  
-       /*
-        * Map the host CPU structures
-        */
-       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-       if (!kvm_host_cpu_state) {
-               err = -ENOMEM;
-               kvm_err("Cannot allocate host CPU state\n");
-               goto out_free_mappings;
-       }
-
         for_each_possible_cpu(cpu) {
                 kvm_cpu_context_t *cpu_ctxt;
  
@@ -1096,46 +1243,24 @@ static int init_hyp_mode(void)
  
                 if (err) {
                         kvm_err("Cannot map host CPU state: %d\n", err);
-                       goto out_free_context;
+                       goto out_err;
                 }
         }
  
-       /*
-        * Execute the init code on each CPU.
-        */
-       on_each_cpu(cpu_init_hyp_mode, NULL, 1);
-
-       /*
-        * Init HYP view of VGIC
-        */
-       err = kvm_vgic_hyp_init();
-       if (err)
-               goto out_free_context;
-
-       /*
-        * Init HYP architected timer support
-        */
-       err = kvm_timer_hyp_init();
-       if (err)
-               goto out_free_context;
-
  #ifndef CONFIG_HOTPLUG_CPU
         free_boot_hyp_pgd();
  #endif
  
-       kvm_perf_init();
+       /* set size of VMID supported by CPU */
+       kvm_vmid_bits = kvm_get_vmid_bits();
+       kvm_info("%d-bit VMID\n", kvm_vmid_bits);
  
         kvm_info("Hyp mode initialized successfully\n");
  
         return 0;
-out_free_context:
-       free_percpu(kvm_host_cpu_state);
-out_free_mappings:
-       free_hyp_pgds();
-out_free_stack_pages:
-       for_each_possible_cpu(cpu)
-               free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+
  out_err:
+       teardown_hyp_mode();
         kvm_err("error initializing Hyp mode: %d\n", err);
         return err;
  }
@@ -1179,26 +1304,27 @@ int kvm_arch_init(void *opaque)
                 }
         }
  
-       cpu_notifier_register_begin();
-
-       err = init_hyp_mode();
+       err = init_common_resources();
         if (err)
-               goto out_err;
+               return err;
  
-       err = __register_cpu_notifier(&hyp_init_cpu_nb);
-       if (err) {
-               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+       if (is_kernel_in_hyp_mode())
+               err = init_vhe_mode();
+       else
+               err = init_hyp_mode();
+       if (err)
                 goto out_err;
-       }
-
-       cpu_notifier_register_done();
  
-       hyp_cpu_pm_init();
+       err = init_subsystems();
+       if (err)
+               goto out_hyp;
  
-       kvm_coproc_table_init();
         return 0;
+
+out_hyp:
+       teardown_hyp_mode();
  out_err:
-       cpu_notifier_register_done();
+       teardown_common_resources();
         return err;
  }
  
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c

index d6c005283678fe5061a50cc8f5efd1febcc0f27b..dc99159857b4ae70d7d3785b75a1c3f0f8639906 100644 (file)
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -275,6 +275,40 @@ static u32 exc_vector_base(struct kvm_vcpu *vcpu)
                 return vbar;
  }
  
+/*
+ * Switch to an exception mode, updating both CPSR and SPSR. Follow
+ * the logic described in AArch32.EnterMode() from the ARMv8 ARM.
+ */
+static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode)
+{
+       unsigned long cpsr = *vcpu_cpsr(vcpu);
+       u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
+
+       *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode;
+
+       switch (mode) {
+       case FIQ_MODE:
+               *vcpu_cpsr(vcpu) |= PSR_F_BIT;
+               /* Fall through */
+       case ABT_MODE:
+       case IRQ_MODE:
+               *vcpu_cpsr(vcpu) |= PSR_A_BIT;
+               /* Fall through */
+       default:
+               *vcpu_cpsr(vcpu) |= PSR_I_BIT;
+       }
+
+       *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
+
+       if (sctlr & SCTLR_TE)
+               *vcpu_cpsr(vcpu) |= PSR_T_BIT;
+       if (sctlr & SCTLR_EE)
+               *vcpu_cpsr(vcpu) |= PSR_E_BIT;
+
+       /* Note: These now point to the mode banked copies */
+       *vcpu_spsr(vcpu) = cpsr;
+}
+
  /**
   * kvm_inject_undefined - inject an undefined exception into the guest
   * @vcpu: The VCPU to receive the undefined exception
@@ -286,29 +320,13 @@ static u32 exc_vector_base(struct kvm_vcpu *vcpu)
   */
  void kvm_inject_undefined(struct kvm_vcpu *vcpu)
  {
-       unsigned long new_lr_value;
-       unsigned long new_spsr_value;
         unsigned long cpsr = *vcpu_cpsr(vcpu);
-       u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
         bool is_thumb = (cpsr & PSR_T_BIT);
         u32 vect_offset = 4;
         u32 return_offset = (is_thumb) ? 2 : 4;
  
-       new_spsr_value = cpsr;
-       new_lr_value = *vcpu_pc(vcpu) - return_offset;
-
-       *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | UND_MODE;
-       *vcpu_cpsr(vcpu) |= PSR_I_BIT;
-       *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
-       if (sctlr & SCTLR_TE)
-               *vcpu_cpsr(vcpu) |= PSR_T_BIT;
-       if (sctlr & SCTLR_EE)
-               *vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
-       /* Note: These now point to UND banked copies */
-       *vcpu_spsr(vcpu) = cpsr;
-       *vcpu_reg(vcpu, 14) = new_lr_value;
+       kvm_update_psr(vcpu, UND_MODE);
+       *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) - return_offset;
  
         /* Branch to exception vector */
         *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset;
@@ -320,30 +338,14 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu)
   */
  static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
  {
-       unsigned long new_lr_value;
-       unsigned long new_spsr_value;
         unsigned long cpsr = *vcpu_cpsr(vcpu);
-       u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
         bool is_thumb = (cpsr & PSR_T_BIT);
         u32 vect_offset;
         u32 return_offset = (is_thumb) ? 4 : 0;
         bool is_lpae;
  
-       new_spsr_value = cpsr;
-       new_lr_value = *vcpu_pc(vcpu) + return_offset;
-
-       *vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | ABT_MODE;
-       *vcpu_cpsr(vcpu) |= PSR_I_BIT | PSR_A_BIT;
-       *vcpu_cpsr(vcpu) &= ~(PSR_IT_MASK | PSR_J_BIT | PSR_E_BIT | PSR_T_BIT);
-
-       if (sctlr & SCTLR_TE)
-               *vcpu_cpsr(vcpu) |= PSR_T_BIT;
-       if (sctlr & SCTLR_EE)
-               *vcpu_cpsr(vcpu) |= PSR_E_BIT;
-
-       /* Note: These now point to ABT banked copies */
-       *vcpu_spsr(vcpu) = cpsr;
-       *vcpu_reg(vcpu, 14) = new_lr_value;
+       kvm_update_psr(vcpu, ABT_MODE);
+       *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
  
         if (is_pabt)
                 vect_offset = 12;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c

index 11b6595c26723b511341214e13e1a92fe214a0b2..767872411d97bbafddd9d4ee7c255db61a36c5c6 100644 (file)
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -28,6 +28,7 @@
  #include <asm/kvm_mmio.h>
  #include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
+#include <asm/virt.h>
  
  #include "trace.h"
  
@@ -598,6 +599,9 @@ int create_hyp_mappings(void *from, void *to)
         unsigned long start = KERN_TO_HYP((unsigned long)from);
         unsigned long end = KERN_TO_HYP((unsigned long)to);
  
+       if (is_kernel_in_hyp_mode())
+               return 0;
+
         start = start & PAGE_MASK;
         end = PAGE_ALIGN(end);
  
@@ -630,6 +634,9 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
         unsigned long start = KERN_TO_HYP((unsigned long)from);
         unsigned long end = KERN_TO_HYP((unsigned long)to);
  
+       if (is_kernel_in_hyp_mode())
+               return 0;
+
         /* Check for a valid kernel IO mapping */
         if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
                 return -EINVAL;
@@ -656,9 +663,9 @@ static void *kvm_alloc_hwpgd(void)
   * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
   * @kvm:       The KVM struct pointer for the VM.
   *
- * Allocates the 1st level table only of size defined by S2_PGD_ORDER (can
- * support either full 40-bit input addresses or limited to 32-bit input
- * addresses). Clears the allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) (can support either full
+ * 40-bit input addresses or limited to 32-bit input addresses). Clears the
+ * allocated pages.
   *
   * Note we don't need locking here as this is only called when the VM is
   * created, which can only be done once.
@@ -1648,6 +1655,11 @@ phys_addr_t kvm_get_idmap_vector(void)
         return hyp_idmap_vector;
  }
  
+phys_addr_t kvm_get_idmap_start(void)
+{
+       return hyp_idmap_start;
+}
+
  int kvm_mmu_init(void)
  {
         int err;
diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S

index b2b97e3e7babbbb37072b4185faa27a0e47760c8..a62a7b64f49c52706b8e133b1f786dc9dc34842d 100644 (file)
--- a/arch/arm/vdso/vdso.S
+++ b/arch/arm/vdso/vdso.S
@@ -23,9 +23,8 @@
  #include <linux/const.h>
  #include <asm/page.h>
  
-       __PAGE_ALIGNED_DATA
-
         .globl vdso_start, vdso_end
+       .section .data..ro_after_init
         .balign PAGE_SIZE
  vdso_start:
         .incbin "arch/arm/vdso/vdso.so"
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig

index 14cdc6dea4939f9875b8332fea4fe625f5d7f278..3510b01acc8cab9486d1f88bcb4076b6122941e4 100644 (file)
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -13,6 +13,7 @@ config ARM64
         select ARCH_WANT_OPTIONAL_GPIOLIB
         select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
         select ARCH_WANT_FRAME_POINTERS
+       select ARCH_HAS_UBSAN_SANITIZE_ALL
         select ARM_AMBA
         select ARM_ARCH_TIMER
         select ARM_GIC
@@ -48,6 +49,8 @@ config ARM64
         select HAVE_ALIGNED_STRUCT_PAGE if SLUB
         select HAVE_ARCH_AUDITSYSCALL
         select HAVE_ARCH_BITREVERSE
+       select HAVE_ARCH_HARDENED_USERCOPY
+       select HAVE_ARCH_HUGE_VMAP
         select HAVE_ARCH_JUMP_LABEL
         select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
         select HAVE_ARCH_KGDB
@@ -70,13 +73,17 @@ config ARM64
         select HAVE_FUNCTION_GRAPH_TRACER
         select HAVE_GENERIC_DMA_COHERENT
         select HAVE_HW_BREAKPOINT if PERF_EVENTS
+       select HAVE_IRQ_TIME_ACCOUNTING
         select HAVE_MEMBLOCK
         select HAVE_PATA_PLATFORM
         select HAVE_PERF_EVENTS
         select HAVE_PERF_REGS
         select HAVE_PERF_USER_STACK_DUMP
+       select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_RCU_TABLE_FREE
         select HAVE_SYSCALL_TRACEPOINTS
+       select HAVE_KPROBES
+       select HAVE_KRETPROBES if HAVE_KPROBES
         select IOMMU_DMA if IOMMU_SUPPORT
         select IRQ_DOMAIN
         select IRQ_FORCED_THREADING
@@ -92,6 +99,7 @@ config ARM64
         select SPARSE_IRQ
         select SYSCTL_EXCEPTION_TRACE
         select HAVE_CONTEXT_TRACKING
+       select HAVE_ARM_SMCCC
         help
           ARM 64-bit (AArch64) Linux support.
  
@@ -362,6 +370,7 @@ config ARM64_ERRATUM_843419
         bool "Cortex-A53: 843419: A load or store might access an incorrect address"
         depends on MODULES
         default y
+       select ARM64_MODULE_CMODEL_LARGE
         help
           This option builds kernel modules using the large memory model in
           order to avoid the use of the ADRP instruction, which can cause
@@ -526,6 +535,9 @@ config HOTPLUG_CPU
  source kernel/Kconfig.preempt
  source kernel/Kconfig.hz
  
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       def_bool y
+
  config ARCH_HAS_HOLES_MEMORYMODEL
         def_bool y if SPARSEMEM
  
@@ -549,9 +561,6 @@ config HW_PERF_EVENTS
  config SYS_SUPPORTS_HUGETLBFS
         def_bool y
  
-config ARCH_WANT_GENERAL_HUGETLB
-       def_bool y
-
  config ARCH_WANT_HUGE_PMD_SHARE
         def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
  
@@ -724,10 +733,93 @@ config ARM64_LSE_ATOMICS
  
  endmenu
  
+config ARM64_UAO
+       bool "Enable support for User Access Override (UAO)"
+       default y
+       help
+         User Access Override (UAO; part of the ARMv8.2 Extensions)
+         causes the 'unprivileged' variant of the load/store instructions to
+         be overriden to be privileged.
+
+         This option changes get_user() and friends to use the 'unprivileged'
+         variant of the load/store instructions. This ensures that user-space
+         really did have access to the supplied memory. When addr_limit is
+         set to kernel memory the UAO bit will be set, allowing privileged
+         access to kernel memory.
+
+         Choosing this option will cause copy_to_user() et al to use user-space
+         memory permissions.
+
+         The feature is detected at runtime, the kernel will use the
+         regular load/store instructions if the cpu does not implement the
+         feature.
+
+config ARM64_MODULE_CMODEL_LARGE
+       bool
+
+config ARM64_MODULE_PLTS
+       bool
+       select ARM64_MODULE_CMODEL_LARGE
+       select HAVE_MOD_ARCH_SPECIFIC
+
+config RELOCATABLE
+       bool
+       help
+         This builds the kernel as a Position Independent Executable (PIE),
+         which retains all relocation metadata required to relocate the
+         kernel binary at runtime to a different virtual address than the
+         address it was linked at.
+         Since AArch64 uses the RELA relocation format, this requires a
+         relocation pass at runtime even if the kernel is loaded at the
+         same address it was linked at.
+
+config RANDOMIZE_BASE
+       bool "Randomize the address of the kernel image"
+       select ARM64_MODULE_PLTS
+       select RELOCATABLE
+       help
+         Randomizes the virtual address at which the kernel image is
+         loaded, as a security feature that deters exploit attempts
+         relying on knowledge of the location of kernel internals.
+
+         It is the bootloader's job to provide entropy, by passing a
+         random u64 value in /chosen/kaslr-seed at kernel entry.
+
+         When booting via the UEFI stub, it will invoke the firmware's
+         EFI_RNG_PROTOCOL implementation (if available) to supply entropy
+         to the kernel proper. In addition, it will randomise the physical
+         location of the kernel Image as well.
+
+         If unsure, say N.
+
+config RANDOMIZE_MODULE_REGION_FULL
+       bool "Randomize the module region independently from the core kernel"
+       depends on RANDOMIZE_BASE
+       default y
+       help
+         Randomizes the location of the module region without considering the
+         location of the core kernel. This way, it is impossible for modules
+         to leak information about the location of core kernel data structures
+         but it does imply that function calls between modules and the core
+         kernel will need to be resolved via veneers in the module PLT.
+
+         When this option is not set, the module region will be randomized over
+         a limited range that contains the [_stext, _etext] interval of the
+         core kernel, so branch relocations are always in range.
+
  endmenu
  
  menu "Boot options"
  
+config ARM64_ACPI_PARKING_PROTOCOL
+       bool "Enable support for the ARM64 ACPI parking protocol"
+       depends on ACPI
+       help
+         Enable support for the ARM64 ACPI parking protocol. If disabled
+         the kernel will not allow booting through the ARM64 ACPI parking
+         protocol even if the corresponding data is present in the ACPI
+         MADT table.
+
  config CMDLINE
         string "Default kernel command string"
         default ""
@@ -810,6 +902,14 @@ menu "Power management options"
  
  source "kernel/power/Kconfig"
  
+config ARCH_HIBERNATION_POSSIBLE
+       def_bool y
+       depends on CPU_PM
+
+config ARCH_HIBERNATION_HEADER
+       def_bool y
+       depends on HIBERNATION
+
  config ARCH_SUSPEND_POSSIBLE
         def_bool y
  
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug

index 04fb73b973f155b477425ff1841e370f0ba87d41..ab1cb1fc4e3d6c2891fd0381463020795bfb7b0e 100644 (file)
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -73,7 +73,7 @@ config DEBUG_RODATA
            If in doubt, say Y
  
  config DEBUG_ALIGN_RODATA
-       depends on DEBUG_RODATA && ARM64_4K_PAGES
+       depends on DEBUG_RODATA
         bool "Align linker sections up to SECTION_SIZE"
         help
           If this option is enabled, sections that may potentially be marked as
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile

index b6c90e5006e45ae01654c3b5867c3a49910dc54e..0a9bf4500852e988688cccb4f848cadecfe47ea6 100644 (file)
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -15,6 +15,10 @@ CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
  OBJCOPYFLAGS   :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
  GZFLAGS                :=-9
  
+ifneq ($(CONFIG_RELOCATABLE),)
+LDFLAGS_vmlinux                += -pie -Bsymbolic
+endif
+
  KBUILD_DEFCONFIG := defconfig
  
  # Check for binutils support for specific extensions
@@ -28,6 +32,7 @@ endif
  
  KBUILD_CFLAGS  += -mgeneral-regs-only $(lseinstr)
  KBUILD_CFLAGS  += $(call cc-option, -mpc-relative-literal-loads)
+KBUILD_CFLAGS  += -fno-asynchronous-unwind-tables
  KBUILD_AFLAGS  += $(lseinstr)
  
  ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
@@ -42,10 +47,14 @@ endif
  
  CHECKFLAGS     += -D__aarch64__
  
-ifeq ($(CONFIG_ARM64_ERRATUM_843419), y)
+ifeq ($(CONFIG_ARM64_MODULE_CMODEL_LARGE), y)
  KBUILD_CFLAGS_MODULE   += -mcmodel=large
  endif
  
+ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
+KBUILD_LDFLAGS_MODULE  += -T $(srctree)/arch/arm64/kernel/module.lds
+endif
+
  # Default value
  head-y         := arch/arm64/kernel/head.o
  
diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts

index 53442b5ee4ff99170056ddb15eee296461d167a0..3e1a84b01b500807fea638858ebb2c879a5dae94 100644 (file)
--- a/arch/arm64/boot/dts/arm/juno.dts
+++ b/arch/arm64/boot/dts/arm/juno.dts
@@ -143,5 +143,310 @@
                                      <&A53_3>;
         };
  
+       etr@20070000 {
+               compatible = "arm,coresight-tmc", "arm,primecell";
+               reg = <0 0x20070000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       etr_in_port: endpoint {
+                               slave-mode;
+                               remote-endpoint = <&replicator_out_port1>;
+                       };
+               };
+       };
+
+       tpiu@20030000 {
+               compatible = "arm,coresight-tpiu", "arm,primecell";
+               reg = <0 0x20030000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       tpiu_in_port: endpoint {
+                               slave-mode;
+                               remote-endpoint = <&replicator_out_port0>;
+                       };
+               };
+       };
+
+       replicator@20020000 {
+               /* non-configurable replicators don't show up on the
+                * AMBA bus.  As such no need to add "arm,primecell".
+                */
+               compatible = "arm,coresight-replicator";
+
+               ports {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       /* replicator output ports */
+                       port@0 {
+                               reg = <0>;
+                               replicator_out_port0: endpoint {
+                                       remote-endpoint = <&tpiu_in_port>;
+                               };
+                       };
+
+                       port@1 {
+                               reg = <1>;
+                               replicator_out_port1: endpoint {
+                                       remote-endpoint = <&etr_in_port>;
+                               };
+                       };
+
+                       /* replicator input port */
+                       port@2 {
+                               reg = <0>;
+                               replicator_in_port0: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&etf_out_port>;
+                               };
+                       };
+               };
+       };
+
+       etf@20010000 {
+               compatible = "arm,coresight-tmc", "arm,primecell";
+               reg = <0 0x20010000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               ports {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       /* input port */
+                       port@0 {
+                               reg = <0>;
+                               etf_in_port: endpoint {
+                                       slave-mode;
+                                       remote-endpoint =
+                                               <&main_funnel_out_port>;
+                               };
+                       };
+
+                       /* output port */
+                       port@1 {
+                               reg = <0>;
+                               etf_out_port: endpoint {
+                                       remote-endpoint =
+                                               <&replicator_in_port0>;
+                               };
+                       };
+               };
+       };
+
+       main_funnel@20040000 {
+               compatible = "arm,coresight-funnel", "arm,primecell";
+               reg = <0 0x20040000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               ports {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       port@0 {
+                               reg = <0>;
+                               main_funnel_out_port: endpoint {
+                                       remote-endpoint =
+                                               <&etf_in_port>;
+                               };
+                       };
+
+                       port@1 {
+                               reg = <0>;
+                               main_funnel_in_port0: endpoint {
+                                       slave-mode;
+                                       remote-endpoint =
+                                               <&A72_57_funnel_out_port>;
+                               };
+                       };
+
+                       port@2 {
+                               reg = <1>;
+                               main_funnel_in_port1: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&A53_funnel_out_port>;
+                               };
+                       };
+
+               };
+       };
+
+       A72_57_funnel@220c0000 {
+               compatible = "arm,coresight-funnel", "arm,primecell";
+               reg = <0 0x220c0000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               ports {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       port@0 {
+                               reg = <0>;
+                               A72_57_funnel_out_port: endpoint {
+                                       remote-endpoint =
+                                               <&main_funnel_in_port0>;
+                               };
+                       };
+
+                       port@1 {
+                               reg = <0>;
+                               A72_57_funnel_in_port0: endpoint {
+                                       slave-mode;
+                                       remote-endpoint =
+                                                       <&A72_57_etm0_out_port>;
+                               };
+                       };
+
+                       port@2 {
+                               reg = <1>;
+                               A72_57_funnel_in_port1: endpoint {
+                                       slave-mode;
+                                       remote-endpoint =
+                                                       <&A72_57_etm1_out_port>;
+                               };
+                       };
+               };
+       };
+
+       A53_funnel@230c0000 {
+               compatible = "arm,coresight-funnel", "arm,primecell";
+               reg = <0 0x230c0000 0 0x1000>;
+
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               ports {
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       port@0 {
+                               reg = <0>;
+                               A53_funnel_out_port: endpoint {
+                                       remote-endpoint =
+                                               <&main_funnel_in_port1>;
+                               };
+                       };
+
+                       port@1 {
+                               reg = <0>;
+                               A53_funnel_in_port0: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&A53_etm0_out_port>;
+                               };
+                       };
+
+                       port@2 {
+                               reg = <1>;
+                               A53_funnel_in_port1: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&A53_etm1_out_port>;
+                               };
+                       };
+                       port@3 {
+                               reg = <2>;
+                               A53_funnel_in_port2: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&A53_etm2_out_port>;
+                               };
+                       };
+                       port@4 {
+                               reg = <3>;
+                               A53_funnel_in_port3: endpoint {
+                                       slave-mode;
+                                       remote-endpoint = <&A53_etm3_out_port>;
+                               };
+                       };
+               };
+       };
+
+       etm@22040000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x22040000 0 0x1000>;
+
+               cpu = <&A57_0>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A72_57_etm0_out_port: endpoint {
+                               remote-endpoint = <&A72_57_funnel_in_port0>;
+                       };
+               };
+       };
+
+       etm@22140000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x22140000 0 0x1000>;
+
+               cpu = <&A57_1>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A72_57_etm1_out_port: endpoint {
+                               remote-endpoint = <&A72_57_funnel_in_port1>;
+                       };
+               };
+       };
+
+       etm@23040000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x23040000 0 0x1000>;
+
+               cpu = <&A53_0>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A53_etm0_out_port: endpoint {
+                               remote-endpoint = <&A53_funnel_in_port0>;
+                       };
+               };
+       };
+
+       etm@23140000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x23140000 0 0x1000>;
+
+               cpu = <&A53_1>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A53_etm1_out_port: endpoint {
+                               remote-endpoint = <&A53_funnel_in_port1>;
+                       };
+               };
+       };
+
+       etm@23240000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x23240000 0 0x1000>;
+
+               cpu = <&A53_2>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A53_etm2_out_port: endpoint {
+                               remote-endpoint = <&A53_funnel_in_port2>;
+                       };
+               };
+       };
+
+       etm@23340000 {
+               compatible = "arm,coresight-etm4x", "arm,primecell";
+               reg = <0 0x23340000 0 0x1000>;
+
+               cpu = <&A53_3>;
+               clocks = <&soc_smc50mhz>;
+               clock-names = "apb_pclk";
+               port {
+                       A53_etm3_out_port: endpoint {
+                               remote-endpoint = <&A53_funnel_in_port3>;
+                       };
+               };
+       };
+
         #include "juno-base.dtsi"
  };
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig

index bdd7aa358d2a5bc232dc510e789e61939416f700..79717faf2161f8235b206133a0f00813156a6040 100644 (file)
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -225,3 +225,6 @@ CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
  CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
  CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y
  CONFIG_CRYPTO_CRC32_ARM64=y
+CONFIG_HIBERNATION=y
+CONFIG_KPROBES=y
+CONFIG_CORESIGHT=y
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h

index caafd63b8092d8102401112d811b1055f4cc3524..aee323b13802ad143e9774d2076a4b1c63731ece 100644 (file)
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -87,9 +87,26 @@ void __init acpi_init_cpus(void);
  static inline void acpi_init_cpus(void) { }
  #endif /* CONFIG_ACPI */
  
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+bool acpi_parking_protocol_valid(int cpu);
+void __init
+acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor);
+#else
+static inline bool acpi_parking_protocol_valid(int cpu) { return false; }
+static inline void
+acpi_set_mailbox_entry(int cpu, struct acpi_madt_generic_interrupt *processor)
+{}
+#endif
+
  static inline const char *acpi_get_enable_method(int cpu)
  {
-       return acpi_psci_present() ? "psci" : NULL;
+       if (acpi_psci_present())
+               return "psci";
+
+       if (acpi_parking_protocol_valid(cpu))
+               return "parking-protocol";
+
+       return NULL;
  }
  
  #ifdef CONFIG_ACPI_APEI
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h

index d56ec07151570e6e498e9bf35b2fd9ccaed335fe..beccbdefa106a4ba0dd7d98537427a37d2c26517 100644 (file)
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -1,6 +1,8 @@
  #ifndef __ASM_ALTERNATIVE_H
  #define __ASM_ALTERNATIVE_H
  
+#include <asm/cpufeature.h>
+
  #ifndef __ASSEMBLY__
  
  #include <linux/init.h>
@@ -19,7 +21,6 @@ struct alt_instr {
  
  void __init apply_alternatives_all(void);
  void apply_alternatives(void *start, size_t length);
-void free_alternatives_memory(void);
  
  #define ALTINSTR_ENTRY(feature)                                                      \
         " .word 661b - .\n"                             /* label           */ \
@@ -64,6 +65,8 @@ void free_alternatives_memory(void);
  
  #else
  
+#include <asm/assembler.h>
+
  .macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len
         .word \orig_offset - .
         .word \alt_offset - .
@@ -137,6 +140,65 @@ void free_alternatives_memory(void);
         alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)
  
  
+/*
+ * Generate the assembly for UAO alternatives with exception table entries.
+ * This is complicated as there are no post-increment or pair versions of the
+ * unprivileged instructions, and USER() only works for single instructions.
+ */
+#ifdef CONFIG_ARM64_UAO
+       .macro uao_ldp l, reg1, reg2, addr, post_inc
+               alternative_if_not ARM64_HAS_UAO
+8888:                  ldp     \reg1, \reg2, [\addr], \post_inc;
+8889:                  nop;
+                       nop;
+               alternative_else
+                       ldtr    \reg1, [\addr];
+                       ldtr    \reg2, [\addr, #8];
+                       add     \addr, \addr, \post_inc;
+               alternative_endif
+
+               _asm_extable    8888b,\l;
+               _asm_extable    8889b,\l;
+       .endm
+
+       .macro uao_stp l, reg1, reg2, addr, post_inc
+               alternative_if_not ARM64_HAS_UAO
+8888:                  stp     \reg1, \reg2, [\addr], \post_inc;
+8889:                  nop;
+                       nop;
+               alternative_else
+                       sttr    \reg1, [\addr];
+                       sttr    \reg2, [\addr, #8];
+                       add     \addr, \addr, \post_inc;
+               alternative_endif
+
+               _asm_extable    8888b,\l;
+               _asm_extable    8889b,\l;
+       .endm
+
+       .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+               alternative_if_not ARM64_HAS_UAO
+8888:                  \inst   \reg, [\addr], \post_inc;
+                       nop;
+               alternative_else
+                       \alt_inst       \reg, [\addr];
+                       add             \addr, \addr, \post_inc;
+               alternative_endif
+
+               _asm_extable    8888b,\l;
+       .endm
+#else
+       .macro uao_ldp l, reg1, reg2, addr, post_inc
+               USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
+       .endm
+       .macro uao_stp l, reg1, reg2, addr, post_inc
+               USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
+       .endm
+       .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
+               USER(\l, \inst \reg, [\addr], \post_inc)
+       .endm
+#endif
+
  #endif  /*  __ASSEMBLY__  */
  
  /*
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h

index 12eff928ef8b38dd18ae3bd157b12eb918f797a6..290e13428f4a1c6b1b9425938a80dcd69460e592 100644 (file)
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -1,5 +1,5 @@
  /*
- * Based on arch/arm/include/asm/assembler.h
+ * Based on arch/arm/include/asm/assembler.h, arch/arm/mm/proc-macros.S
   *
   * Copyright (C) 1996-2000 Russell King
   * Copyright (C) 2012 ARM Ltd.
@@ -23,6 +23,9 @@
  #ifndef __ASM_ASSEMBLER_H
  #define __ASM_ASSEMBLER_H
  
+#include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/pgtable-hwdef.h>
  #include <asm/ptrace.h>
  #include <asm/thread_info.h>
  
@@ -94,12 +97,19 @@
         dmb     \opt
         .endm
  
+/*
+ * Emit an entry into the exception table
+ */
+       .macro          _asm_extable, from, to
+       .pushsection    __ex_table, "a"
+       .align          3
+       .long           (\from - .), (\to - .)
+       .popsection
+       .endm
+
  #define USER(l, x...)                          \
  9999:  x;                                      \
-       .section __ex_table,"a";                \
-       .align  3;                              \
-       .quad   9999b,l;                        \
-       .previous
+       _asm_extable    9999b, l
  
  /*
   * Register aliases.
@@ -193,6 +203,113 @@ lr        .req    x30             // link register
         str     \src, [\tmp, :lo12:\sym]
         .endm
  
+       /*
+        * @sym: The name of the per-cpu variable
+        * @reg: Result of per_cpu(sym, smp_processor_id())
+        * @tmp: scratch register
+        */
+       .macro this_cpu_ptr, sym, reg, tmp
+       adr_l   \reg, \sym
+       mrs     \tmp, tpidr_el1
+       add     \reg, \reg, \tmp
+       .endm
+
+/*
+ * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
+ */
+       .macro  vma_vm_mm, rd, rn
+       ldr     \rd, [\rn, #VMA_VM_MM]
+       .endm
+
+/*
+ * mmid - get context id from mm pointer (mm->context.id)
+ */
+       .macro  mmid, rd, rn
+       ldr     \rd, [\rn, #MM_CONTEXT_ID]
+       .endm
+
+/*
+ * dcache_line_size - get the minimum D-cache line size from the CTR register.
+ */
+       .macro  dcache_line_size, reg, tmp
+       mrs     \tmp, ctr_el0                   // read CTR
+       ubfm    \tmp, \tmp, #16, #19            // cache line size encoding
+       mov     \reg, #4                        // bytes per word
+       lsl     \reg, \reg, \tmp                // actual cache line size
+       .endm
+
+/*
+ * icache_line_size - get the minimum I-cache line size from the CTR register.
+ */
+       .macro  icache_line_size, reg, tmp
+       mrs     \tmp, ctr_el0                   // read CTR
+       and     \tmp, \tmp, #0xf                // cache line size encoding
+       mov     \reg, #4                        // bytes per word
+       lsl     \reg, \reg, \tmp                // actual cache line size
+       .endm
+
+/*
+ * tcr_set_idmap_t0sz - update TCR.T0SZ so that we can load the ID map
+ */
+       .macro  tcr_set_idmap_t0sz, valreg, tmpreg
+#ifndef CONFIG_ARM64_VA_BITS_48
+       ldr_l   \tmpreg, idmap_t0sz
+       bfi     \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH
+#endif
+       .endm
+
+/*
+ * Macro to perform a data cache maintenance for the interval
+ * [kaddr, kaddr + size)
+ *
+ *     op:             operation passed to dc instruction
+ *     domain:         domain used in dsb instruction
+ *     kaddr:          starting virtual address of the region
+ *     size:           size of the region
+ *     Corrupts:       kaddr, size, tmp1, tmp2
+ */
+       .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+       dcache_line_size \tmp1, \tmp2
+       add     \size, \kaddr, \size
+       sub     \tmp2, \tmp1, #1
+       bic     \kaddr, \kaddr, \tmp2
+9998:  dc      \op, \kaddr
+       add     \kaddr, \kaddr, \tmp1
+       cmp     \kaddr, \size
+       b.lo    9998b
+       dsb     \domain
+       .endm
+
+/*
+ * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present
+ */
+       .macro  reset_pmuserenr_el0, tmpreg
+       mrs     \tmpreg, id_aa64dfr0_el1        // Check ID_AA64DFR0_EL1 PMUVer
+       sbfx    \tmpreg, \tmpreg, #8, #4
+       cmp     \tmpreg, #1                     // Skip if no PMU present
+       b.lt    9000f
+       msr     pmuserenr_el0, xzr              // Disable PMU access from EL0
+9000:
+       .endm
+
+/*
+ * copy_page - copy src to dest using temp registers t1-t8
+ */
+       .macro copy_page dest:req src:req t1:req t2:req t3:req t4:req t5:req t6:req t7:req t8:req
+9998:  ldp     \t1, \t2, [\src]
+       ldp     \t3, \t4, [\src, #16]
+       ldp     \t5, \t6, [\src, #32]
+       ldp     \t7, \t8, [\src, #48]
+       add     \src, \src, #64
+       stnp    \t1, \t2, [\dest]
+       stnp    \t3, \t4, [\dest, #16]
+       stnp    \t5, \t6, [\dest, #32]
+       stnp    \t7, \t8, [\dest, #48]
+       add     \dest, \dest, #64
+       tst     \src, #(PAGE_SIZE - 1)
+       b.ne    9998b
+       .endm
+
  /*
   * Annotate a function as position independent, i.e., safe to be called before
   * the kernel virtual mapping is activated.
@@ -204,4 +321,35 @@ lr .req    x30             // link register
         .size   __pi_##x, . - x;        \
         ENDPROC(x)
  
+       /*
+        * Emit a 64-bit absolute little endian symbol reference in a way that
+        * ensures that it will be resolved at build time, even when building a
+        * PIE binary. This requires cooperation from the linker script, which
+        * must emit the lo32/hi32 halves individually.
+        */
+       .macro  le64sym, sym
+       .long   \sym\()_lo32
+       .long   \sym\()_hi32
+       .endm
+
+       /*
+        * mov_q - move an immediate constant into a 64-bit register using
+        *         between 2 and 4 movz/movk instructions (depending on the
+        *         magnitude and sign of the operand)
+        */
+       .macro  mov_q, reg, val
+       .if (((\val) >> 31) == 0 || ((\val) >> 31) == 0x1ffffffff)
+       movz    \reg, :abs_g1_s:\val
+       .else
+       .if (((\val) >> 47) == 0 || ((\val) >> 47) == 0x1ffff)
+       movz    \reg, :abs_g2_s:\val
+       .else
+       movz    \reg, :abs_g3:\val
+       movk    \reg, :abs_g2_nc:\val
+       .endif
+       movk    \reg, :abs_g1_nc:\val
+       .endif
+       movk    \reg, :abs_g0_nc:\val
+       .endm
+
  #endif /* __ASM_ASSEMBLER_H */
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h

index 197e06afbf71947eb505a893e53b73be73543be9..39c1d340fec59136b8ddd6e3ac3f39354289189f 100644 (file)
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -36,7 +36,7 @@ static inline void atomic_andnot(int i, atomic_t *v)
         "       stclr   %w[i], %[v]\n")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic_or(int i, atomic_t *v)
@@ -48,7 +48,7 @@ static inline void atomic_or(int i, atomic_t *v)
         "       stset   %w[i], %[v]\n")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic_xor(int i, atomic_t *v)
@@ -60,7 +60,7 @@ static inline void atomic_xor(int i, atomic_t *v)
         "       steor   %w[i], %[v]\n")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic_add(int i, atomic_t *v)
@@ -72,7 +72,7 @@ static inline void atomic_add(int i, atomic_t *v)
         "       stadd   %w[i], %[v]\n")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)                          \
@@ -90,7 +90,7 @@ static inline int atomic_add_return##name(int i, atomic_t *v)         \
         "       add     %w[i], %w[i], w30")                             \
         : [i] "+r" (w0), [v] "+Q" (v->counter)                          \
         : "r" (x1)                                                      \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS, ##cl);                                      \
                                                                         \
         return w0;                                                      \
  }
@@ -116,7 +116,7 @@ static inline void atomic_and(int i, atomic_t *v)
         "       stclr   %w[i], %[v]")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic_sub(int i, atomic_t *v)
@@ -133,7 +133,7 @@ static inline void atomic_sub(int i, atomic_t *v)
         "       stadd   %w[i], %[v]")
         : [i] "+r" (w0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  #define ATOMIC_OP_SUB_RETURN(name, mb, cl...)                          \
@@ -153,7 +153,7 @@ static inline int atomic_sub_return##name(int i, atomic_t *v)               \
         "       add     %w[i], %w[i], w30")                             \
         : [i] "+r" (w0), [v] "+Q" (v->counter)                          \
         : "r" (x1)                                                      \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS , ##cl);                                     \
                                                                         \
         return w0;                                                      \
  }
@@ -177,7 +177,7 @@ static inline void atomic64_andnot(long i, atomic64_t *v)
         "       stclr   %[i], %[v]\n")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic64_or(long i, atomic64_t *v)
@@ -189,7 +189,7 @@ static inline void atomic64_or(long i, atomic64_t *v)
         "       stset   %[i], %[v]\n")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic64_xor(long i, atomic64_t *v)
@@ -201,7 +201,7 @@ static inline void atomic64_xor(long i, atomic64_t *v)
         "       steor   %[i], %[v]\n")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic64_add(long i, atomic64_t *v)
@@ -213,7 +213,7 @@ static inline void atomic64_add(long i, atomic64_t *v)
         "       stadd   %[i], %[v]\n")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  #define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)                                \
@@ -231,7 +231,7 @@ static inline long atomic64_add_return##name(long i, atomic64_t *v) \
         "       add     %[i], %[i], x30")                               \
         : [i] "+r" (x0), [v] "+Q" (v->counter)                          \
         : "r" (x1)                                                      \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS, ##cl);                                      \
                                                                         \
         return x0;                                                      \
  }
@@ -257,7 +257,7 @@ static inline void atomic64_and(long i, atomic64_t *v)
         "       stclr   %[i], %[v]")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  static inline void atomic64_sub(long i, atomic64_t *v)
@@ -274,7 +274,7 @@ static inline void atomic64_sub(long i, atomic64_t *v)
         "       stadd   %[i], %[v]")
         : [i] "+r" (x0), [v] "+Q" (v->counter)
         : "r" (x1)
-       : "x30");
+       : __LL_SC_CLOBBERS);
  }
  
  #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)                                \
@@ -294,7 +294,7 @@ static inline long atomic64_sub_return##name(long i, atomic64_t *v) \
         "       add     %[i], %[i], x30")                               \
         : [i] "+r" (x0), [v] "+Q" (v->counter)                          \
         : "r" (x1)                                                      \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS, ##cl);                                      \
                                                                         \
         return x0;                                                      \
  }
@@ -330,7 +330,7 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
         "2:")
         : [ret] "+&r" (x0), [v] "+Q" (v->counter)
         :
-       : "x30", "cc", "memory");
+       : __LL_SC_CLOBBERS, "cc", "memory");
  
         return x0;
  }
@@ -359,7 +359,7 @@ static inline unsigned long __cmpxchg_case_##name(volatile void *ptr,       \
         "       mov     %" #w "[ret], " #w "30")                        \
         : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr)             \
         : [old] "r" (x1), [new] "r" (x2)                                \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS, ##cl);                                      \
                                                                         \
         return x0;                                                      \
  }
@@ -416,7 +416,7 @@ static inline long __cmpxchg_double##name(unsigned long old1,               \
           [v] "+Q" (*(unsigned long *)ptr)                              \
         : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),             \
           [oldval1] "r" (oldval1), [oldval2] "r" (oldval2)              \
-       : "x30" , ##cl);                                                \
+       : __LL_SC_CLOBBERS, ##cl);                                      \
                                                                         \
         return x0;                                                      \
  }
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h

index 81151b67b26bf61fd756540f5643d8b5ab6882c7..ebf2481889c34848be0b34158647a4f60b770090 100644 (file)
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -11,4 +11,10 @@
  #define MIN_FDT_ALIGN          8
  #define MAX_FDT_SIZE           SZ_2M
  
+/*
+ * arm64 requires the kernel image to be placed
+ * TEXT_OFFSET bytes beyond a 2 MB aligned base
+ */
+#define MIN_KIMG_ALIGN         SZ_2M
+
  #endif
diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h

new file mode 100644 (file)

index 0000000..ed693c5
--- /dev/null
+++ b/arch/arm64/include/asm/brk-imm.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_BRK_IMM_H
+#define __ASM_BRK_IMM_H
+
+/*
+ * #imm16 values used for BRK instruction generation
+ * Allowed values for kgdb are 0x400 - 0x7ff
+ * 0x100: for triggering a fault on purpose (reserved)
+ * 0x400: for dynamic BRK instruction
+ * 0x401: for compile time BRK instruction
+ * 0x800: kernel-mode BUG() and WARN() traps
+ */
+#define FAULT_BRK_IMM                  0x100
+#define KGDB_DYN_DBG_BRK_IMM           0x400
+#define KGDB_COMPILED_DBG_BRK_IMM      0x401
+#define BUG_BRK_IMM                    0x800
+
+#endif
diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h

index 4a748ce9ba1a71241c8f13834e488cef9753dc38..561190d1588136b1f7bc207ebfd04251694ea552 100644 (file)
--- a/arch/arm64/include/asm/bug.h
+++ b/arch/arm64/include/asm/bug.h
@@ -18,7 +18,7 @@
  #ifndef _ARCH_ARM64_ASM_BUG_H
  #define _ARCH_ARM64_ASM_BUG_H
  
-#include <asm/debug-monitors.h>
+#include <asm/brk-imm.h>
  
  #ifdef CONFIG_GENERIC_BUG
  #define HAVE_ARCH_BUG
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h

index 54efedaf331fda55478d001d860d6137be5d08e8..22dda613f9c91bd3bcfe3a8aad435f7384795401 100644 (file)
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -68,6 +68,7 @@
  extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
  extern void flush_icache_range(unsigned long start, unsigned long end);
  extern void __flush_dcache_area(void *addr, size_t len);
+extern void __clean_dcache_area_pou(void *addr, size_t len);
  extern long __flush_cache_user_range(unsigned long start, unsigned long end);
  
  static inline void flush_cache_mm(struct mm_struct *mm)
@@ -155,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages);
  int set_memory_x(unsigned long addr, int numpages);
  int set_memory_nx(unsigned long addr, int numpages);
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
  #endif
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h

index 9ea611ea69df739009d0a6d432bbbedcab05284b..510c7b4044547f82750ca9295e6d35bdeb0b67bf 100644 (file)
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -19,7 +19,6 @@
  #define __ASM_CMPXCHG_H
  
  #include <linux/bug.h>
-#include <linux/mmdebug.h>
  
  #include <asm/atomic.h>
  #include <asm/barrier.h>
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h

index b5e9cee4b5f81a3498a67b934dc9157e2d4cf45b..13a6103130cd7036b889250889264847b6871249 100644 (file)
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -36,6 +36,7 @@ struct cpuinfo_arm64 {
         u64             reg_id_aa64isar1;
         u64             reg_id_aa64mmfr0;
         u64             reg_id_aa64mmfr1;
+       u64             reg_id_aa64mmfr2;
         u64             reg_id_aa64pfr0;
         u64             reg_id_aa64pfr1;
  
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h

index 8884b5d5f48c204fd215a490240586320340daed..1695f77d8bf2fa73f3ed464c4cb3f6d9f75da382 100644 (file)
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,9 +30,13 @@
  #define ARM64_HAS_LSE_ATOMICS                  5
  #define ARM64_WORKAROUND_CAVIUM_23154          6
  #define ARM64_WORKAROUND_834220                        7
-#define ARM64_WORKAROUND_CAVIUM_27456          8
+#define ARM64_HAS_NO_HW_PREFETCH               8
+#define ARM64_HAS_UAO                          9
+#define ARM64_ALT_PAN_NOT_UAO                  10
  
-#define ARM64_NCAPS                            9
+#define ARM64_WORKAROUND_CAVIUM_27456          11
+#define ARM64_HAS_VIRT_HOST_EXTN               12
+#define ARM64_NCAPS                            13
  
  #ifndef __ASSEMBLY__
  
@@ -177,7 +181,7 @@ u64 read_system_reg(u32 id);
  
  static inline bool cpu_supports_mixed_endian_el0(void)
  {
-       return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
+       return id_aa64mmfr0_mixed_endian_el0(read_cpuid(SYS_ID_AA64MMFR0_EL1));
  }
  
  static inline bool system_supports_mixed_endian_el0(void)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h

index 1a5949364ed0f43eee2be4b61c3497fe4fdbbb7b..b3a83da152a7c75d3ae3964bebb143efadcff6e2 100644 (file)
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -32,12 +32,6 @@
  #define MPIDR_AFFINITY_LEVEL(mpidr, level) \
         ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)
  
-#define read_cpuid(reg) ({                                             \
-       u64 __val;                                                      \
-       asm("mrs        %0, " #reg : "=r" (__val));                     \
-       __val;                                                          \
-})
-
  #define MIDR_REVISION_MASK     0xf
  #define MIDR_REVISION(midr)    ((midr) & MIDR_REVISION_MASK)
  #define MIDR_PARTNUM_SHIFT     4
@@ -57,11 +51,22 @@
  #define MIDR_IMPLEMENTOR(midr) \
         (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
  
-#define MIDR_CPU_PART(imp, partnum) \
+#define MIDR_CPU_MODEL(imp, partnum) \
         (((imp)                 << MIDR_IMPLEMENTOR_SHIFT) | \
         (0xf                    << MIDR_ARCHITECTURE_SHIFT) | \
         ((partnum)              << MIDR_PARTNUM_SHIFT))
  
+#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \
+                            MIDR_ARCHITECTURE_MASK)
+
+#define MIDR_IS_CPU_MODEL_RANGE(midr, model, rv_min, rv_max)           \
+({                                                                     \
+       u32 _model = (midr) & MIDR_CPU_MODEL_MASK;                      \
+       u32 rv = (midr) & (MIDR_REVISION_MASK | MIDR_VARIANT_MASK);     \
+                                                                       \
+       _model == (model) && rv >= (rv_min) && rv <= (rv_max);          \
+ })
+
  #define ARM_CPU_IMP_ARM                        0x41
  #define ARM_CPU_IMP_APM                        0x50
  #define ARM_CPU_IMP_CAVIUM             0x43
@@ -75,8 +80,20 @@
  
  #define CAVIUM_CPU_PART_THUNDERX       0x0A1
  
+#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
+#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
+#define MIDR_THUNDERX  MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
+
  #ifndef __ASSEMBLY__
  
+#include <asm/sysreg.h>
+
+#define read_cpuid(reg) ({                                             \
+       u64 __val;                                                      \
+       asm("mrs_s      %0, " __stringify(reg) : "=r" (__val));         \
+       __val;                                                          \
+})
+
  /*
   * The CPU ID never changes at run time, so we might as well tell the
   * compiler that it's constant.  Use this function to read the CPU ID
@@ -84,12 +101,12 @@
   */
  static inline u32 __attribute_const__ read_cpuid_id(void)
  {
-       return read_cpuid(MIDR_EL1);
+       return read_cpuid(SYS_MIDR_EL1);
  }
  
  static inline u64 __attribute_const__ read_cpuid_mpidr(void)
  {
-       return read_cpuid(MPIDR_EL1);
+       return read_cpuid(SYS_MPIDR_EL1);
  }
  
  static inline unsigned int __attribute_const__ read_cpuid_implementor(void)
@@ -104,7 +121,7 @@ static inline unsigned int __attribute_const__ read_cpuid_part_number(void)
  
  static inline u32 __attribute_const__ read_cpuid_cachetype(void)
  {
-       return read_cpuid(CTR_EL0);
+       return read_cpuid(SYS_CTR_EL0);
  }
  #endif /* __ASSEMBLY__ */
  
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h

index 279c85b5ec091eafaa37fbb10d4ff59e2092a605..4b6b3f72a2158a06e07f36529912c81c0b5fb43b 100644 (file)
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -20,6 +20,7 @@
  
  #include <linux/errno.h>
  #include <linux/types.h>
+#include <asm/brk-imm.h>
  #include <asm/esr.h>
  #include <asm/insn.h>
  #include <asm/ptrace.h>
@@ -46,19 +47,6 @@
   */
  #define BREAK_INSTR_SIZE               AARCH64_INSN_SIZE
  
-/*
- * #imm16 values used for BRK instruction generation
- * Allowed values for kgbd are 0x400 - 0x7ff
- * 0x100: for triggering a fault on purpose (reserved)
- * 0x400: for dynamic BRK instruction
- * 0x401: for compile time BRK instruction
- * 0x800: kernel-mode BUG() and WARN() traps
- */
-#define FAULT_BRK_IMM                  0x100
-#define KGDB_DYN_DBG_BRK_IMM           0x400
-#define KGDB_COMPILED_DBG_BRK_IMM      0x401
-#define BUG_BRK_IMM                    0x800
-
  /*
   * BRK instruction encoding
   * The #imm16 value should be placed at bits[20:5] within BRK ins
@@ -78,6 +66,11 @@
  
  #define CACHE_FLUSH_IS_SAFE            1
  
+/* kprobes BRK opcodes with ESR encoding  */
+#define BRK64_ESR_MASK         0xFFFF
+#define BRK64_ESR_KPROBES      0x0004
+#define BRK64_OPCODE_KPROBES   (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5))
+
  /* AArch32 */
  #define DBG_ESR_EVT_BKPT       0x4
  #define DBG_ESR_EVT_VECC       0x5
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h

index 44dd892a4bbea515692ea090823210491425d7d8..7875c886ad24226bea0617649b6e50a7944361bf 100644 (file)
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -24,15 +24,6 @@
  #include <asm/ptrace.h>
  #include <asm/user.h>
  
-typedef unsigned long elf_greg_t;
-
-#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
-#define ELF_CORE_COPY_REGS(dest, regs) \
-       *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
-
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-typedef struct user_fpsimd_state elf_fpregset_t;
-
  /*
   * AArch64 static relocation types.
   */
@@ -86,6 +77,8 @@ typedef struct user_fpsimd_state elf_fpregset_t;
  #define R_AARCH64_MOVW_PREL_G2_NC      292
  #define R_AARCH64_MOVW_PREL_G3         293
  
+#define R_AARCH64_RELATIVE             1027
+
  /*
   * These are used to set parameters in the core dumps.
   */
@@ -127,6 +120,17 @@ typedef struct user_fpsimd_state elf_fpregset_t;
   */
  #define ELF_ET_DYN_BASE        (2 * TASK_SIZE_64 / 3)
  
+#ifndef __ASSEMBLY__
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_pt_regs) / sizeof(elf_greg_t))
+#define ELF_CORE_COPY_REGS(dest, regs) \
+       *(struct user_pt_regs *)&(dest) = (regs)->user_regs;
+
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+typedef struct user_fpsimd_state elf_fpregset_t;
+
  /*
   * When the program starts, a1 contains a pointer to a function to be
   * registered with atexit, as per the SVR4 ABI.  A value of 0 means we have no
@@ -187,4 +191,6 @@ extern int aarch32_setup_vectors_page(struct linux_binprm *bprm,
  
  #endif /* CONFIG_COMPAT */
  
+#endif /* !__ASSEMBLY__ */
+
  #endif
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h

index 309704544d22763d6348095814fbdce935172c1b..1a617d46fce93247cf42fd0cda36a1355fc89aa9 100644 (file)
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -62,6 +62,16 @@ enum fixed_addresses {
  
         FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
         FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+
+       /*
+        * Used for kernel page table creation, so unmapped memory may be used
+        * for tables.
+        */
+       FIX_PTE,
+       FIX_PMD,
+       FIX_PUD,
+       FIX_PGD,
+
         __end_of_fixed_addresses
  };
  
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h

index c5534facf9416fefbec606ad8f320f023f662ce7..3c60f37e48ab51998db2c5870fe2df4427949b37 100644 (file)
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -28,6 +28,8 @@ struct dyn_arch_ftrace {
  
  extern unsigned long ftrace_graph_call;
  
+extern void return_to_handler(void);
+
  static inline unsigned long ftrace_call_adjust(unsigned long addr)
  {
         /*
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h

index 5f3ab8c1db55cca8dbf4c9e1fc315e90335b6d60..f2585cdd32c29832566718e99d7b5fd9c61d2322 100644 (file)
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -42,10 +42,8 @@
  "4:    mov     %w0, %w5\n"                                             \
  "      b       3b\n"                                                   \
  "      .popsection\n"                                                  \
-"      .pushsection __ex_table,\"a\"\n"                                \
-"      .align  3\n"                                                    \
-"      .quad   1b, 4b, 2b, 4b\n"                                       \
-"      .popsection\n"                                                  \
+       _ASM_EXTABLE(1b, 4b)                                            \
+       _ASM_EXTABLE(2b, 4b)                                            \
         ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,            \
                     CONFIG_ARM64_PAN)                                   \
         : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp)       \
@@ -134,10 +132,8 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
  "4:    mov     %w0, %w6\n"
  "      b       3b\n"
  "      .popsection\n"
-"      .pushsection __ex_table,\"a\"\n"
-"      .align  3\n"
-"      .quad   1b, 4b, 2b, 4b\n"
-"      .popsection\n"
+       _ASM_EXTABLE(1b, 4b)
+       _ASM_EXTABLE(2b, 4b)
  ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
         : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
         : "r" (oldval), "r" (newval), "Ir" (-EFAULT)
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h

index a57601f9d17cdffb1122e2864ae5353273eb59ee..8740297dac775dac5bac2bb9260fca62df7d0fb9 100644 (file)
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -20,7 +20,7 @@
  #include <linux/threads.h>
  #include <asm/irq.h>
  
-#define NR_IPI 5
+#define NR_IPI 6
  
  typedef struct {
         unsigned int __softirq_pending;
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h

index bb4052e85dbac913c1670fd622f66a6c47909fd8..bbc1e35aa6014c8ea83a1c06acbea83da250121d 100644 (file)
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
         return *ptep;
  }
  
-static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-                                  pte_t *ptep, pte_t pte)
-{
-       set_pte_at(mm, addr, ptep, pte);
-}
-
-static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
-                                        unsigned long addr, pte_t *ptep)
-{
-       ptep_clear_flush(vma, addr, ptep);
-}
-
-static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
-                                          unsigned long addr, pte_t *ptep)
-{
-       ptep_set_wrprotect(mm, addr, ptep);
-}
  
-static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
-                                           unsigned long addr, pte_t *ptep)
-{
-       return ptep_get_and_clear(mm, addr, ptep);
-}
-
-static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
-                                            unsigned long addr, pte_t *ptep,
-                                            pte_t pte, int dirty)
-{
-       return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-}
  
  static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                           unsigned long addr, unsigned long end,
@@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page)
         clear_bit(PG_dcache_clean, &page->flags);
  }
  
+extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                               struct page *page, int writable);
+#define arch_make_huge_pte arch_make_huge_pte
+extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                           pte_t *ptep, pte_t pte);
+extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+                                     unsigned long addr, pte_t *ptep,
+                                     pte_t pte, int dirty);
+extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+                                    unsigned long addr, pte_t *ptep);
+extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
+                                   unsigned long addr, pte_t *ptep);
+extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
+                                 unsigned long addr, pte_t *ptep);
+
  #endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h

index 30e50eb54a6795aa9c504a2d22dfabd8b8db195a..1dbaa901d7e5d022f19f5bc3ee5e18fe6fa47a73 100644 (file)
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -120,6 +120,29 @@ enum aarch64_insn_register {
         AARCH64_INSN_REG_SP = 31  /* Stack pointer: as load/store base reg */
  };
  
+enum aarch64_insn_special_register {
+       AARCH64_INSN_SPCLREG_SPSR_EL1   = 0xC200,
+       AARCH64_INSN_SPCLREG_ELR_EL1    = 0xC201,
+       AARCH64_INSN_SPCLREG_SP_EL0     = 0xC208,
+       AARCH64_INSN_SPCLREG_SPSEL      = 0xC210,
+       AARCH64_INSN_SPCLREG_CURRENTEL  = 0xC212,
+       AARCH64_INSN_SPCLREG_DAIF       = 0xDA11,
+       AARCH64_INSN_SPCLREG_NZCV       = 0xDA10,
+       AARCH64_INSN_SPCLREG_FPCR       = 0xDA20,
+       AARCH64_INSN_SPCLREG_DSPSR_EL0  = 0xDA28,
+       AARCH64_INSN_SPCLREG_DLR_EL0    = 0xDA29,
+       AARCH64_INSN_SPCLREG_SPSR_EL2   = 0xE200,
+       AARCH64_INSN_SPCLREG_ELR_EL2    = 0xE201,
+       AARCH64_INSN_SPCLREG_SP_EL1     = 0xE208,
+       AARCH64_INSN_SPCLREG_SPSR_INQ   = 0xE218,
+       AARCH64_INSN_SPCLREG_SPSR_ABT   = 0xE219,
+       AARCH64_INSN_SPCLREG_SPSR_UND   = 0xE21A,
+       AARCH64_INSN_SPCLREG_SPSR_FIQ   = 0xE21B,
+       AARCH64_INSN_SPCLREG_SPSR_EL3   = 0xF200,
+       AARCH64_INSN_SPCLREG_ELR_EL3    = 0xF201,
+       AARCH64_INSN_SPCLREG_SP_EL2     = 0xF210
+};
+
  enum aarch64_insn_variant {
         AARCH64_INSN_VARIANT_32BIT,
         AARCH64_INSN_VARIANT_64BIT
@@ -223,8 +246,15 @@ static __always_inline bool aarch64_insn_is_##abbr(u32 code) \
  static __always_inline u32 aarch64_insn_get_##abbr##_value(void) \
  { return (val); }
  
+__AARCH64_INSN_FUNCS(adr_adrp, 0x1F000000, 0x10000000)
+__AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000)
  __AARCH64_INSN_FUNCS(str_reg,  0x3FE0EC00, 0x38206800)
  __AARCH64_INSN_FUNCS(ldr_reg,  0x3FE0EC00, 0x38606800)
+__AARCH64_INSN_FUNCS(ldr_lit,  0xBF000000, 0x18000000)
+__AARCH64_INSN_FUNCS(ldrsw_lit,        0xFF000000, 0x98000000)
+__AARCH64_INSN_FUNCS(exclusive,        0x3F800000, 0x08000000)
+__AARCH64_INSN_FUNCS(load_ex,  0x3F400000, 0x08400000)
+__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000)
  __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000)
  __AARCH64_INSN_FUNCS(ldp_post, 0x7FC00000, 0x28C00000)
  __AARCH64_INSN_FUNCS(stp_pre,  0x7FC00000, 0x29800000)
@@ -273,10 +303,15 @@ __AARCH64_INSN_FUNCS(svc, 0xFFE0001F, 0xD4000001)
  __AARCH64_INSN_FUNCS(hvc,      0xFFE0001F, 0xD4000002)
  __AARCH64_INSN_FUNCS(smc,      0xFFE0001F, 0xD4000003)
  __AARCH64_INSN_FUNCS(brk,      0xFFE0001F, 0xD4200000)
+__AARCH64_INSN_FUNCS(exception,        0xFF000000, 0xD4000000)
  __AARCH64_INSN_FUNCS(hint,     0xFFFFF01F, 0xD503201F)
  __AARCH64_INSN_FUNCS(br,       0xFFFFFC1F, 0xD61F0000)
  __AARCH64_INSN_FUNCS(blr,      0xFFFFFC1F, 0xD63F0000)
  __AARCH64_INSN_FUNCS(ret,      0xFFFFFC1F, 0xD65F0000)
+__AARCH64_INSN_FUNCS(eret,     0xFFFFFFFF, 0xD69F03E0)
+__AARCH64_INSN_FUNCS(mrs,      0xFFF00000, 0xD5300000)
+__AARCH64_INSN_FUNCS(msr_imm,  0xFFF8F01F, 0xD500401F)
+__AARCH64_INSN_FUNCS(msr_reg,  0xFFF00000, 0xD5100000)
  
  #undef __AARCH64_INSN_FUNCS
  
@@ -286,6 +321,8 @@ bool aarch64_insn_is_branch_imm(u32 insn);
  int aarch64_insn_read(void *addr, u32 *insnp);
  int aarch64_insn_write(void *addr, u32 insn);
  enum aarch64_insn_encoding_class aarch64_get_insn_class(u32 insn);
+bool aarch64_insn_uses_literal(u32 insn);
+bool aarch64_insn_is_branch(u32 insn);
  u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn);
  u32 aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
                                   u32 insn, u64 imm);
@@ -367,9 +404,13 @@ bool aarch32_insn_is_wide(u32 insn);
  #define A32_RT_OFFSET  12
  #define A32_RT2_OFFSET  0
  
+u32 aarch64_insn_extract_system_reg(u32 insn);
  u32 aarch32_insn_extract_reg_num(u32 insn, int offset);
  u32 aarch32_insn_mcr_extract_opc2(u32 insn);
  u32 aarch32_insn_mcr_extract_crm(u32 insn);
+
+typedef bool (pstate_check_t)(unsigned long);
+extern pstate_check_t * const aarch32_opcode_cond_checks[16];
  #endif /* __ASSEMBLY__ */
  
  #endif /* __ASM_INSN_H */
diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h

index 8e8d30684392b1065b0c5d1f65e7715e4028331c..b77197d941fc442c4bad04b185d5cf786ca9fea5 100644 (file)
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -1,10 +1,45 @@
  #ifndef __ASM_IRQ_H
  #define __ASM_IRQ_H
  
+#define IRQ_STACK_SIZE                 THREAD_SIZE
+#define IRQ_STACK_START_SP             THREAD_START_SP
+
+#ifndef __ASSEMBLER__
+
+#include <linux/percpu.h>
+
  #include <asm-generic/irq.h>
+#include <asm/thread_info.h>
  
  struct pt_regs;
  
+DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack);
+
+/*
+ * The highest address on the stack, and the first to be used. Used to
+ * find the dummy-stack frame put down by el?_irq() in entry.S, which
+ * is structured as follows:
+ *
+ *       ------------
+ *       |          |  <- irq_stack_ptr
+ *   top ------------
+ *       |   x19    | <- irq_stack_ptr - 0x08
+ *       ------------
+ *       |   x29    | <- irq_stack_ptr - 0x10
+ *       ------------
+ *
+ * where x19 holds a copy of the task stack pointer where the struct pt_regs
+ * from kernel_entry can be found.
+ *
+ */
+#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP)
+
+/*
+ * The offset from irq_stack_ptr where entry.S will store the original
+ * stack pointer. Used by unwind_frame() and dump_backtrace().
+ */
+#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08)))
+
  extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
  
  static inline int nr_legacy_irqs(void)
@@ -12,4 +47,14 @@ static inline int nr_legacy_irqs(void)
         return 0;
  }
  
+static inline bool on_irq_stack(unsigned long sp, int cpu)
+{
+       /* variable names the same as kernel/stacktrace.c */
+       unsigned long low = (unsigned long)per_cpu(irq_stack, cpu);
+       unsigned long high = low + IRQ_STACK_START_SP;
+
+       return (low <= sp && sp <= high);
+}
+
+#endif /* !__ASSEMBLER__ */
  #endif
diff --git a/arch/arm64/include/asm/kasan.h b/arch/arm64/include/asm/kasan.h

index 2774fa384c47f27b4e936644cef0e980f7fbcfe7..71ad0f93eb7153226a43d78e909d3e7c3690bf92 100644 (file)
--- a/arch/arm64/include/asm/kasan.h
+++ b/arch/arm64/include/asm/kasan.h
@@ -7,13 +7,14 @@
  
  #include <linux/linkage.h>
  #include <asm/memory.h>
+#include <asm/pgtable-types.h>
  
  /*
   * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
   * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses.
   */
  #define KASAN_SHADOW_START      (VA_START)
-#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1UL << (VA_BITS - 3)))
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
  
  /*
   * This value is used to map an address to the corresponding shadow
@@ -28,10 +29,12 @@
  #define KASAN_SHADOW_OFFSET     (KASAN_SHADOW_END - (1ULL << (64 - 3)))
  
  void kasan_init(void);
+void kasan_copy_shadow(pgd_t *pgdir);
  asmlinkage void kasan_early_init(void);
  
  #else
  static inline void kasan_init(void) { }
+static inline void kasan_copy_shadow(pgd_t *pgdir) { }
  #endif
  
  #endif
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h

index a459714ee29e38fbf81f2061026933736ed1cbfc..5c6375d8528bb8ddd313bfa2911f7a0d77819028 100644 (file)
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -79,5 +79,17 @@
  #define SWAPPER_MM_MMUFLAGS    (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
  #endif
  
+/*
+ * To make optimal use of block mappings when laying out the linear
+ * mapping, round down the base of physical memory to a size that can
+ * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
+ * (64k granule), or a multiple that can be mapped using contiguous bits
+ * in the page tables: 32 * PMD_SIZE (16k granule)
+ */
+#ifdef CONFIG_ARM64_64K_PAGES
+#define ARM64_MEMSTART_ALIGN   SZ_512M
+#else
+#define ARM64_MEMSTART_ALIGN   SZ_1G
+#endif
  
  #endif /* __ASM_KERNEL_PGTABLE_H */
diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h

new file mode 100644 (file)

index 0000000..1737aec
--- /dev/null
+++ b/arch/arm64/include/asm/kprobes.h
@@ -0,0 +1,60 @@
+/*
+ * arch/arm64/include/asm/kprobes.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KPROBES_H
+#define _ARM_KPROBES_H
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE                  1
+
+#define flush_insn_slot(p)             do { } while (0)
+#define kretprobe_blacklist_size       0
+
+#include <asm/probes.h>
+
+struct prev_kprobe {
+       struct kprobe *kp;
+       unsigned int status;
+};
+
+/* Single step context for kprobe */
+struct kprobe_step_ctx {
+       unsigned long ss_pending;
+       unsigned long match_addr;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+       unsigned int kprobe_status;
+       unsigned long saved_irqflag;
+       struct prev_kprobe prev_kprobe;
+       struct kprobe_step_ctx ss_ctx;
+       struct pt_regs jprobe_saved_regs;
+};
+
+void arch_remove_kprobe(struct kprobe *);
+int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
+int kprobe_exceptions_notify(struct notifier_block *self,
+                            unsigned long val, void *data);
+int kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr);
+int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr);
+void kretprobe_trampoline(void);
+void __kprobes *trampoline_probe_handler(struct pt_regs *regs);
+
+#endif /* _ARM_KPROBES_H */
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h

index 2d960f8588b0639f80c272f833c3796fd7c286b6..8b709f53f87423ef55f2422204439dc4f63cd86f 100644 (file)
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -83,17 +83,6 @@
  #define HCR_INT_OVERRIDE   (HCR_FMO | HCR_IMO)
  
  
-/* Hyp System Control Register (SCTLR_EL2) bits */
-#define SCTLR_EL2_EE   (1 << 25)
-#define SCTLR_EL2_WXN  (1 << 19)
-#define SCTLR_EL2_I    (1 << 12)
-#define SCTLR_EL2_SA   (1 << 3)
-#define SCTLR_EL2_C    (1 << 2)
-#define SCTLR_EL2_A    (1 << 1)
-#define SCTLR_EL2_M    1
-#define SCTLR_EL2_FLAGS        (SCTLR_EL2_M | SCTLR_EL2_A | SCTLR_EL2_C |      \
-                        SCTLR_EL2_SA | SCTLR_EL2_I)
-
  /* TCR_EL2 Registers bits */
  #define TCR_EL2_RES1   ((1 << 31) | (1 << 23))
  #define TCR_EL2_TBI    (1 << 20)
@@ -123,6 +112,7 @@
  #define VTCR_EL2_SL0_LVL1      (1 << 6)
  #define VTCR_EL2_T0SZ_MASK     0x3f
  #define VTCR_EL2_T0SZ_40B      24
+#define VTCR_EL2_VS            19
  
  /*
   * We configure the Stage-2 page tables to always restrict the IPA space to be
@@ -167,7 +157,7 @@
  #define VTTBR_BADDR_SHIFT (VTTBR_X - 1)
  #define VTTBR_BADDR_MASK  (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_BADDR_SHIFT)
  #define VTTBR_VMID_SHIFT  (UL(48))
-#define VTTBR_VMID_MASK          (UL(0xFF) << VTTBR_VMID_SHIFT)
+#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
  
  /* Hyp System Trap Register */
  #define HSTR_EL2_T(x)  (1 << x)
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h

index 5e377101f91948f9ee711bea3f7659c86010301c..36a30c80032d46b8bd51b0aaee3eb2f5c11599e5 100644 (file)
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -20,94 +20,38 @@
  
  #include <asm/virt.h>
  
-/*
- * 0 is reserved as an invalid value.
- * Order *must* be kept in sync with the hyp switch code.
- */
-#define        MPIDR_EL1       1       /* MultiProcessor Affinity Register */
-#define        CSSELR_EL1      2       /* Cache Size Selection Register */
-#define        SCTLR_EL1       3       /* System Control Register */
-#define        ACTLR_EL1       4       /* Auxiliary Control Register */
-#define        CPACR_EL1       5       /* Coprocessor Access Control */
-#define        TTBR0_EL1       6       /* Translation Table Base Register 0 */
-#define        TTBR1_EL1       7       /* Translation Table Base Register 1 */
-#define        TCR_EL1         8       /* Translation Control Register */
-#define        ESR_EL1         9       /* Exception Syndrome Register */
-#define        AFSR0_EL1       10      /* Auxilary Fault Status Register 0 */
-#define        AFSR1_EL1       11      /* Auxilary Fault Status Register 1 */
-#define        FAR_EL1         12      /* Fault Address Register */
-#define        MAIR_EL1        13      /* Memory Attribute Indirection Register */
-#define        VBAR_EL1        14      /* Vector Base Address Register */
-#define        CONTEXTIDR_EL1  15      /* Context ID Register */
-#define        TPIDR_EL0       16      /* Thread ID, User R/W */
-#define        TPIDRRO_EL0     17      /* Thread ID, User R/O */
-#define        TPIDR_EL1       18      /* Thread ID, Privileged */
-#define        AMAIR_EL1       19      /* Aux Memory Attribute Indirection Register */
-#define        CNTKCTL_EL1     20      /* Timer Control Register (EL1) */
-#define        PAR_EL1         21      /* Physical Address Register */
-#define MDSCR_EL1      22      /* Monitor Debug System Control Register */
-#define MDCCINT_EL1    23      /* Monitor Debug Comms Channel Interrupt Enable Reg */
-
-/* 32bit specific registers. Keep them at the end of the range */
-#define        DACR32_EL2      24      /* Domain Access Control Register */
-#define        IFSR32_EL2      25      /* Instruction Fault Status Register */
-#define        FPEXC32_EL2     26      /* Floating-Point Exception Control Register */
-#define        DBGVCR32_EL2    27      /* Debug Vector Catch Register */
-#define        NR_SYS_REGS     28
-
-/* 32bit mapping */
-#define c0_MPIDR       (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
-#define c0_CSSELR      (CSSELR_EL1 * 2)/* Cache Size Selection Register */
-#define c1_SCTLR       (SCTLR_EL1 * 2) /* System Control Register */
-#define c1_ACTLR       (ACTLR_EL1 * 2) /* Auxiliary Control Register */
-#define c1_CPACR       (CPACR_EL1 * 2) /* Coprocessor Access Control */
-#define c2_TTBR0       (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
-#define c2_TTBR0_high  (c2_TTBR0 + 1)  /* TTBR0 top 32 bits */
-#define c2_TTBR1       (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
-#define c2_TTBR1_high  (c2_TTBR1 + 1)  /* TTBR1 top 32 bits */
-#define c2_TTBCR       (TCR_EL1 * 2)   /* Translation Table Base Control R. */
-#define c3_DACR                (DACR32_EL2 * 2)/* Domain Access Control Register */
-#define c5_DFSR                (ESR_EL1 * 2)   /* Data Fault Status Register */
-#define c5_IFSR                (IFSR32_EL2 * 2)/* Instruction Fault Status Register */
-#define c5_ADFSR       (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
-#define c5_AIFSR       (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
-#define c6_DFAR                (FAR_EL1 * 2)   /* Data Fault Address Register */
-#define c6_IFAR                (c6_DFAR + 1)   /* Instruction Fault Address Register */
-#define c7_PAR         (PAR_EL1 * 2)   /* Physical Address Register */
-#define c7_PAR_high    (c7_PAR + 1)    /* PAR top 32 bits */
-#define c10_PRRR       (MAIR_EL1 * 2)  /* Primary Region Remap Register */
-#define c10_NMRR       (c10_PRRR + 1)  /* Normal Memory Remap Register */
-#define c12_VBAR       (VBAR_EL1 * 2)  /* Vector Base Address Register */
-#define c13_CID                (CONTEXTIDR_EL1 * 2)    /* Context ID Register */
-#define c13_TID_URW    (TPIDR_EL0 * 2) /* Thread ID, User R/W */
-#define c13_TID_URO    (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
-#define c13_TID_PRIV   (TPIDR_EL1 * 2) /* Thread ID, Privileged */
-#define c10_AMAIR0     (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
-#define c10_AMAIR1     (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
-#define c14_CNTKCTL    (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
-
-#define cp14_DBGDSCRext        (MDSCR_EL1 * 2)
-#define cp14_DBGBCR0   (DBGBCR0_EL1 * 2)
-#define cp14_DBGBVR0   (DBGBVR0_EL1 * 2)
-#define cp14_DBGBXVR0  (cp14_DBGBVR0 + 1)
-#define cp14_DBGWCR0   (DBGWCR0_EL1 * 2)
-#define cp14_DBGWVR0   (DBGWVR0_EL1 * 2)
-#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
-
-#define NR_COPRO_REGS  (NR_SYS_REGS * 2)
-
  #define ARM_EXCEPTION_IRQ        0
  #define ARM_EXCEPTION_TRAP       1
+/* The hyp-stub will return this for any kvm_call_hyp() call */
+#define ARM_EXCEPTION_HYP_GONE   2
  
  #define KVM_ARM64_DEBUG_DIRTY_SHIFT    0
  #define KVM_ARM64_DEBUG_DIRTY          (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT)
  
+#define kvm_ksym_ref(sym)              phys_to_virt((u64)&sym - kimage_voffset)
+
  #ifndef __ASSEMBLY__
+#if __GNUC__ > 4
+#define kvm_ksym_shift                 (PAGE_OFFSET - KIMAGE_VADDR)
+#else
+/*
+ * GCC versions 4.9 and older will fold the constant below into the addend of
+ * the reference to 'sym' above if kvm_ksym_shift is declared static or if the
+ * constant is used directly. However, since we use the small code model for
+ * the core kernel, the reference to 'sym' will be emitted as an adrp/add pair,
+ * with a +/- 4 GB range, resulting in linker relocation errors if the shift
+ * is sufficiently large. So prevent the compiler from folding the shift into
+ * the addend, by making the shift a variable with external linkage.
+ */
+__weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR;
+#endif
+
  struct kvm;
  struct kvm_vcpu;
  
  extern char __kvm_hyp_init[];
  extern char __kvm_hyp_init_end[];
+extern char __kvm_hyp_reset[];
  
  extern char __kvm_hyp_vector[];
  
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h

index 25a40213bd9b87cb6802ecfd4204dc66c41e5850..3066328cd86b69a91274e0cb841059b428666140 100644 (file)
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -26,7 +26,6 @@
  
  #include <asm/esr.h>
  #include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
  #include <asm/kvm_mmio.h>
  #include <asm/ptrace.h>
  #include <asm/cputype.h>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h

index a35ce7266aac3688fa6460bace61f90477448aa6..3be7a7b52d809fa6eee7a3f1b4d2ec516b5697f4 100644 (file)
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -25,7 +25,6 @@
  #include <linux/types.h>
  #include <linux/kvm_types.h>
  #include <asm/kvm.h>
-#include <asm/kvm_asm.h>
  #include <asm/kvm_mmio.h>
  
  #define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -45,6 +44,7 @@
  int __attribute_const__ kvm_target_cpu(void);
  int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
  int kvm_arch_dev_ioctl_check_extension(long ext);
+phys_addr_t kvm_hyp_reset_entry(void);
  
  struct kvm_arch {
         /* The VMID generation used for the virt. memory system */
@@ -85,6 +85,86 @@ struct kvm_vcpu_fault_info {
         u64 hpfar_el2;          /* Hyp IPA Fault Address Register */
  };
  
+/*
+ * 0 is reserved as an invalid value.
+ * Order should be kept in sync with the save/restore code.
+ */
+enum vcpu_sysreg {
+       __INVALID_SYSREG__,     /* 0: reserved invalid value */
+       MPIDR_EL1,      /* MultiProcessor Affinity Register */
+       CSSELR_EL1,     /* Cache Size Selection Register */
+       SCTLR_EL1,      /* System Control Register */
+       ACTLR_EL1,      /* Auxiliary Control Register */
+       CPACR_EL1,      /* Coprocessor Access Control */
+       TTBR0_EL1,      /* Translation Table Base Register 0 */
+       TTBR1_EL1,      /* Translation Table Base Register 1 */
+       TCR_EL1,        /* Translation Control Register */
+       ESR_EL1,        /* Exception Syndrome Register */
+       AFSR0_EL1,      /* Auxiliary Fault Status Register 0 */
+       AFSR1_EL1,      /* Auxiliary Fault Status Register 1 */
+       FAR_EL1,        /* Fault Address Register */
+       MAIR_EL1,       /* Memory Attribute Indirection Register */
+       VBAR_EL1,       /* Vector Base Address Register */
+       CONTEXTIDR_EL1, /* Context ID Register */
+       TPIDR_EL0,      /* Thread ID, User R/W */
+       TPIDRRO_EL0,    /* Thread ID, User R/O */
+       TPIDR_EL1,      /* Thread ID, Privileged */
+       AMAIR_EL1,      /* Aux Memory Attribute Indirection Register */
+       CNTKCTL_EL1,    /* Timer Control Register (EL1) */
+       PAR_EL1,        /* Physical Address Register */
+       MDSCR_EL1,      /* Monitor Debug System Control Register */
+       MDCCINT_EL1,    /* Monitor Debug Comms Channel Interrupt Enable Reg */
+
+       /* 32bit specific registers. Keep them at the end of the range */
+       DACR32_EL2,     /* Domain Access Control Register */
+       IFSR32_EL2,     /* Instruction Fault Status Register */
+       FPEXC32_EL2,    /* Floating-Point Exception Control Register */
+       DBGVCR32_EL2,   /* Debug Vector Catch Register */
+
+       NR_SYS_REGS     /* Nothing after this line! */
+};
+
+/* 32bit mapping */
+#define c0_MPIDR       (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
+#define c0_CSSELR      (CSSELR_EL1 * 2)/* Cache Size Selection Register */
+#define c1_SCTLR       (SCTLR_EL1 * 2) /* System Control Register */
+#define c1_ACTLR       (ACTLR_EL1 * 2) /* Auxiliary Control Register */
+#define c1_CPACR       (CPACR_EL1 * 2) /* Coprocessor Access Control */
+#define c2_TTBR0       (TTBR0_EL1 * 2) /* Translation Table Base Register 0 */
+#define c2_TTBR0_high  (c2_TTBR0 + 1)  /* TTBR0 top 32 bits */
+#define c2_TTBR1       (TTBR1_EL1 * 2) /* Translation Table Base Register 1 */
+#define c2_TTBR1_high  (c2_TTBR1 + 1)  /* TTBR1 top 32 bits */
+#define c2_TTBCR       (TCR_EL1 * 2)   /* Translation Table Base Control R. */
+#define c3_DACR                (DACR32_EL2 * 2)/* Domain Access Control Register */
+#define c5_DFSR                (ESR_EL1 * 2)   /* Data Fault Status Register */
+#define c5_IFSR                (IFSR32_EL2 * 2)/* Instruction Fault Status Register */
+#define c5_ADFSR       (AFSR0_EL1 * 2) /* Auxiliary Data Fault Status R */
+#define c5_AIFSR       (AFSR1_EL1 * 2) /* Auxiliary Instr Fault Status R */
+#define c6_DFAR                (FAR_EL1 * 2)   /* Data Fault Address Register */
+#define c6_IFAR                (c6_DFAR + 1)   /* Instruction Fault Address Register */
+#define c7_PAR         (PAR_EL1 * 2)   /* Physical Address Register */
+#define c7_PAR_high    (c7_PAR + 1)    /* PAR top 32 bits */
+#define c10_PRRR       (MAIR_EL1 * 2)  /* Primary Region Remap Register */
+#define c10_NMRR       (c10_PRRR + 1)  /* Normal Memory Remap Register */
+#define c12_VBAR       (VBAR_EL1 * 2)  /* Vector Base Address Register */
+#define c13_CID                (CONTEXTIDR_EL1 * 2)    /* Context ID Register */
+#define c13_TID_URW    (TPIDR_EL0 * 2) /* Thread ID, User R/W */
+#define c13_TID_URO    (TPIDRRO_EL0 * 2)/* Thread ID, User R/O */
+#define c13_TID_PRIV   (TPIDR_EL1 * 2) /* Thread ID, Privileged */
+#define c10_AMAIR0     (AMAIR_EL1 * 2) /* Aux Memory Attr Indirection Reg */
+#define c10_AMAIR1     (c10_AMAIR0 + 1)/* Aux Memory Attr Indirection Reg */
+#define c14_CNTKCTL    (CNTKCTL_EL1 * 2) /* Timer Control Register (PL1) */
+
+#define cp14_DBGDSCRext        (MDSCR_EL1 * 2)
+#define cp14_DBGBCR0   (DBGBCR0_EL1 * 2)
+#define cp14_DBGBVR0   (DBGBVR0_EL1 * 2)
+#define cp14_DBGBXVR0  (cp14_DBGBVR0 + 1)
+#define cp14_DBGWCR0   (DBGWCR0_EL1 * 2)
+#define cp14_DBGWVR0   (DBGWVR0_EL1 * 2)
+#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
+
+#define NR_COPRO_REGS  (NR_SYS_REGS * 2)
+
  struct kvm_cpu_context {
         struct kvm_regs gp_regs;
         union {
@@ -222,7 +302,7 @@ static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
  struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
  struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
  
-u64 kvm_call_hyp(void *hypfn, ...);
+u64 __kvm_call_hyp(void *hypfn, ...);
  void force_vm_exit(const cpumask_t *mask);
  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
  
@@ -243,11 +323,25 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
          * Call initialization code, and switch to the full blown
          * HYP code.
          */
-       kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
-                    hyp_stack_ptr, vector_ptr);
+       __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
+                      hyp_stack_ptr, vector_ptr);
+}
+
+static inline void __cpu_init_stage2(void)
+{
+}
+
+static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+                                       phys_addr_t phys_idmap_start)
+{
+       /*
+        * Call reset code, and switch back to stub hyp vectors.
+        * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
+        */
+       __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
+                      boot_pgd_ptr, phys_idmap_start);
  }
  
-static inline void kvm_arch_hardware_disable(void) {}
  static inline void kvm_arch_hardware_unsetup(void) {}
  static inline void kvm_arch_sync_events(struct kvm *kvm) {}
  static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
@@ -258,4 +352,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
  
+#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
+
  #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h

index 889c908ee631b526594b5dfc32ef5dfde15480df..fe612a9625766b5fff3698e2d1014d7376744e4f 100644 (file)
--- a/arch/arm64/include/asm/kvm_mmio.h
+++ b/arch/arm64/include/asm/kvm_mmio.h
@@ -19,7 +19,6 @@
  #define __ARM64_KVM_MMIO_H__
  
  #include <linux/kvm_host.h>
-#include <asm/kvm_asm.h>
  #include <asm/kvm_arm.h>
  
  /*
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h

index 61505676d0853bb65710fc9ad7746db8f58e4658..342a5ac2f3da238da956eae467a37558a672c07a 100644 (file)
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -20,6 +20,7 @@
  
  #include <asm/page.h>
  #include <asm/memory.h>
+#include <asm/cpufeature.h>
  
  /*
   * As we only have the TTBR0_EL2 register, we cannot express
@@ -98,6 +99,7 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
  phys_addr_t kvm_mmu_get_httbr(void);
  phys_addr_t kvm_mmu_get_boot_httbr(void);
  phys_addr_t kvm_get_idmap_vector(void);
+phys_addr_t kvm_get_idmap_start(void);
  int kvm_mmu_init(void);
  void kvm_clear_hyp_idmap(void);
  
@@ -158,7 +160,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
  #define PTRS_PER_S2_PGD_SHIFT  (KVM_PHYS_SHIFT - PGDIR_SHIFT)
  #endif
  #define PTRS_PER_S2_PGD                (1 << PTRS_PER_S2_PGD_SHIFT)
-#define S2_PGD_ORDER           get_order(PTRS_PER_S2_PGD * sizeof(pgd_t))
  
  #define kvm_pgd_index(addr)    (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
  
@@ -302,5 +303,16 @@ static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
         merged_hyp_pgd[idmap_idx] = __pgd(__pa(boot_hyp_pgd) | PMD_TYPE_TABLE);
  }
  
+/*
+ * Return the VMID width in bits: 16 when the VMIDBITS field of
+ * ID_AA64MMFR1_EL1 reads as 2, and 8 for any other field value.
+ */
+static inline unsigned int kvm_get_vmid_bits(void)
+{
+       u64 reg = read_system_reg(SYS_ID_AA64MMFR1_EL1);
+
+       return (cpuid_feature_extract_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
+}
+
  #endif /* __ASSEMBLY__ */
  #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h

index 3de42d68611df91ba6d46e32c197f700bb52bf52..23acc00be32d019a9f0f71b75153b5b32996b083 100644 (file)
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -26,6 +26,7 @@ __asm__(".arch_extension      lse");
  
  /* Macro for constructing calls to out-of-line ll/sc atomics */
  #define __LL_SC_CALL(op)       "bl\t" __stringify(__LL_SC_PREFIX(op)) "\n"
+#define __LL_SC_CLOBBERS       "x16", "x17", "x30"
  
  /* In-line patching at runtime */
  #define ARM64_LSE_ATOMIC_INSN(llsc, lse)                               \
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h

index 853953cd1f0813fd562b68b7cdddd95582f1e392..d776037d199fa4ca0257ba6cf5beda36d001d6e2 100644 (file)
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -24,6 +24,7 @@
  #include <linux/compiler.h>
  #include <linux/const.h>
  #include <linux/types.h>
+#include <asm/bug.h>
  #include <asm/sizes.h>
  
  /*
@@ -45,15 +46,15 @@
   * VA_START - the first kernel virtual address.
   * TASK_SIZE - the maximum size of a user space task.
   * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
- * The module space lives between the addresses given by TASK_SIZE
- * and PAGE_OFFSET - it must be within 128MB of the kernel text.
   */
  #define VA_BITS                        (CONFIG_ARM64_VA_BITS)
  #define VA_START               (UL(0xffffffffffffffff) << VA_BITS)
  #define PAGE_OFFSET            (UL(0xffffffffffffffff) << (VA_BITS - 1))
-#define MODULES_END            (PAGE_OFFSET)
-#define MODULES_VADDR          (MODULES_END - SZ_64M)
-#define PCI_IO_END             (MODULES_VADDR - SZ_2M)
+#define KIMAGE_VADDR           (MODULES_END)
+#define MODULES_END            (MODULES_VADDR + MODULES_VSIZE)
+#define MODULES_VADDR          (VA_START + KASAN_SHADOW_SIZE)
+#define MODULES_VSIZE          (SZ_128M)
+#define PCI_IO_END             (PAGE_OFFSET - SZ_2M)
  #define PCI_IO_START           (PCI_IO_END - PCI_IO_SIZE)
  #define FIXADDR_TOP            (PCI_IO_START - SZ_2M)
  #define TASK_SIZE_64           (UL(1) << VA_BITS)
@@ -70,13 +71,31 @@
  
  #define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 4))
  
+#define KERNEL_START      _text
+#define KERNEL_END        _end
+
+/*
+ * The size of the KASAN shadow region. This should be 1/8th of the
+ * size of the entire kernel virtual address space.
+ */
+#ifdef CONFIG_KASAN
+#define KASAN_SHADOW_SIZE      (UL(1) << (VA_BITS - 3))
+#else
+#define KASAN_SHADOW_SIZE      (0)
+#endif
+
  /*
   * Physical vs virtual RAM address space conversion.  These are
   * private definitions which should NOT be used outside memory.h
   * files.  Use virt_to_phys/phys_to_virt/__pa/__va instead.
   */
-#define __virt_to_phys(x)      (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET))
-#define __phys_to_virt(x)      ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))
+#define __virt_to_phys(x) ({                                           \
+       phys_addr_t __x = (phys_addr_t)(x);                             \
+       __x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET :   \
+                                (__x - kimage_voffset); })
+
+#define __phys_to_virt(x)      ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
+#define __phys_to_kimg(x)      ((unsigned long)((x) + kimage_voffset))
  
  /*
   * Convert a page to/from a physical address
@@ -100,19 +119,40 @@
  #define MT_S2_NORMAL           0xf
  #define MT_S2_DEVICE_nGnRE     0x1
  
+#ifdef CONFIG_ARM64_4K_PAGES
+#define IOREMAP_MAX_ORDER      (PUD_SHIFT)
+#else
+#define IOREMAP_MAX_ORDER      (PMD_SHIFT)
+#endif
+
+#ifdef CONFIG_BLK_DEV_INITRD
+#define __early_init_dt_declare_initrd(__start, __end)                 \
+       do {                                                            \
+               initrd_start = (__start);                               \
+               initrd_end = (__end);                                   \
+       } while (0)
+#endif
+
  #ifndef __ASSEMBLY__
  
-extern phys_addr_t             memstart_addr;
+#include <linux/bitops.h>
+#include <linux/mmdebug.h>
+
+extern s64                     memstart_addr;
  /* PHYS_OFFSET - the physical address of the start of memory. */
-#define PHYS_OFFSET            ({ memstart_addr; })
+#define PHYS_OFFSET            ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
+
+/* the virtual base of the kernel image (minus TEXT_OFFSET) */
+extern u64                     kimage_vaddr;
+
+/* the offset between the kernel virtual and physical mappings */
+extern u64                     kimage_voffset;
  
  /*
- * The maximum physical address that the linear direct mapping
- * of system RAM can cover. (PAGE_OFFSET can be interpreted as
- * a 2's complement signed quantity and negated to derive the
- * maximum size of the linear mapping.)
+ * Allow all memory at the discovery stage. We will clip it later.
   */
-#define MAX_MEMBLOCK_ADDR      ({ memstart_addr - PAGE_OFFSET - 1; })
+#define MIN_MEMBLOCK_ADDR      0
+#define MAX_MEMBLOCK_ADDR      U64_MAX
  
  /*
   * PFNs are used to describe any physical page; this means
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h

index 24165784b8038b732ea568d1e74fd8c0a699b914..a00f7cf35bbd4d80ce045bfeb0cbb6bd061aeaaa 100644 (file)
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -27,6 +27,7 @@
  #include <asm-generic/mm_hooks.h>
  #include <asm/cputype.h>
  #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
  
  #ifdef CONFIG_PID_IN_CONTEXTIDR
  static inline void contextidr_thread_switch(struct task_struct *next)
@@ -48,7 +49,7 @@ static inline void contextidr_thread_switch(struct task_struct *next)
   */
  static inline void cpu_set_reserved_ttbr0(void)
  {
-       unsigned long ttbr = page_to_phys(empty_zero_page);
+       unsigned long ttbr = virt_to_phys(empty_zero_page);
  
         asm(
         "       msr     ttbr0_el1, %0                   // set TTBR0\n"
@@ -73,7 +74,7 @@ static inline bool __cpu_uses_extended_idmap(void)
  /*
   * Set TCR.T0SZ to its default value (based on VA_BITS)
   */
-static inline void cpu_set_default_tcr_t0sz(void)
+static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
  {
         unsigned long tcr;
  
@@ -86,7 +87,62 @@ static inline void cpu_set_default_tcr_t0sz(void)
         "       msr     tcr_el1, %0     ;"
         "       isb"
         : "=&r" (tcr)
-       : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH));
+       : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH));
+}
+
+#define cpu_set_default_tcr_t0sz()     __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS))
+#define cpu_set_idmap_tcr_t0sz()       __cpu_set_tcr_t0sz(idmap_t0sz)
+
+/*
+ * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm.
+ *
+ * The idmap lives in the same VA range as userspace, but uses global entries
+ * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from
+ * speculative TLB fetches, we must temporarily install the reserved page
+ * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ.
+ *
+ * If current is not a user task, the mm covers the TTBR1_EL1 page tables,
+ * which should not be installed in TTBR0_EL1. In this case we can leave the
+ * reserved page tables in place.
+ */
+static inline void cpu_uninstall_idmap(void)
+{
+       struct mm_struct *mm = current->active_mm;
+
+       cpu_set_reserved_ttbr0();
+       local_flush_tlb_all();
+       cpu_set_default_tcr_t0sz();
+
+       if (mm != &init_mm)
+               cpu_switch_mm(mm->pgd, mm);
+}
+
+static inline void cpu_install_idmap(void)
+{
+       cpu_set_reserved_ttbr0();
+       local_flush_tlb_all();
+       cpu_set_idmap_tcr_t0sz();
+
+       cpu_switch_mm(idmap_pg_dir, &init_mm);
+}
+
+/*
+ * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
+ * avoiding the possibility of conflicting TLB entries being allocated.
+ */
+static inline void cpu_replace_ttbr1(pgd_t *pgd)
+{
+       typedef void (ttbr_replace_func)(phys_addr_t);
+       extern ttbr_replace_func idmap_cpu_replace_ttbr1;
+       ttbr_replace_func *replace_phys;
+
+       phys_addr_t pgd_phys = virt_to_phys(pgd);
+
+       replace_phys = (void *)virt_to_phys(idmap_cpu_replace_ttbr1);
+
+       cpu_install_idmap();
+       replace_phys(pgd_phys);
+       cpu_uninstall_idmap();
  }
  
  /*
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h

index e80e232b730e2d29f77dba57bfa91ad4439f9781..e12af6754634b3d2aa031ae23ce25228dc766cfb 100644 (file)
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -20,4 +20,21 @@
  
  #define MODULE_ARCH_VERMAGIC   "aarch64"
  
+#ifdef CONFIG_ARM64_MODULE_PLTS
+struct mod_arch_specific {
+       struct elf64_shdr       *plt;
+       int                     plt_num_entries;
+       int                     plt_max_entries;
+};
+#endif
+
+u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela,
+                         Elf64_Sym *sym);
+
+#ifdef CONFIG_RANDOMIZE_BASE
+extern u64 module_alloc_base;
+#else
+#define module_alloc_base      ((u64)_etext - MODULES_VSIZE)
+#endif
+
  #endif /* __ASM_MODULE_H */
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h

index 9b2f5a9d019df493fa6021ee3ca6b4779401d8c4..fbafd0ad16df768fa7966a769014db5020294c41 100644 (file)
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -19,6 +19,8 @@
  #ifndef __ASM_PAGE_H
  #define __ASM_PAGE_H
  
+#include <linux/const.h>
+
  /* PAGE_SHIFT determines the page size */
  /* CONT_SHIFT determines the number of pages which can be tracked together  */
  #ifdef CONFIG_ARM64_64K_PAGES
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h

index c15053902942e0a3a34ba4882545c5b6d163c23c..ff98585d085aa5737c9c17478555f22b11261f2f 100644 (file)
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -42,11 +42,20 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
         free_page((unsigned long)pmd);
  }
  
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
  {
-       set_pud(pud, __pud(__pa(pmd) | PMD_TYPE_TABLE));
+       set_pud(pud, __pud(pmd | prot));
  }
  
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+       __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE);
+}
+#else
+static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
+{
+       BUILD_BUG();
+}
  #endif /* CONFIG_PGTABLE_LEVELS > 2 */
  
  #if CONFIG_PGTABLE_LEVELS > 3
@@ -62,11 +71,20 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
         free_page((unsigned long)pud);
  }
  
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
  {
-       set_pgd(pgd, __pgd(__pa(pud) | PUD_TYPE_TABLE));
+       set_pgd(pgdp, __pgd(pud | prot));
  }
  
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+       __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE);
+}
+#else
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
+{
+       BUILD_BUG();
+}
  #endif /* CONFIG_PGTABLE_LEVELS > 3 */
  
  extern pgd_t *pgd_alloc(struct mm_struct *mm);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h

index b9da9545b442cecde8119f1d2f10625483c575e0..9786f770088df41e919921b3a18024f045bfd707 100644 (file)
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -90,7 +90,23 @@
  /*
   * Contiguous page definitions.
   */
-#define CONT_PTES              (_AC(1, UL) << CONT_SHIFT)
+#ifdef CONFIG_ARM64_64K_PAGES
+#define CONT_PTE_SHIFT         5
+#define CONT_PMD_SHIFT         5
+#elif defined(CONFIG_ARM64_16K_PAGES)
+#define CONT_PTE_SHIFT         7
+#define CONT_PMD_SHIFT         5
+#else
+#define CONT_PTE_SHIFT         4
+#define CONT_PMD_SHIFT         4
+#endif
+
+#define CONT_PTES              (1 << CONT_PTE_SHIFT)
+#define CONT_PTE_SIZE          (CONT_PTES * PAGE_SIZE)
+#define CONT_PTE_MASK          (~(CONT_PTE_SIZE - 1))
+#define CONT_PMDS              (1 << CONT_PMD_SHIFT)
+#define CONT_PMD_SIZE          (CONT_PMDS * PMD_SIZE)
+#define CONT_PMD_MASK          (~(CONT_PMD_SIZE - 1))
  /* the the numerical offset of the PTE within a range of CONT_PTES */
  #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))
  
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h

index 67c2ad6d33b7b91914ef56af3ca04f67ffb1b48c..9a09ccf7122dc529c5e508b341f268596b614501 100644 (file)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -36,19 +36,13 @@
   *
   * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array
   *     (rounded up to PUD_SIZE).
- * VMALLOC_START: beginning of the kernel VA space
+ * VMALLOC_START: beginning of the kernel vmalloc space
   * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space,
   *     fixed mappings and modules
   */
  #define VMEMMAP_SIZE           ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
  
-#ifndef CONFIG_KASAN
-#define VMALLOC_START          (VA_START)
-#else
-#include <asm/kasan.h>
-#define VMALLOC_START          (KASAN_SHADOW_END + SZ_64K)
-#endif
-
+#define VMALLOC_START          (MODULES_END)
  #define VMALLOC_END            (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
  
  #define VMEMMAP_START          (VMALLOC_END + SZ_64K)
@@ -59,6 +53,7 @@
  
  #ifndef __ASSEMBLY__
  
+#include <asm/fixmap.h>
  #include <linux/mmdebug.h>
  
  extern void __pte_error(const char *file, int line, unsigned long val);
@@ -123,8 +118,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val);
   * ZERO_PAGE is a global shared page that is always zero: used
   * for zero-mapped memory areas etc..
   */
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr)       (empty_zero_page)
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr)       virt_to_page(empty_zero_page)
  
  #define pte_ERROR(pte)         __pte_error(__FILE__, __LINE__, pte_val(pte))
  
@@ -136,16 +131,6 @@ extern struct page *empty_zero_page;
  #define pte_clear(mm,addr,ptep)        set_pte(ptep, __pte(0))
  #define pte_page(pte)          (pfn_to_page(pte_pfn(pte)))
  
-/* Find an entry in the third-level page table. */
-#define pte_index(addr)                (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-
-#define pte_offset_kernel(dir,addr)    (pmd_page_vaddr(*(dir)) + pte_index(addr))
-
-#define pte_offset_map(dir,addr)       pte_offset_kernel((dir), (addr))
-#define pte_offset_map_nested(dir,addr)        pte_offset_kernel((dir), (addr))
-#define pte_unmap(pte)                 do { } while (0)
-#define pte_unmap_nested(pte)          do { } while (0)
-
  /*
   * The following only work if pte_present(). Undefined behaviour otherwise.
   */
@@ -168,6 +153,16 @@ extern struct page *empty_zero_page;
  #define pte_valid(pte)         (!!(pte_val(pte) & PTE_VALID))
  #define pte_valid_not_user(pte) \
         ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID)
+#define pte_valid_young(pte) \
+       ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF))
+
+/*
+ * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
+ * so that we don't erroneously return false for pages that have been
+ * remapped as PROT_NONE but are yet to be flushed from the TLB.
+ */
+#define pte_accessible(mm, pte)        \
+       (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte))
  
  static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
  {
@@ -218,7 +213,8 @@ static inline pte_t pte_mkspecial(pte_t pte)
  
  static inline pte_t pte_mkcont(pte_t pte)
  {
-       return set_pte_bit(pte, __pgprot(PTE_CONT));
+       pte = set_pte_bit(pte, __pgprot(PTE_CONT));
+       return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
  }
  
  static inline pte_t pte_mknoncont(pte_t pte)
@@ -226,6 +222,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
         return clear_pte_bit(pte, __pgprot(PTE_CONT));
  }
  
+static inline pmd_t pmd_mkcont(pmd_t pmd)
+{
+       return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
+}
+
  static inline void set_pte(pte_t *ptep, pte_t pte)
  {
         *ptep = pte;
@@ -299,7 +300,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
  /*
   * Hugetlb definitions.
   */
-#define HUGE_MAX_HSTATE                2
+#define HUGE_MAX_HSTATE                4
  #define HPAGE_SHIFT            PMD_SHIFT
  #define HPAGE_SIZE             (_AC(1, UL) << HPAGE_SHIFT)
  #define HPAGE_MASK             (~(HPAGE_SIZE - 1))
@@ -354,6 +355,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
  #define pmd_mksplitting(pmd)   pte_pmd(pte_mkspecial(pmd_pte(pmd)))
  #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
  #define pmd_mknotpresent(pmd)  (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
@@ -425,13 +427,31 @@ static inline void pmd_clear(pmd_t *pmdp)
         set_pmd(pmdp, __pmd(0));
  }
  
-static inline pte_t *pmd_page_vaddr(pmd_t pmd)
+static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
  {
-       return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK);
+       return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
  }
  
+/* Find an entry in the third-level page table. */
+#define pte_index(addr)                (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+
+#define pte_offset_phys(dir,addr)      (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t))
+#define pte_offset_kernel(dir,addr)    ((pte_t *)__va(pte_offset_phys((dir), (addr))))
+
+#define pte_offset_map(dir,addr)       pte_offset_kernel((dir), (addr))
+#define pte_offset_map_nested(dir,addr)        pte_offset_kernel((dir), (addr))
+#define pte_unmap(pte)                 do { } while (0)
+#define pte_unmap_nested(pte)          do { } while (0)
+
+#define pte_set_fixmap(addr)           ((pte_t *)set_fixmap_offset(FIX_PTE, addr))
+#define pte_set_fixmap_offset(pmd, addr)       pte_set_fixmap(pte_offset_phys(pmd, addr))
+#define pte_clear_fixmap()             clear_fixmap(FIX_PTE)
+
  #define pmd_page(pmd)          pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
  
+/* use ONLY for statically allocated translation tables */
+#define pte_offset_kimg(dir,addr)      ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
+
  /*
   * Conversion functions: convert a page and protection to a page entry,
   * and a page entry and page directory to the page they refer to.
@@ -458,21 +478,37 @@ static inline void pud_clear(pud_t *pudp)
         set_pud(pudp, __pud(0));
  }
  
-static inline pmd_t *pud_page_vaddr(pud_t pud)
+static inline phys_addr_t pud_page_paddr(pud_t pud)
  {
-       return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK);
+       return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
  }
  
  /* Find an entry in the second-level page table. */
  #define pmd_index(addr)                (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
  
-static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
-{
-       return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
-}
+#define pmd_offset_phys(dir, addr)     (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t))
+#define pmd_offset(dir, addr)          ((pmd_t *)__va(pmd_offset_phys((dir), (addr))))
+
+#define pmd_set_fixmap(addr)           ((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
+#define pmd_set_fixmap_offset(pud, addr)       pmd_set_fixmap(pmd_offset_phys(pud, addr))
+#define pmd_clear_fixmap()             clear_fixmap(FIX_PMD)
  
  #define pud_page(pud)          pfn_to_page(__phys_to_pfn(pud_val(pud) & PHYS_MASK))
  
+/* use ONLY for statically allocated translation tables */
+#define pmd_offset_kimg(dir,addr)      ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
+
+#else
+
+#define pud_page_paddr(pud)    ({ BUILD_BUG(); 0; })
+
+/* Match pmd_offset folding in <asm-generic/pgtable-nopmd.h> */
+#define pmd_set_fixmap(addr)           NULL
+#define pmd_set_fixmap_offset(pudp, addr)      ((pmd_t *)(pudp))
+#define pmd_clear_fixmap()
+
+#define pmd_offset_kimg(dir,addr)      ((pmd_t *)(dir))
+
  #endif /* CONFIG_PGTABLE_LEVELS > 2 */
  
  #if CONFIG_PGTABLE_LEVELS > 3
@@ -494,21 +530,37 @@ static inline void pgd_clear(pgd_t *pgdp)
         set_pgd(pgdp, __pgd(0));
  }
  
-static inline pud_t *pgd_page_vaddr(pgd_t pgd)
+static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
  {
-       return __va(pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK);
+       return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
  }
  
  /* Find an entry in the frst-level page table. */
  #define pud_index(addr)                (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
  
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
-{
-       return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(addr);
-}
+#define pud_offset_phys(dir, addr)     (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t))
+#define pud_offset(dir, addr)          ((pud_t *)__va(pud_offset_phys((dir), (addr))))
+
+#define pud_set_fixmap(addr)           ((pud_t *)set_fixmap_offset(FIX_PUD, addr))
+#define pud_set_fixmap_offset(pgd, addr)       pud_set_fixmap(pud_offset_phys(pgd, addr))
+#define pud_clear_fixmap()             clear_fixmap(FIX_PUD)
  
  #define pgd_page(pgd)          pfn_to_page(__phys_to_pfn(pgd_val(pgd) & PHYS_MASK))
  
+/* use ONLY for statically allocated translation tables */
+#define pud_offset_kimg(dir,addr)      ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr))))
+
+#else
+
+#define pgd_page_paddr(pgd)    ({ BUILD_BUG(); 0;})
+
+/* Match pud_offset folding in <asm-generic/pgtable-nopud.h> */
+#define pud_set_fixmap(addr)           NULL
+#define pud_set_fixmap_offset(pgdp, addr)      ((pud_t *)(pgdp))
+#define pud_clear_fixmap()
+
+#define pud_offset_kimg(dir,addr)      ((pud_t *)(dir))
+
  #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
  
  #define pgd_ERROR(pgd)         __pgd_error(__FILE__, __LINE__, pgd_val(pgd))
@@ -516,11 +568,16 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
  /* to find an entry in a page-table-directory */
  #define pgd_index(addr)                (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
  
-#define pgd_offset(mm, addr)   ((mm)->pgd+pgd_index(addr))
+#define pgd_offset_raw(pgd, addr)      ((pgd) + pgd_index(addr))
+
+#define pgd_offset(mm, addr)   (pgd_offset_raw((mm)->pgd, (addr)))
  
  /* to find an entry in a kernel page-table-directory */
  #define pgd_offset_k(addr)     pgd_offset(&init_mm, addr)
  
+#define pgd_set_fixmap(addr)   ((pgd_t *)set_fixmap_offset(FIX_PGD, addr))
+#define pgd_clear_fixmap()     clear_fixmap(FIX_PGD)
+
  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  {
         const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
@@ -681,7 +738,8 @@ extern int kern_addr_valid(unsigned long addr);
  
  #include <asm-generic/pgtable.h>
  
-#define pgtable_cache_init() do { } while (0)
+void pgd_cache_init(void);
+#define pgtable_cache_init     pgd_cache_init
  
  /*
   * On AArch64, the cache coherency is handled via the set_pte_at() function.
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h

new file mode 100644 (file)

index 0000000..5af574d
--- /dev/null
+++ b/arch/arm64/include/asm/probes.h
@@ -0,0 +1,35 @@
+/*
+ * arch/arm64/include/asm/probes.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef _ARM_PROBES_H
+#define _ARM_PROBES_H
+
+#include <asm/opcodes.h>
+
+struct kprobe;
+struct arch_specific_insn;
+
+typedef u32 kprobe_opcode_t;
+typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *);
+
+/* architecture specific copy of original instruction */
+struct arch_specific_insn {
+       kprobe_opcode_t *insn;
+       pstate_check_t *pstate_cc;
+       kprobes_handler_t *handler;
+       /* restore address after step xol */
+       unsigned long restore;
+};
+
+#endif
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h

index d085595289271bbcc663c70129ab7649de1dd3a8..4be934fde40906cac5d3b6e6f2c34ea8f35d695a 100644 (file)
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -29,8 +29,10 @@
  
  #include <linux/string.h>
  
+#include <asm/alternative.h>
  #include <asm/fpsimd.h>
  #include <asm/hw_breakpoint.h>
+#include <asm/lse.h>
  #include <asm/pgtable-hwdef.h>
  #include <asm/ptrace.h>
  #include <asm/types.h>
@@ -177,9 +179,11 @@ static inline void prefetchw(const void *ptr)
  }
  
  #define ARCH_HAS_SPINLOCK_PREFETCH
-static inline void spin_lock_prefetch(const void *x)
+static inline void spin_lock_prefetch(const void *ptr)
  {
-       prefetchw(x);
+       asm volatile(ARM64_LSE_ATOMIC_INSN(
+                    "prfm pstl1strm, %a0",
+                    "nop") : : "p" (ptr));
  }
  
  #define HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -187,5 +191,6 @@ static inline void spin_lock_prefetch(const void *x)
  #endif
  
  int cpu_enable_pan(void *__unused);
+int cpu_enable_uao(void *__unused);
  
  #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h

index 7f94755089e200afbd4c015316da4517fe113bcd..1528d52eb8c0db9365df4e85c91b13524db3375b 100644 (file)
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -121,6 +121,8 @@ struct pt_regs {
         u64 unused;     // maintain 16 byte alignment
  };
  
+#define MAX_REG_OFFSET offsetof(struct pt_regs, pstate)
+
  #define arch_has_single_step() (1)
  
  #ifdef CONFIG_COMPAT
@@ -146,9 +148,57 @@ struct pt_regs {
  #define fast_interrupts_enabled(regs) \
         (!((regs)->pstate & PSR_F_BIT))
  
-#define user_stack_pointer(regs) \
+#define GET_USP(regs) \
         (!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp)
  
+#define SET_USP(ptregs, value) \
+       (!compat_user_mode(ptregs) ? ((ptregs)->sp = (value)) : ((ptregs)->compat_sp = (value)))
+
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+                                              unsigned int n);
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:      pt_regs from which register value is gotten
+ * @offset:    offset of the register.
+ *
+ * regs_get_register returns the value of the register at @offset within @regs.
+ * The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline u64 regs_get_register(struct pt_regs *regs, unsigned int offset)
+{
+       u64 val = 0;
+
+       offset >>= 3;
+       switch (offset) {
+       case 0 ... 30:
+               val = regs->regs[offset];
+               break;
+       case offsetof(struct pt_regs, sp) >> 3:
+               val = regs->sp;
+               break;
+       case offsetof(struct pt_regs, pc) >> 3:
+               val = regs->pc;
+               break;
+       case offsetof(struct pt_regs, pstate) >> 3:
+               val = regs->pstate;
+               break;
+       default:
+               val = 0;
+       }
+
+       return val;
+}
+
+/* Valid only for Kernel mode traps. */
+static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+       return regs->sp;
+}
+
  static inline unsigned long regs_return_value(struct pt_regs *regs)
  {
         return regs->regs[0];
@@ -158,8 +208,15 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
  struct task_struct;
  int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task);
  
-#define instruction_pointer(regs)      ((unsigned long)(regs)->pc)
+#define GET_IP(regs)           ((unsigned long)(regs)->pc)
+#define SET_IP(regs, value)    ((regs)->pc = ((u64) (value)))
+
+#define GET_FP(ptregs)         ((unsigned long)(ptregs)->regs[29])
+#define SET_FP(ptregs, value)  ((ptregs)->regs[29] = ((u64) (value)))
+
+#include <asm-generic/ptrace.h>
  
+#undef profile_pc
  extern unsigned long profile_pc(struct pt_regs *regs);
  
  #endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h

index 4df608a8459e27c657055edd5ca519b34e755b80..e368a55ebd22d0c3dab4a690afdd33d62c76ef73 100644 (file)
--- a/arch/arm64/include/asm/shmparam.h
+++ b/arch/arm64/include/asm/shmparam.h
@@ -21,7 +21,7 @@
   * alignment value. Since we don't have aliasing D-caches, the rest of
   * the time we can safely use PAGE_SIZE.
   */
-#define COMPAT_SHMLBA  0x4000
+#define COMPAT_SHMLBA  (4 * PAGE_SIZE)
  
  #include <asm-generic/shmparam.h>
  
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h

index d9c3d6a6100ac5d68e9b412113daccd1e43d8371..2013a4dc5124a55c5c304306b41908f16a0e5d64 100644 (file)
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -64,6 +64,15 @@ extern void secondary_entry(void);
  extern void arch_send_call_function_single_ipi(int cpu);
  extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
  
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
+#else
+static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+{
+       BUILD_BUG();
+}
+#endif
+
  extern int __cpu_disable(void);
  
  extern void __cpu_die(unsigned int cpu);
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h

index 499e8de33a0004c42c170de2fc007d3224908ac3..53ee219e76a735a8824df8b8b472ffb9febef29e 100644 (file)
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -26,9 +26,28 @@
   * The memory barriers are implicit with the load-acquire and store-release
   * instructions.
   */
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+       unsigned int tmp;
+       arch_spinlock_t lockval;
  
-#define arch_spin_unlock_wait(lock) \
-       do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+       asm volatile(
+"      sevl\n"
+"1:    wfe\n"
+"2:    ldaxr   %w0, %2\n"
+"      eor     %w1, %w0, %w0, ror #16\n"
+"      cbnz    %w1, 1b\n"
+       ARM64_LSE_ATOMIC_INSN(
+       /* LL/SC */
+"      stxr    %w1, %w0, %2\n"
+"      cbnz    %w1, 2b\n", /* Serialise against any concurrent lockers */
+       /* LSE atomics */
+"      nop\n"
+"      nop\n")
+       : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
+       :
+       : "memory");
+}
  
  #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
  
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h

index 7318f6d54aa949ca906a990cce357a4cdd1358db..801a16dbbdf622d5239cf61be4f67eccab5c1e09 100644 (file)
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -16,14 +16,19 @@
  #ifndef __ASM_STACKTRACE_H
  #define __ASM_STACKTRACE_H
  
+struct task_struct;
+
  struct stackframe {
         unsigned long fp;
         unsigned long sp;
         unsigned long pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       unsigned int graph;
+#endif
  };
  
-extern int unwind_frame(struct stackframe *frame);
-extern void walk_stackframe(struct stackframe *frame,
+extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
+extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
                             int (*fn)(struct stackframe *, void *), void *data);
  
  #endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h

index 59a5b0f1e81c3274f3c1c5c0ffb4ce4014ce35c2..024d623f662e588c845f47abd752bfba2470ed2f 100644 (file)
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -1,7 +1,8 @@
  #ifndef __ASM_SUSPEND_H
  #define __ASM_SUSPEND_H
  
-#define NR_CTX_REGS 11
+#define NR_CTX_REGS 10
+#define NR_CALLEE_SAVED_REGS 12
  
  /*
   * struct cpu_suspend_ctx must be 16-byte aligned since it is allocated on
@@ -16,11 +17,34 @@ struct cpu_suspend_ctx {
         u64 sp;
  } __aligned(16);
  
-struct sleep_save_sp {
-       phys_addr_t *save_ptr_stash;
-       phys_addr_t save_ptr_stash_phys;
+/*
+ * Memory to save the cpu state is allocated on the stack by
+ * __cpu_suspend_enter()'s caller, and populated by __cpu_suspend_enter().
+ * This data must survive until cpu_resume() is called.
+ *
+ * This struct describes the size and the layout of the saved cpu state.
+ * The layout of the callee_saved_regs is defined by the implementation
+ * of __cpu_suspend_enter(), and cpu_resume(). This struct must be passed
+ * in by the caller as __cpu_suspend_enter()'s stack-frame is gone once it
+ * returns, and the data would be subsequently corrupted by the call to the
+ * finisher.
+ */
+struct sleep_stack_data {
+       struct cpu_suspend_ctx  system_regs;
+       unsigned long           callee_saved_regs[NR_CALLEE_SAVED_REGS];
  };
  
+extern unsigned long *sleep_save_stash;
+
  extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long));
  extern void cpu_resume(void);
+int __cpu_suspend_enter(struct sleep_stack_data *state);
+void __cpu_suspend_exit(void);
+void _cpu_resume(void);
+
+int swsusp_arch_suspend(void);
+int swsusp_arch_resume(void);
+int arch_hibernation_header_save(void *addr, unsigned int max_size);
+int arch_hibernation_header_restore(void *addr);
+
  #endif
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h

index d48ab5b41f521c23819c0b3927a9ea0f73db242c..0961a24e8d4891bdf976f7cf1d39a39cd31cc229 100644 (file)
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -20,6 +20,8 @@
  #ifndef __ASM_SYSREG_H
  #define __ASM_SYSREG_H
  
+#include <linux/stringify.h>
+
  #include <asm/opcodes.h>
  
  /*
@@ -70,20 +72,35 @@
  
  #define SYS_ID_AA64MMFR0_EL1           sys_reg(3, 0, 0, 7, 0)
  #define SYS_ID_AA64MMFR1_EL1           sys_reg(3, 0, 0, 7, 1)
+#define SYS_ID_AA64MMFR2_EL1           sys_reg(3, 0, 0, 7, 2)
  
  #define SYS_CNTFRQ_EL0                 sys_reg(3, 3, 14, 0, 0)
  #define SYS_CTR_EL0                    sys_reg(3, 3, 0, 0, 1)
  #define SYS_DCZID_EL0                  sys_reg(3, 3, 0, 0, 7)
  
  #define REG_PSTATE_PAN_IMM             sys_reg(0, 0, 4, 0, 4)
+#define REG_PSTATE_UAO_IMM             sys_reg(0, 0, 4, 0, 3)
  
  #define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\
                                      (!!x)<<8 | 0x1f)
+#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\
+                                    (!!x)<<8 | 0x1f)
+
+/* Common SCTLR_ELx flags. */
+#define SCTLR_ELx_EE    (1 << 25)
+#define SCTLR_ELx_I    (1 << 12)
+#define SCTLR_ELx_SA   (1 << 3)
+#define SCTLR_ELx_C    (1 << 2)
+#define SCTLR_ELx_A    (1 << 1)
+#define SCTLR_ELx_M    1
  
-/* SCTLR_EL1 */
-#define SCTLR_EL1_CP15BEN      (0x1 << 5)
-#define SCTLR_EL1_SED          (0x1 << 8)
-#define SCTLR_EL1_SPAN         (0x1 << 23)
+#define SCTLR_ELx_FLAGS        (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \
+                        SCTLR_ELx_SA | SCTLR_ELx_I)
+
+/* SCTLR_EL1 specific flags. */
+#define SCTLR_EL1_SPAN         (1 << 23)
+#define SCTLR_EL1_SED          (1 << 8)
+#define SCTLR_EL1_CP15BEN      (1 << 5)
  
  
  /* id_aa64isar0 */
@@ -135,6 +152,9 @@
  #define ID_AA64MMFR1_VMIDBITS_SHIFT    4
  #define ID_AA64MMFR1_HADBS_SHIFT       0
  
+/* id_aa64mmfr2 */
+#define ID_AA64MMFR2_UAO_SHIFT         4
+
  /* id_aa64dfr0 */
  #define ID_AA64DFR0_CTX_CMPS_SHIFT     28
  #define ID_AA64DFR0_WRPS_SHIFT         20
@@ -194,32 +214,34 @@
  #ifdef __ASSEMBLY__
  
         .irp    num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
-       .equ    __reg_num_x\num, \num
+       .equ    .L__reg_num_x\num, \num
         .endr
-       .equ    __reg_num_xzr, 31
+       .equ    .L__reg_num_xzr, 31
  
         .macro  mrs_s, rt, sreg
-       .inst   0xd5200000|(\sreg)|(__reg_num_\rt)
+       .inst   0xd5200000|(\sreg)|(.L__reg_num_\rt)
         .endm
  
         .macro  msr_s, sreg, rt
-       .inst   0xd5000000|(\sreg)|(__reg_num_\rt)
+       .inst   0xd5000000|(\sreg)|(.L__reg_num_\rt)
         .endm
  
  #else
  
+#include <linux/types.h>
+
  asm(
  "      .irp    num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n"
-"      .equ    __reg_num_x\\num, \\num\n"
+"      .equ    .L__reg_num_x\\num, \\num\n"
  "      .endr\n"
-"      .equ    __reg_num_xzr, 31\n"
+"      .equ    .L__reg_num_xzr, 31\n"
  "\n"
  "      .macro  mrs_s, rt, sreg\n"
-"      .inst   0xd5200000|(\\sreg)|(__reg_num_\\rt)\n"
+"      .inst   0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n"
  "      .endm\n"
  "\n"
  "      .macro  msr_s, sreg, rt\n"
-"      .inst   0xd5000000|(\\sreg)|(__reg_num_\\rt)\n"
+"      .inst   0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n"
  "      .endm\n"
  );
  
@@ -232,6 +254,23 @@ static inline void config_sctlr_el1(u32 clear, u32 set)
         val |= set;
         asm volatile("msr sctlr_el1, %0" : : "r" (val));
  }
+
+/*
+ * Unlike read_cpuid, calls to read_sysreg are never expected to be
+ * optimized away or replaced with synthetic values.
+ */
+#define read_sysreg(r) ({                                      \
+       u64 __val;                                              \
+       asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
+       __val;                                                  \
+})
+
+#define write_sysreg(v, r) do {                                        \
+       u64 __val = (u64)(v);                                   \
+       asm volatile("msr " __stringify(r) ", %0"               \
+                    : : "r" (__val));                          \
+} while (0)
+
  #endif
  
  #endif /* __ASM_SYSREG_H */
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h

index 90c7ff233735d7691bf3b34b7075dffd07dbf939..abd64bd1f6d9f0160a3122555cf23be1a30f87eb 100644 (file)
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp");
   */
  static inline struct thread_info *current_thread_info(void) __attribute_const__;
  
+/*
+ * struct thread_info can be accessed directly via sp_el0.
+ */
  static inline struct thread_info *current_thread_info(void)
  {
-       return (struct thread_info *)
-               (current_stack_pointer & ~(THREAD_SIZE - 1));
+       unsigned long sp_el0;
+
+       asm ("mrs %0, sp_el0" : "=r" (sp_el0));
+
+       return (struct thread_info *)sp_el0;
  }
  
  #define thread_saved_pc(tsk)   \
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h

index b2ede967fe7d49258c56af51e73786efb5d1a420..c3d445b42351e1a529d94c4ba44e9c4d37544821 100644 (file)
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -36,11 +36,11 @@
  #define VERIFY_WRITE 1
  
  /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of pairs of relative offsets: the first
+ * is the relative offset to an instruction that is allowed to fault,
+ * and the second is the relative offset at which the program should
+ * continue. No registers are modified, so it is entirely up to the
+ * continuation code to figure out what to do.
   *
   * All the routines below use bits of fixup code that are out of line
   * with the main instruction path.  This means when everything is well,
@@ -50,9 +50,11 @@
  
  struct exception_table_entry
  {
-       unsigned long insn, fixup;
+       int insn, fixup;
  };
  
+#define ARCH_HAS_RELATIVE_EXTABLE
+
  extern int fixup_exception(struct pt_regs *regs);
  
  #define KERNEL_DS      (-1UL)
@@ -64,6 +66,16 @@ extern int fixup_exception(struct pt_regs *regs);
  static inline void set_fs(mm_segment_t fs)
  {
         current_thread_info()->addr_limit = fs;
+
+       /*
+        * Enable/disable UAO so that copy_to_user() etc can access
+        * kernel memory with the unprivileged instructions.
+        */
+       if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS)
+               asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+       else
+               asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO,
+                               CONFIG_ARM64_UAO));
  }
  
  #define segment_eq(a, b)       ((a) == (b))
@@ -105,6 +117,12 @@ static inline void set_fs(mm_segment_t fs)
  #define access_ok(type, addr, size)    __range_ok(addr, size)
  #define user_addr_max                  get_fs
  
+#define _ASM_EXTABLE(from, to)                                         \
+       "       .pushsection    __ex_table, \"a\"\n"                    \
+       "       .align          3\n"                                    \
+       "       .long           (" #from " - .), (" #to " - .)\n"       \
+       "       .popsection\n"
+
  /*
   * The "__xxx" versions of the user access functions do not verify the address
   * space - it must have been done previously with a separate "access_ok()"
@@ -113,9 +131,10 @@ static inline void set_fs(mm_segment_t fs)
   * The "__xxx_error" versions set the third argument to -EFAULT if an error
   * occurs, and leave it unchanged on success.
   */
-#define __get_user_asm(instr, reg, x, addr, err)                       \
+#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature)   \
         asm volatile(                                                   \
-       "1:     " instr "       " reg "1, [%2]\n"                       \
+       "1:"ALTERNATIVE(instr "     " reg "1, [%2]\n",                  \
+                       alt_instr " " reg "1, [%2]\n", feature)         \
         "2:\n"                                                          \
         "       .section .fixup, \"ax\"\n"                              \
         "       .align  2\n"                                            \
@@ -123,10 +142,7 @@ static inline void set_fs(mm_segment_t fs)
         "       mov     %1, #0\n"                                       \
         "       b       2b\n"                                           \
         "       .previous\n"                                            \
-       "       .section __ex_table,\"a\"\n"                            \
-       "       .align  3\n"                                            \
-       "       .quad   1b, 3b\n"                                       \
-       "       .previous"                                              \
+       _ASM_EXTABLE(1b, 3b)                                            \
         : "+r" (err), "=&r" (x)                                         \
         : "r" (addr), "i" (-EFAULT))
  
@@ -134,26 +150,30 @@ static inline void set_fs(mm_segment_t fs)
  do {                                                                   \
         unsigned long __gu_val;                                         \
         __chk_user_ptr(ptr);                                            \
-       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,        \
+       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
                         CONFIG_ARM64_PAN));                             \
         switch (sizeof(*(ptr))) {                                       \
         case 1:                                                         \
-               __get_user_asm("ldrb", "%w", __gu_val, (ptr), (err));   \
+               __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr),  \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 2:                                                         \
-               __get_user_asm("ldrh", "%w", __gu_val, (ptr), (err));   \
+               __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr),  \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 4:                                                         \
-               __get_user_asm("ldr", "%w", __gu_val, (ptr), (err));    \
+               __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr),    \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 8:                                                         \
-               __get_user_asm("ldr", "%",  __gu_val, (ptr), (err));    \
+               __get_user_asm("ldr", "ldtr", "%",  __gu_val, (ptr),    \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         default:                                                        \
                 BUILD_BUG();                                            \
         }                                                               \
         (x) = (__force __typeof__(*(ptr)))__gu_val;                     \
-       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,        \
+       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
                         CONFIG_ARM64_PAN));                             \
  } while (0)
  
@@ -181,19 +201,17 @@ do {                                                                      \
                 ((x) = 0, -EFAULT);                                     \
  })
  
-#define __put_user_asm(instr, reg, x, addr, err)                       \
+#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature)   \
         asm volatile(                                                   \
-       "1:     " instr "       " reg "1, [%2]\n"                       \
+       "1:"ALTERNATIVE(instr "     " reg "1, [%2]\n",                  \
+                       alt_instr " " reg "1, [%2]\n", feature)         \
         "2:\n"                                                          \
         "       .section .fixup,\"ax\"\n"                               \
         "       .align  2\n"                                            \
         "3:     mov     %w0, %3\n"                                      \
         "       b       2b\n"                                           \
         "       .previous\n"                                            \
-       "       .section __ex_table,\"a\"\n"                            \
-       "       .align  3\n"                                            \
-       "       .quad   1b, 3b\n"                                       \
-       "       .previous"                                              \
+       _ASM_EXTABLE(1b, 3b)                                            \
         : "+r" (err)                                                    \
         : "r" (x), "r" (addr), "i" (-EFAULT))
  
@@ -201,25 +219,29 @@ do {                                                                      \
  do {                                                                   \
         __typeof__(*(ptr)) __pu_val = (x);                              \
         __chk_user_ptr(ptr);                                            \
-       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,        \
+       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\
                         CONFIG_ARM64_PAN));                             \
         switch (sizeof(*(ptr))) {                                       \
         case 1:                                                         \
-               __put_user_asm("strb", "%w", __pu_val, (ptr), (err));   \
+               __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr),  \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 2:                                                         \
-               __put_user_asm("strh", "%w", __pu_val, (ptr), (err));   \
+               __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr),  \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 4:                                                         \
-               __put_user_asm("str",  "%w", __pu_val, (ptr), (err));   \
+               __put_user_asm("str", "sttr", "%w", __pu_val, (ptr),    \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         case 8:                                                         \
-               __put_user_asm("str",  "%", __pu_val, (ptr), (err));    \
+               __put_user_asm("str", "sttr", "%", __pu_val, (ptr),     \
+                              (err), ARM64_HAS_UAO);                   \
                 break;                                                  \
         default:                                                        \
                 BUILD_BUG();                                            \
         }                                                               \
-       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,        \
+       asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\
                         CONFIG_ARM64_PAN));                             \
  } while (0)
  
@@ -247,24 +269,39 @@ do {                                                                      \
                 -EFAULT;                                                \
  })
  
-extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
  extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
  extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
  
+static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+       check_object_size(to, n, false);
+       return __arch_copy_from_user(to, from, n);
+}
+
+static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       check_object_size(from, n, true);
+       return __arch_copy_to_user(to, from, n);
+}
+
  static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
  {
-       if (access_ok(VERIFY_READ, from, n))
-               n = __copy_from_user(to, from, n);
-       else /* security hole - plug it */
+       if (access_ok(VERIFY_READ, from, n)) {
+               check_object_size(to, n, false);
+               n = __arch_copy_from_user(to, from, n);
+       } else /* security hole - plug it */
                 memset(to, 0, n);
         return n;
  }
  
  static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
  {
-       if (access_ok(VERIFY_WRITE, to, n))
-               n = __copy_to_user(to, from, n);
+       if (access_ok(VERIFY_WRITE, to, n)) {
+               check_object_size(from, n, true);
+               n = __arch_copy_to_user(to, from, n);
+       }
         return n;
  }
  
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h

index 7a5df5252dd736e4038e04e40510351820056183..06e6a5238c4c47b41a9567cf7c004c7dae78b888 100644 (file)
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -18,11 +18,29 @@
  #ifndef __ASM__VIRT_H
  #define __ASM__VIRT_H
  
+/*
+ * The arm64 hcall implementation uses x0 to specify the hcall type. A value
+ * less than 0xfff indicates a special hcall, such as get/set vector.
+ * Any other value is used as a pointer to the function to call.
+ */
+
+/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */
+#define HVC_GET_VECTORS 0
+
+/*
+ * HVC_SET_VECTORS - Set the value of the vbar_el2 register.
+ *
+ * @x1: Physical address of the new vector table.
+ */
+#define HVC_SET_VECTORS 1
+
  #define BOOT_CPU_MODE_EL1      (0xe11)
  #define BOOT_CPU_MODE_EL2      (0xe12)
  
  #ifndef __ASSEMBLY__
  
+#include <asm/ptrace.h>
+
  /*
   * __boot_cpu_mode records what mode CPUs were booted in.
   * A correctly-implemented bootloader must start all CPUs in the same mode:
@@ -50,6 +68,14 @@ static inline bool is_hyp_mode_mismatched(void)
         return __boot_cpu_mode[0] != __boot_cpu_mode[1];
  }
  
+static inline bool is_kernel_in_hyp_mode(void)
+{
+       u64 el;
+
+       asm("mrs %0, CurrentEL" : "=r" (el));
+       return el == CurrentEL_EL2;
+}
+
  /* The section containing the hypervisor text */
  extern char __hyp_text_start[];
  extern char __hyp_text_end[];
diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h

index aab5bf09e9d902f7bdf3d09e61fcd97db748c505..2b79b8a89457bd70e20e24ba33c8029195e3284c 100644 (file)
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -16,6 +16,8 @@
  #ifndef __ASM_WORD_AT_A_TIME_H
  #define __ASM_WORD_AT_A_TIME_H
  
+#include <asm/uaccess.h>
+
  #ifndef __AARCH64EB__
  
  #include <linux/kernel.h>
@@ -81,10 +83,7 @@ static inline unsigned long load_unaligned_zeropad(const void *addr)
  #endif
         "       b       2b\n"
         "       .popsection\n"
-       "       .pushsection __ex_table,\"a\"\n"
-       "       .align  3\n"
-       "       .quad   1b, 3b\n"
-       "       .popsection"
+       _ASM_EXTABLE(1b, 3b)
         : "=&r" (ret), "=&r" (offset)
         : "r" (addr), "Q" (*(unsigned long *)addr));
  
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h

index 3378238b5d8ba2d66ac8202c492a28e0730f71a0..d1ff83dfe5deae580cdb0ebf93f25b32a63f5a74 100644 (file)
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -45,6 +45,7 @@
  #define PSR_A_BIT      0x00000100
  #define PSR_D_BIT      0x00000200
  #define PSR_PAN_BIT    0x00400000
+#define PSR_UAO_BIT    0x00800000
  #define PSR_Q_BIT      0x08000000
  #define PSR_V_BIT      0x10000000
  #define PSR_C_BIT      0x20000000
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile

index 474691f8b13ab893cf403b8c1737a91aeff87bc0..20bcc2db06bfab77cc0793af96b7ce051d321e10 100644 (file)
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,10 +14,10 @@ CFLAGS_REMOVE_return_address.o = -pg
  arm64-obj-y            := debug-monitors.o entry.o irq.o fpsimd.o              \
                            entry-fpsimd.o process.o ptrace.o setup.o signal.o   \
                            sys.o stacktrace.o time.o traps.o io.o vdso.o        \
-                          hyp-stub.o psci.o psci-call.o cpu_ops.o insn.o       \
+                          hyp-stub.o psci.o cpu_ops.o insn.o   \
                            return_address.o cpuinfo.o cpu_errata.o              \
                            cpufeature.o alternative.o cacheinfo.o               \
-                          smp.o smp_spin_table.o topology.o
+                          smp.o smp_spin_table.o topology.o smccc-call.o
  
  extra-$(CONFIG_EFI)                    := efi-entry.o
  
@@ -26,10 +26,10 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE
         $(call if_changed,objcopy)
  
  arm64-obj-$(CONFIG_COMPAT)             += sys32.o kuser32.o signal32.o         \
-                                          sys_compat.o entry32.o               \
-                                          ../../arm/kernel/opcodes.o
+                                          sys_compat.o entry32.o
  arm64-obj-$(CONFIG_FUNCTION_TRACER)    += ftrace.o entry-ftrace.o
  arm64-obj-$(CONFIG_MODULES)            += arm64ksyms.o module.o
+arm64-obj-$(CONFIG_ARM64_MODULE_PLTS)  += module-plts.o
  arm64-obj-$(CONFIG_PERF_EVENTS)                += perf_regs.o perf_callchain.o
  arm64-obj-$(CONFIG_HW_PERF_EVENTS)     += perf_event.o
  arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
@@ -41,8 +41,12 @@ arm64-obj-$(CONFIG_EFI)                      += efi.o efi-entry.stub.o
  arm64-obj-$(CONFIG_PCI)                        += pci.o
  arm64-obj-$(CONFIG_ARMV8_DEPRECATED)   += armv8_deprecated.o
  arm64-obj-$(CONFIG_ACPI)               += acpi.o
+arm64-obj-$(CONFIG_RANDOMIZE_BASE)     += kaslr.o
+arm64-obj-$(CONFIG_HIBERNATION)                += hibernate.o hibernate-asm.o
+arm64-obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL)        += acpi_parking_protocol.o
+arm64-obj-$(CONFIG_PARAVIRT)           += paravirt.o
  
-obj-y                                  += $(arm64-obj-y) vdso/
+obj-y                                  += $(arm64-obj-y) vdso/ probes/
  obj-m                                  += $(arm64-obj-m)
  head-y                                 := head.o
  extra-y                                        += $(head-y) vmlinux.lds
diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c

new file mode 100644 (file)

index 0000000..4b1e5a7
--- /dev/null
+++ b/arch/arm64/kernel/acpi_parking_protocol.c
@@ -0,0 +1,153 @@
+/*
+ * ARM64 ACPI Parking Protocol implementation
+ *
+ * Authors: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ *         Mark Salter <msalter@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/acpi.h>
+#include <linux/types.h>
+
+#include <asm/cpu_ops.h>
+
+struct cpu_mailbox_entry {
+       phys_addr_t mailbox_addr;
+       u8 version;
+       u8 gic_cpu_id;
+};
+
+static struct cpu_mailbox_entry cpu_mailbox_entries[NR_CPUS];
+
+void __init acpi_set_mailbox_entry(int cpu,
+                                  struct acpi_madt_generic_interrupt *p)
+{
+       struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+
+       cpu_entry->mailbox_addr = p->parked_address;
+       cpu_entry->version = p->parking_version;
+       cpu_entry->gic_cpu_id = p->cpu_interface_number;
+}
+
+bool acpi_parking_protocol_valid(int cpu)
+{
+       struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+
+       return cpu_entry->mailbox_addr && cpu_entry->version;
+}
+
+static int acpi_parking_protocol_cpu_init(unsigned int cpu)
+{
+       pr_debug("%s: ACPI parked addr=%llx\n", __func__,
+                 cpu_mailbox_entries[cpu].mailbox_addr);
+
+       return 0;
+}
+
+static int acpi_parking_protocol_cpu_prepare(unsigned int cpu)
+{
+       return 0;
+}
+
+struct parking_protocol_mailbox {
+       __le32 cpu_id;
+       __le32 reserved;
+       __le64 entry_point;
+};
+
+/*
+ * cpu_boot hook: publish the kernel entry point in @cpu's parking mailbox
+ * and send it a wakeup IPI. Returns 0, or -EIO/-ENXIO on mailbox errors.
+ */
+static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
+{
+       struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+       struct parking_protocol_mailbox __iomem *mailbox;
+       u32 cpu_id;
+
+       /*
+        * Map mailbox memory with attribute device nGnRE (i.e. ioremap);
+        * the parking protocol specification requires nGnRnE, but the
+        * discrepancy is harmless as far as the protocol is concerned.
+        * If FW mistakenly allocated the mailbox in the linear mapping,
+        * ioremap fails because the kernel prevents the clashing mapping.
+        */
+       mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox));
+       if (!mailbox)
+               return -EIO;
+
+       cpu_id = readl_relaxed(&mailbox->cpu_id);
+       /*
+        * Check if firmware has set up the mailbox entry properly
+        * before kickstarting the respective cpu.
+        */
+       if (cpu_id != ~0U) {
+               iounmap(mailbox);
+               return -ENXIO;
+       }
+
+       /*
+        * We write the entry point and cpu id as LE regardless of the
+        * native endianness of the kernel. Therefore, any boot-loaders
+        * that read this address need to convert this address to the
+        * Boot-Loader's endianness before jumping.
+        */
+       writeq_relaxed(__pa(secondary_entry), &mailbox->entry_point);
+       writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id);
+
+       arch_send_wakeup_ipi_mask(cpumask_of(cpu));
+
+       iounmap(mailbox);
+
+       return 0;
+}
+
+/*
+ * Runs as the cpu_postboot hook: look up the mailbox of the CPU we are
+ * executing on and warn if its entry_point field is still non-zero.
+ */
+static void acpi_parking_protocol_cpu_postboot(void)
+{
+       int cpu = smp_processor_id();
+       struct cpu_mailbox_entry *cpu_entry = &cpu_mailbox_entries[cpu];
+       struct parking_protocol_mailbox __iomem *mailbox;
+       u64 entry_point;
+
+       /*
+        * Map mailbox memory with attribute device nGnRE (i.e. ioremap);
+        * the parking protocol specification requires nGnRnE, but the
+        * discrepancy is harmless as far as the protocol is concerned.
+        * If FW mistakenly allocated the mailbox in the linear mapping,
+        * ioremap fails because the kernel prevents the clashing mapping.
+        */
+       mailbox = ioremap(cpu_entry->mailbox_addr, sizeof(*mailbox));
+       if (!mailbox)
+               return;
+
+       /*
+        * entry_point is a 64-bit field, so read it with a 64-bit accessor;
+        * the protocol specification expects firmware to have cleared it.
+        */
+       entry_point = readq_relaxed(&mailbox->entry_point);
+       WARN_ON(entry_point);
+
+       iounmap(mailbox);
+}
+
+const struct cpu_operations acpi_parking_protocol_ops = {
+       .name           = "parking-protocol",
+       .cpu_init       = acpi_parking_protocol_cpu_init,
+       .cpu_prepare    = acpi_parking_protocol_cpu_prepare,
+       .cpu_boot       = acpi_parking_protocol_cpu_boot,
+       .cpu_postboot   = acpi_parking_protocol_cpu_postboot
+};
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c

index ab9db0e9818c0caa52aa040b7c01aa83183d774e..d2ee1b21a10ddd1bcce1718210c6ce7b7725cfe3 100644 (file)
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length)
  
         __apply_alternatives(&region);
  }
-
-void free_alternatives_memory(void)
-{
-       free_reserved_area(__alt_instructions, __alt_instructions_end,
-                          0, "alternatives");
-}
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c

index 3b6d8cc9dfe00ce14b764a3102ad0cd73f546c82..ee97181e44770cdec504f8dc15b41cbe5129b67e 100644 (file)
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -26,6 +26,8 @@
  #include <linux/syscalls.h>
  #include <linux/uaccess.h>
  #include <linux/io.h>
+#include <linux/kprobes.h>
+#include <linux/arm-smccc.h>
  
  #include <asm/checksum.h>
  
@@ -33,8 +35,8 @@ EXPORT_SYMBOL(copy_page);
  EXPORT_SYMBOL(clear_page);
  
         /* user mem (segment) */
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(__arch_copy_from_user);
+EXPORT_SYMBOL(__arch_copy_to_user);
  EXPORT_SYMBOL(__clear_user);
  EXPORT_SYMBOL(__copy_in_user);
  
@@ -67,4 +69,9 @@ EXPORT_SYMBOL(test_and_change_bit);
  
  #ifdef CONFIG_FUNCTION_TRACER
  EXPORT_SYMBOL(_mcount);
+NOKPROBE_SYMBOL(_mcount);
  #endif
+
+       /* arm-smccc */
+EXPORT_SYMBOL(arm_smccc_smc);
+EXPORT_SYMBOL(arm_smccc_hvc);
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c

index 937f5e58a4d340a27234c76b5a84fedcf9aa6373..29348947652985e1556e4c7225cbfd17774a4d87 100644 (file)
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -62,7 +62,7 @@ struct insn_emulation {
  };
  
  static LIST_HEAD(insn_emulation);
-static int nr_insn_emulated;
+static int nr_insn_emulated __initdata;
  static DEFINE_RAW_SPINLOCK(insn_emulation_lock);
  
  static void register_emulation_hooks(struct insn_emulation_ops *ops)
@@ -173,7 +173,7 @@ static int update_insn_emulation_mode(struct insn_emulation *insn,
         return ret;
  }
  
-static void register_insn_emulation(struct insn_emulation_ops *ops)
+static void __init register_insn_emulation(struct insn_emulation_ops *ops)
  {
         unsigned long flags;
         struct insn_emulation *insn;
@@ -237,7 +237,7 @@ static struct ctl_table ctl_abi[] = {
         { }
  };
  
-static void register_insn_emulation_sysctl(struct ctl_table *table)
+static void __init register_insn_emulation_sysctl(struct ctl_table *table)
  {
         unsigned long flags;
         int i = 0;
@@ -297,11 +297,8 @@ static void register_insn_emulation_sysctl(struct ctl_table *table)
         "4:     mov             %w0, %w5\n"                     \
         "       b               3b\n"                           \
         "       .popsection"                                    \
-       "       .pushsection     __ex_table,\"a\"\n"            \
-       "       .align          3\n"                            \
-       "       .quad           0b, 4b\n"                       \
-       "       .quad           1b, 4b\n"                       \
-       "       .popsection\n"                                  \
+       _ASM_EXTABLE(0b, 4b)                                    \
+       _ASM_EXTABLE(1b, 4b)                                    \
         ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,    \
                 CONFIG_ARM64_PAN)                               \
         : "=&r" (res), "+r" (data), "=&r" (temp)                \
@@ -369,6 +366,21 @@ static int emulate_swpX(unsigned int address, unsigned int *data,
         return res;
  }
  
+#define        ARM_OPCODE_CONDITION_UNCOND     0xf
+
+static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr)
+{
+       u32 cc_bits  = opcode >> 28;
+
+       if (cc_bits != ARM_OPCODE_CONDITION_UNCOND) {
+               if ((*aarch32_opcode_cond_checks[cc_bits])(psr))
+                       return ARM_OPCODE_CONDTEST_PASS;
+               else
+                       return ARM_OPCODE_CONDTEST_FAIL;
+       }
+       return ARM_OPCODE_CONDTEST_UNCOND;
+}
+
  /*
   * swp_handler logs the id of calling process, dissects the instruction, sanity
   * checks the memory location, calls emulate_swpX for the actual operation and
@@ -383,7 +395,7 @@ static int swp_handler(struct pt_regs *regs, u32 instr)
  
         type = instr & TYPE_SWPB;
  
-       switch (arm_check_condition(instr, regs->pstate)) {
+       switch (aarch32_check_condition(instr, regs->pstate)) {
         case ARM_OPCODE_CONDTEST_PASS:
                 break;
         case ARM_OPCODE_CONDTEST_FAIL:
@@ -464,7 +476,7 @@ static int cp15barrier_handler(struct pt_regs *regs, u32 instr)
  {
         perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->pc);
  
-       switch (arm_check_condition(instr, regs->pstate)) {
+       switch (aarch32_check_condition(instr, regs->pstate)) {
         case ARM_OPCODE_CONDTEST_PASS:
                 break;
         case ARM_OPCODE_CONDTEST_FAIL:
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c

index 087cf9a65359b5fac32a0fcb4c5fa2804ffd73bc..2bb17bd556f8dd5114e52b7fec90bf926cd012c7 100644 (file)
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -22,12 +22,14 @@
  #include <linux/mm.h>
  #include <linux/dma-mapping.h>
  #include <linux/kvm_host.h>
+#include <linux/suspend.h>
  #include <asm/thread_info.h>
  #include <asm/memory.h>
  #include <asm/smp_plat.h>
  #include <asm/suspend.h>
  #include <asm/vdso_datapage.h>
  #include <linux/kbuild.h>
+#include <linux/arm-smccc.h>
  
  int main(void)
  {
@@ -49,6 +51,17 @@ int main(void)
    DEFINE(S_X5,                 offsetof(struct pt_regs, regs[5]));
    DEFINE(S_X6,                 offsetof(struct pt_regs, regs[6]));
    DEFINE(S_X7,                 offsetof(struct pt_regs, regs[7]));
+  DEFINE(S_X8,                 offsetof(struct pt_regs, regs[8]));
+  DEFINE(S_X10,                        offsetof(struct pt_regs, regs[10]));
+  DEFINE(S_X12,                        offsetof(struct pt_regs, regs[12]));
+  DEFINE(S_X14,                        offsetof(struct pt_regs, regs[14]));
+  DEFINE(S_X16,                        offsetof(struct pt_regs, regs[16]));
+  DEFINE(S_X18,                        offsetof(struct pt_regs, regs[18]));
+  DEFINE(S_X20,                        offsetof(struct pt_regs, regs[20]));
+  DEFINE(S_X22,                        offsetof(struct pt_regs, regs[22]));
+  DEFINE(S_X24,                        offsetof(struct pt_regs, regs[24]));
+  DEFINE(S_X26,                        offsetof(struct pt_regs, regs[26]));
+  DEFINE(S_X28,                        offsetof(struct pt_regs, regs[28]));
    DEFINE(S_LR,                 offsetof(struct pt_regs, regs[30]));
    DEFINE(S_SP,                 offsetof(struct pt_regs, sp));
  #ifdef CONFIG_COMPAT
@@ -109,58 +122,25 @@ int main(void)
    DEFINE(CPU_GP_REGS,          offsetof(struct kvm_cpu_context, gp_regs));
    DEFINE(CPU_USER_PT_REGS,     offsetof(struct kvm_regs, regs));
    DEFINE(CPU_FP_REGS,          offsetof(struct kvm_regs, fp_regs));
-  DEFINE(CPU_SP_EL1,           offsetof(struct kvm_regs, sp_el1));
-  DEFINE(CPU_ELR_EL1,          offsetof(struct kvm_regs, elr_el1));
-  DEFINE(CPU_SPSR,             offsetof(struct kvm_regs, spsr));
-  DEFINE(CPU_SYSREGS,          offsetof(struct kvm_cpu_context, sys_regs));
+  DEFINE(VCPU_FPEXC32_EL2,     offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
    DEFINE(VCPU_ESR_EL2,         offsetof(struct kvm_vcpu, arch.fault.esr_el2));
    DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
    DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
-  DEFINE(VCPU_DEBUG_FLAGS,     offsetof(struct kvm_vcpu, arch.debug_flags));
-  DEFINE(VCPU_DEBUG_PTR,       offsetof(struct kvm_vcpu, arch.debug_ptr));
-  DEFINE(DEBUG_BCR,            offsetof(struct kvm_guest_debug_arch, dbg_bcr));
-  DEFINE(DEBUG_BVR,            offsetof(struct kvm_guest_debug_arch, dbg_bvr));
-  DEFINE(DEBUG_WCR,            offsetof(struct kvm_guest_debug_arch, dbg_wcr));
-  DEFINE(DEBUG_WVR,            offsetof(struct kvm_guest_debug_arch, dbg_wvr));
-  DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
-  DEFINE(VCPU_MDCR_EL2,        offsetof(struct kvm_vcpu, arch.mdcr_el2));
-  DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
    DEFINE(VCPU_HOST_CONTEXT,    offsetof(struct kvm_vcpu, arch.host_cpu_context));
-  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
-  DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
-  DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
-  DEFINE(KVM_TIMER_CNTVOFF,    offsetof(struct kvm, arch.timer.cntvoff));
-  DEFINE(KVM_TIMER_ENABLED,    offsetof(struct kvm, arch.timer.enabled));
-  DEFINE(VCPU_KVM,             offsetof(struct kvm_vcpu, kvm));
-  DEFINE(VCPU_VGIC_CPU,                offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_V2_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
-  DEFINE(VGIC_V2_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
-  DEFINE(VGIC_V2_CPU_MISR,     offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
-  DEFINE(VGIC_V2_CPU_EISR,     offsetof(struct vgic_cpu, vgic_v2.vgic_eisr));
-  DEFINE(VGIC_V2_CPU_ELRSR,    offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
-  DEFINE(VGIC_V2_CPU_APR,      offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
-  DEFINE(VGIC_V2_CPU_LR,       offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
-  DEFINE(VGIC_V3_CPU_SRE,      offsetof(struct vgic_cpu, vgic_v3.vgic_sre));
-  DEFINE(VGIC_V3_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v3.vgic_hcr));
-  DEFINE(VGIC_V3_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v3.vgic_vmcr));
-  DEFINE(VGIC_V3_CPU_MISR,     offsetof(struct vgic_cpu, vgic_v3.vgic_misr));
-  DEFINE(VGIC_V3_CPU_EISR,     offsetof(struct vgic_cpu, vgic_v3.vgic_eisr));
-  DEFINE(VGIC_V3_CPU_ELRSR,    offsetof(struct vgic_cpu, vgic_v3.vgic_elrsr));
-  DEFINE(VGIC_V3_CPU_AP0R,     offsetof(struct vgic_cpu, vgic_v3.vgic_ap0r));
-  DEFINE(VGIC_V3_CPU_AP1R,     offsetof(struct vgic_cpu, vgic_v3.vgic_ap1r));
-  DEFINE(VGIC_V3_CPU_LR,       offsetof(struct vgic_cpu, vgic_v3.vgic_lr));
-  DEFINE(VGIC_CPU_NR_LR,       offsetof(struct vgic_cpu, nr_lr));
-  DEFINE(KVM_VTTBR,            offsetof(struct kvm, arch.vttbr));
-  DEFINE(KVM_VGIC_VCTRL,       offsetof(struct kvm, arch.vgic.vctrl_base));
  #endif
  #ifdef CONFIG_CPU_PM
    DEFINE(CPU_SUSPEND_SZ,       sizeof(struct cpu_suspend_ctx));
    DEFINE(CPU_CTX_SP,           offsetof(struct cpu_suspend_ctx, sp));
    DEFINE(MPIDR_HASH_MASK,      offsetof(struct mpidr_hash, mask));
    DEFINE(MPIDR_HASH_SHIFTS,    offsetof(struct mpidr_hash, shift_aff));
-  DEFINE(SLEEP_SAVE_SP_SZ,     sizeof(struct sleep_save_sp));
-  DEFINE(SLEEP_SAVE_SP_PHYS,   offsetof(struct sleep_save_sp, save_ptr_stash_phys));
-  DEFINE(SLEEP_SAVE_SP_VIRT,   offsetof(struct sleep_save_sp, save_ptr_stash));
+  DEFINE(SLEEP_STACK_DATA_SYSTEM_REGS, offsetof(struct sleep_stack_data, system_regs));
+  DEFINE(SLEEP_STACK_DATA_CALLEE_REGS, offsetof(struct sleep_stack_data, callee_saved_regs));
  #endif
+  DEFINE(ARM_SMCCC_RES_X0_OFFS,        offsetof(struct arm_smccc_res, a0));
+  DEFINE(ARM_SMCCC_RES_X2_OFFS,        offsetof(struct arm_smccc_res, a2));
+  BLANK();
+  DEFINE(HIBERN_PBE_ORIG,      offsetof(struct pbe, orig_address));
+  DEFINE(HIBERN_PBE_ADDR,      offsetof(struct pbe, address));
+  DEFINE(HIBERN_PBE_NEXT,      offsetof(struct pbe, next));
    return 0;
  }
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c

index a3e846a28b05f9e80bc65f8f34618c11d21cff2a..06afd04e02c0d05f1e0546230a5d446b8bb06b60 100644 (file)
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -21,24 +21,12 @@
  #include <asm/cputype.h>
  #include <asm/cpufeature.h>
  
-#define MIDR_CORTEX_A53 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
-#define MIDR_CORTEX_A57 MIDR_CPU_PART(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
-#define MIDR_THUNDERX  MIDR_CPU_PART(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
-
-#define CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \
-                       MIDR_ARCHITECTURE_MASK)
-
  static bool __maybe_unused
  is_affected_midr_range(const struct arm64_cpu_capabilities *entry)
  {
-       u32 midr = read_cpuid_id();
-
-       if ((midr & CPU_MODEL_MASK) != entry->midr_model)
-               return false;
-
-       midr &= MIDR_REVISION_MASK | MIDR_VARIANT_MASK;
-
-       return (midr >= entry->midr_range_min && midr <= entry->midr_range_max);
+       return MIDR_IS_CPU_MODEL_RANGE(read_cpuid_id(), entry->midr_model,
+                                      entry->midr_range_min,
+                                      entry->midr_range_max);
  }
  
  #define MIDR_RANGE(model, min, max) \
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c

index b6bd7d4477683393fb34dc6b055b07382e8ab050..c7cfb8fe06f94c7f5113abf0cb624980e4227127 100644 (file)
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -25,19 +25,30 @@
  #include <asm/smp_plat.h>
  
  extern const struct cpu_operations smp_spin_table_ops;
+extern const struct cpu_operations acpi_parking_protocol_ops;
  extern const struct cpu_operations cpu_psci_ops;
  
  const struct cpu_operations *cpu_ops[NR_CPUS];
  
-static const struct cpu_operations *supported_cpu_ops[] __initconst = {
+static const struct cpu_operations *dt_supported_cpu_ops[] __initconst = {
         &smp_spin_table_ops,
         &cpu_psci_ops,
         NULL,
  };
  
+static const struct cpu_operations *acpi_supported_cpu_ops[] __initconst = {
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+       &acpi_parking_protocol_ops,
+#endif
+       &cpu_psci_ops,
+       NULL,
+};
+
  static const struct cpu_operations * __init cpu_get_ops(const char *name)
  {
-       const struct cpu_operations **ops = supported_cpu_ops;
+       const struct cpu_operations **ops;
+
+       ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;
  
         while (*ops) {
                 if (!strcmp(name, (*ops)->name))
@@ -75,8 +86,16 @@ static const char *__init cpu_read_enable_method(int cpu)
                 }
         } else {
                 enable_method = acpi_get_enable_method(cpu);
-               if (!enable_method)
-                       pr_err("Unsupported ACPI enable-method\n");
+               if (!enable_method) {
+                       /*
+                        * In ACPI systems the boot CPU does not require
+                        * checking the enable method since for some
+                        * boot protocols (e.g. the parking protocol) it
+                        * need not be initialized. Don't warn spuriously.
+                        */
+                       if (cpu != 0)
+                               pr_err("Unsupported ACPI enable-method\n");
+               }
         }
  
         return enable_method;
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c

index 2735bf8145926c5c8c03f2d58a7ec08646bcee2e..eda7d5915fbb209bf4057cd1f621beadd8c02606 100644 (file)
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -28,6 +28,7 @@
  #include <asm/cpu_ops.h>
  #include <asm/processor.h>
  #include <asm/sysreg.h>
+#include <asm/virt.h>
  
  unsigned long elf_hwcap __read_mostly;
  EXPORT_SYMBOL_GPL(elf_hwcap);
@@ -69,6 +70,10 @@ DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
                 .width = 0,                             \
         }
  
+/* meta feature for alternatives */
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry);
+
  static struct arm64_ftr_bits ftr_id_aa64isar0[] = {
         ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 32, 0),
         ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64ISAR0_RDM_SHIFT, 4, 0),
@@ -125,6 +130,11 @@ static struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
         ARM64_FTR_END,
  };
  
+static struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
+       ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64MMFR2_UAO_SHIFT, 4, 0),
+       ARM64_FTR_END,
+};
+
  static struct arm64_ftr_bits ftr_ctr[] = {
         U_ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 31, 1, 1),      /* RAO */
         ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 3, 0),
@@ -286,6 +296,7 @@ static struct arm64_ftr_reg arm64_ftr_regs[] = {
         /* Op1 = 0, CRn = 0, CRm = 7 */
         ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
         ARM64_FTR_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1),
+       ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
  
         /* Op1 = 3, CRn = 0, CRm = 0 */
         ARM64_FTR_REG(SYS_CTR_EL0, ftr_ctr),
@@ -410,6 +421,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
         init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
         init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
         init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
+       init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
         init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
         init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
         init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
@@ -519,6 +531,8 @@ void update_cpu_features(int cpu,
                                       info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0);
         taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu,
                                       info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1);
+       taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu,
+                                     info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2);
  
         /*
          * EL3 is not our concern.
@@ -623,6 +637,23 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry)
         return has_sre;
  }
  
+static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry)
+{
+       u32 midr = read_cpuid_id();
+       u32 rv_min, rv_max;
+
+       /* Cavium ThunderX pass 1.x and 2.x */
+       rv_min = 0;
+       rv_max = (1 << MIDR_VARIANT_SHIFT) | MIDR_REVISION_MASK;
+
+       return MIDR_IS_CPU_MODEL_RANGE(midr, MIDR_THUNDERX, rv_min, rv_max);
+}
+
+static bool runs_at_el2(const struct arm64_cpu_capabilities *entry)
+{
+       return is_kernel_in_hyp_mode();
+}
+
  static const struct arm64_cpu_capabilities arm64_features[] = {
         {
                 .desc = "GIC system register CPU interface",
@@ -653,6 +684,33 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
                 .min_field_value = 2,
         },
  #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */
+       {
+               .desc = "Software prefetching using PRFM",
+               .capability = ARM64_HAS_NO_HW_PREFETCH,
+               .matches = has_no_hw_prefetch,
+       },
+#ifdef CONFIG_ARM64_UAO
+       {
+               .desc = "User Access Override",
+               .capability = ARM64_HAS_UAO,
+               .matches = has_cpuid_feature,
+               .sys_reg = SYS_ID_AA64MMFR2_EL1,
+               .field_pos = ID_AA64MMFR2_UAO_SHIFT,
+               .min_field_value = 1,
+               .enable = cpu_enable_uao,
+       },
+#endif /* CONFIG_ARM64_UAO */
+#ifdef CONFIG_ARM64_PAN
+       {
+               .capability = ARM64_ALT_PAN_NOT_UAO,
+               .matches = cpufeature_pan_not_uao,
+       },
+#endif /* CONFIG_ARM64_PAN */
+       {
+               .desc = "Virtualization Host Extensions",
+               .capability = ARM64_HAS_VIRT_HOST_EXTN,
+               .matches = runs_at_el2,
+       },
         {},
  };
  
@@ -686,7 +744,7 @@ static const struct arm64_cpu_capabilities arm64_hwcaps[] = {
         {},
  };
  
-static void cap_set_hwcap(const struct arm64_cpu_capabilities *cap)
+static void __init cap_set_hwcap(const struct arm64_cpu_capabilities *cap)
  {
         switch (cap->hwcap_type) {
         case CAP_HWCAP:
@@ -731,12 +789,12 @@ static bool __maybe_unused cpus_have_hwcap(const struct arm64_cpu_capabilities *
         return rc;
  }
  
-static void setup_cpu_hwcaps(void)
+static void __init setup_cpu_hwcaps(void)
  {
         int i;
         const struct arm64_cpu_capabilities *hwcaps = arm64_hwcaps;
  
-       for (i = 0; hwcaps[i].desc; i++)
+       for (i = 0; hwcaps[i].matches; i++)
                 if (hwcaps[i].matches(&hwcaps[i]))
                         cap_set_hwcap(&hwcaps[i]);
  }
@@ -746,11 +804,11 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
  {
         int i;
  
-       for (i = 0; caps[i].desc; i++) {
+       for (i = 0; caps[i].matches; i++) {
                 if (!caps[i].matches(&caps[i]))
                         continue;
  
-               if (!cpus_have_cap(caps[i].capability))
+               if (!cpus_have_cap(caps[i].capability) && caps[i].desc)
                         pr_info("%s %s\n", info, caps[i].desc);
                 cpus_set_cap(caps[i].capability);
         }
@@ -760,11 +818,12 @@ void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps,
   * Run through the enabled capabilities and enable() it on all active
   * CPUs
   */
-static void enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
+static void __init
+enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps)
  {
         int i;
  
-       for (i = 0; caps[i].desc; i++)
+       for (i = 0; caps[i].matches; i++)
                 if (caps[i].enable && cpus_have_cap(caps[i].capability))
                         /*
                          * Use stop_machine() as it schedules the work allowing
@@ -798,35 +857,36 @@ static inline void set_sys_caps_initialised(void)
  static u64 __raw_read_system_reg(u32 sys_id)
  {
         switch (sys_id) {
-       case SYS_ID_PFR0_EL1:           return (u64)read_cpuid(ID_PFR0_EL1);
-       case SYS_ID_PFR1_EL1:           return (u64)read_cpuid(ID_PFR1_EL1);
-       case SYS_ID_DFR0_EL1:           return (u64)read_cpuid(ID_DFR0_EL1);
-       case SYS_ID_MMFR0_EL1:          return (u64)read_cpuid(ID_MMFR0_EL1);
-       case SYS_ID_MMFR1_EL1:          return (u64)read_cpuid(ID_MMFR1_EL1);
-       case SYS_ID_MMFR2_EL1:          return (u64)read_cpuid(ID_MMFR2_EL1);
-       case SYS_ID_MMFR3_EL1:          return (u64)read_cpuid(ID_MMFR3_EL1);
-       case SYS_ID_ISAR0_EL1:          return (u64)read_cpuid(ID_ISAR0_EL1);
-       case SYS_ID_ISAR1_EL1:          return (u64)read_cpuid(ID_ISAR1_EL1);
-       case SYS_ID_ISAR2_EL1:          return (u64)read_cpuid(ID_ISAR2_EL1);
-       case SYS_ID_ISAR3_EL1:          return (u64)read_cpuid(ID_ISAR3_EL1);
-       case SYS_ID_ISAR4_EL1:          return (u64)read_cpuid(ID_ISAR4_EL1);
-       case SYS_ID_ISAR5_EL1:          return (u64)read_cpuid(ID_ISAR4_EL1);
-       case SYS_MVFR0_EL1:             return (u64)read_cpuid(MVFR0_EL1);
-       case SYS_MVFR1_EL1:             return (u64)read_cpuid(MVFR1_EL1);
-       case SYS_MVFR2_EL1:             return (u64)read_cpuid(MVFR2_EL1);
-
-       case SYS_ID_AA64PFR0_EL1:       return (u64)read_cpuid(ID_AA64PFR0_EL1);
-       case SYS_ID_AA64PFR1_EL1:       return (u64)read_cpuid(ID_AA64PFR0_EL1);
-       case SYS_ID_AA64DFR0_EL1:       return (u64)read_cpuid(ID_AA64DFR0_EL1);
-       case SYS_ID_AA64DFR1_EL1:       return (u64)read_cpuid(ID_AA64DFR0_EL1);
-       case SYS_ID_AA64MMFR0_EL1:      return (u64)read_cpuid(ID_AA64MMFR0_EL1);
-       case SYS_ID_AA64MMFR1_EL1:      return (u64)read_cpuid(ID_AA64MMFR1_EL1);
-       case SYS_ID_AA64ISAR0_EL1:      return (u64)read_cpuid(ID_AA64ISAR0_EL1);
-       case SYS_ID_AA64ISAR1_EL1:      return (u64)read_cpuid(ID_AA64ISAR1_EL1);
-
-       case SYS_CNTFRQ_EL0:            return (u64)read_cpuid(CNTFRQ_EL0);
-       case SYS_CTR_EL0:               return (u64)read_cpuid(CTR_EL0);
-       case SYS_DCZID_EL0:             return (u64)read_cpuid(DCZID_EL0);
+       case SYS_ID_PFR0_EL1:           return read_cpuid(SYS_ID_PFR0_EL1);
+       case SYS_ID_PFR1_EL1:           return read_cpuid(SYS_ID_PFR1_EL1);
+       case SYS_ID_DFR0_EL1:           return read_cpuid(SYS_ID_DFR0_EL1);
+       case SYS_ID_MMFR0_EL1:          return read_cpuid(SYS_ID_MMFR0_EL1);
+       case SYS_ID_MMFR1_EL1:          return read_cpuid(SYS_ID_MMFR1_EL1);
+       case SYS_ID_MMFR2_EL1:          return read_cpuid(SYS_ID_MMFR2_EL1);
+       case SYS_ID_MMFR3_EL1:          return read_cpuid(SYS_ID_MMFR3_EL1);
+       case SYS_ID_ISAR0_EL1:          return read_cpuid(SYS_ID_ISAR0_EL1);
+       case SYS_ID_ISAR1_EL1:          return read_cpuid(SYS_ID_ISAR1_EL1);
+       case SYS_ID_ISAR2_EL1:          return read_cpuid(SYS_ID_ISAR2_EL1);
+       case SYS_ID_ISAR3_EL1:          return read_cpuid(SYS_ID_ISAR3_EL1);
+       case SYS_ID_ISAR4_EL1:          return read_cpuid(SYS_ID_ISAR4_EL1);
+       case SYS_ID_ISAR5_EL1:          return read_cpuid(SYS_ID_ISAR5_EL1);
+       case SYS_MVFR0_EL1:             return read_cpuid(SYS_MVFR0_EL1);
+       case SYS_MVFR1_EL1:             return read_cpuid(SYS_MVFR1_EL1);
+       case SYS_MVFR2_EL1:             return read_cpuid(SYS_MVFR2_EL1);
+
+       case SYS_ID_AA64PFR0_EL1:       return read_cpuid(SYS_ID_AA64PFR0_EL1);
+       case SYS_ID_AA64PFR1_EL1:       return read_cpuid(SYS_ID_AA64PFR1_EL1);
+       case SYS_ID_AA64DFR0_EL1:       return read_cpuid(SYS_ID_AA64DFR0_EL1);
+       case SYS_ID_AA64DFR1_EL1:       return read_cpuid(SYS_ID_AA64DFR1_EL1);
+       case SYS_ID_AA64MMFR0_EL1:      return read_cpuid(SYS_ID_AA64MMFR0_EL1);
+       case SYS_ID_AA64MMFR1_EL1:      return read_cpuid(SYS_ID_AA64MMFR1_EL1);
+       case SYS_ID_AA64MMFR2_EL1:      return read_cpuid(SYS_ID_AA64MMFR2_EL1);
+       case SYS_ID_AA64ISAR0_EL1:      return read_cpuid(SYS_ID_AA64ISAR0_EL1);
+       case SYS_ID_AA64ISAR1_EL1:      return read_cpuid(SYS_ID_AA64ISAR1_EL1);
+
+       case SYS_CNTFRQ_EL0:            return read_cpuid(SYS_CNTFRQ_EL0);
+       case SYS_CTR_EL0:               return read_cpuid(SYS_CTR_EL0);
+       case SYS_DCZID_EL0:             return read_cpuid(SYS_DCZID_EL0);
         default:
                 BUG();
                 return 0;
@@ -876,7 +936,7 @@ void verify_local_cpu_capabilities(void)
                 return;
  
         caps = arm64_features;
-       for (i = 0; caps[i].desc; i++) {
+       for (i = 0; caps[i].matches; i++) {
                 if (!cpus_have_cap(caps[i].capability) || !caps[i].sys_reg)
                         continue;
                 /*
@@ -889,7 +949,7 @@ void verify_local_cpu_capabilities(void)
                         caps[i].enable(NULL);
         }
  
-       for (i = 0, caps = arm64_hwcaps; caps[i].desc; i++) {
+       for (i = 0, caps = arm64_hwcaps; caps[i].matches; i++) {
                 if (!cpus_have_hwcap(&caps[i]))
                         continue;
                 if (!feature_matches(__raw_read_system_reg(caps[i].sys_reg), &caps[i]))
@@ -905,7 +965,7 @@ static inline void set_sys_caps_initialised(void)
  
  #endif /* CONFIG_HOTPLUG_CPU */
  
-static void setup_feature_capabilities(void)
+static void __init setup_feature_capabilities(void)
  {
         update_cpu_capabilities(arm64_features, "detected feature:");
         enable_cpu_capabilities(arm64_features);
@@ -935,3 +995,9 @@ void __init setup_cpu_features(void)
                 pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
                         L1_CACHE_BYTES, cls);
  }
+
+static bool __maybe_unused
+cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry)
+{
+       return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO));
+}
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c

index 0166cfbc866c019e9f754c7596489ae6461eff8c..95a6fae54740e003a0b9b947a15954c389548f9b 100644 (file)
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -208,35 +208,36 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
  {
         info->reg_cntfrq = arch_timer_get_cntfrq();
         info->reg_ctr = read_cpuid_cachetype();
-       info->reg_dczid = read_cpuid(DCZID_EL0);
+       info->reg_dczid = read_cpuid(SYS_DCZID_EL0);
         info->reg_midr = read_cpuid_id();
  
-       info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1);
-       info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1);
-       info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
-       info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
-       info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
-       info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
-       info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
-       info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
-
-       info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
-       info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
-       info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
-       info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
-       info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
-       info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
-       info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
-       info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
-       info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
-       info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
-       info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
-       info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
-       info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
-
-       info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
-       info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
-       info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
+       info->reg_id_aa64dfr0 = read_cpuid(SYS_ID_AA64DFR0_EL1);
+       info->reg_id_aa64dfr1 = read_cpuid(SYS_ID_AA64DFR1_EL1);
+       info->reg_id_aa64isar0 = read_cpuid(SYS_ID_AA64ISAR0_EL1);
+       info->reg_id_aa64isar1 = read_cpuid(SYS_ID_AA64ISAR1_EL1);
+       info->reg_id_aa64mmfr0 = read_cpuid(SYS_ID_AA64MMFR0_EL1);
+       info->reg_id_aa64mmfr1 = read_cpuid(SYS_ID_AA64MMFR1_EL1);
+       info->reg_id_aa64mmfr2 = read_cpuid(SYS_ID_AA64MMFR2_EL1);
+       info->reg_id_aa64pfr0 = read_cpuid(SYS_ID_AA64PFR0_EL1);
+       info->reg_id_aa64pfr1 = read_cpuid(SYS_ID_AA64PFR1_EL1);
+
+       info->reg_id_dfr0 = read_cpuid(SYS_ID_DFR0_EL1);
+       info->reg_id_isar0 = read_cpuid(SYS_ID_ISAR0_EL1);
+       info->reg_id_isar1 = read_cpuid(SYS_ID_ISAR1_EL1);
+       info->reg_id_isar2 = read_cpuid(SYS_ID_ISAR2_EL1);
+       info->reg_id_isar3 = read_cpuid(SYS_ID_ISAR3_EL1);
+       info->reg_id_isar4 = read_cpuid(SYS_ID_ISAR4_EL1);
+       info->reg_id_isar5 = read_cpuid(SYS_ID_ISAR5_EL1);
+       info->reg_id_mmfr0 = read_cpuid(SYS_ID_MMFR0_EL1);
+       info->reg_id_mmfr1 = read_cpuid(SYS_ID_MMFR1_EL1);
+       info->reg_id_mmfr2 = read_cpuid(SYS_ID_MMFR2_EL1);
+       info->reg_id_mmfr3 = read_cpuid(SYS_ID_MMFR3_EL1);
+       info->reg_id_pfr0 = read_cpuid(SYS_ID_PFR0_EL1);
+       info->reg_id_pfr1 = read_cpuid(SYS_ID_PFR1_EL1);
+
+       info->reg_mvfr0 = read_cpuid(SYS_MVFR0_EL1);
+       info->reg_mvfr1 = read_cpuid(SYS_MVFR1_EL1);
+       info->reg_mvfr2 = read_cpuid(SYS_MVFR2_EL1);
  
         cpuinfo_detect_icache_policy(info);
  
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c

index c8875b64be909f4e1c15c9be6060a1da988f972f..6de6d9f43b959fd1cbb45a16bdc33bba524fccd6 100644 (file)
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -23,6 +23,7 @@
  #include <linux/hardirq.h>
  #include <linux/init.h>
  #include <linux/ptrace.h>
+#include <linux/kprobes.h>
  #include <linux/stat.h>
  #include <linux/uaccess.h>
  
@@ -48,6 +49,7 @@ static void mdscr_write(u32 mdscr)
         asm volatile("msr mdscr_el1, %0" :: "r" (mdscr));
         local_dbg_restore(flags);
  }
+NOKPROBE_SYMBOL(mdscr_write);
  
  static u32 mdscr_read(void)
  {
@@ -55,6 +57,7 @@ static u32 mdscr_read(void)
         asm volatile("mrs %0, mdscr_el1" : "=r" (mdscr));
         return mdscr;
  }
+NOKPROBE_SYMBOL(mdscr_read);
  
  /*
   * Allow root to disable self-hosted debug from userspace.
@@ -103,6 +106,7 @@ void enable_debug_monitors(enum dbg_active_el el)
                 mdscr_write(mdscr);
         }
  }
+NOKPROBE_SYMBOL(enable_debug_monitors);
  
  void disable_debug_monitors(enum dbg_active_el el)
  {
@@ -123,6 +127,7 @@ void disable_debug_monitors(enum dbg_active_el el)
                 mdscr_write(mdscr);
         }
  }
+NOKPROBE_SYMBOL(disable_debug_monitors);
  
  /*
   * OS lock clearing.
@@ -173,6 +178,7 @@ static void set_regs_spsr_ss(struct pt_regs *regs)
         spsr |= DBG_SPSR_SS;
         regs->pstate = spsr;
  }
+NOKPROBE_SYMBOL(set_regs_spsr_ss);
  
  static void clear_regs_spsr_ss(struct pt_regs *regs)
  {
@@ -182,6 +188,7 @@ static void clear_regs_spsr_ss(struct pt_regs *regs)
         spsr &= ~DBG_SPSR_SS;
         regs->pstate = spsr;
  }
+NOKPROBE_SYMBOL(clear_regs_spsr_ss);
  
  /* EL1 Single Step Handler hooks */
  static LIST_HEAD(step_hook);
@@ -225,6 +232,7 @@ static int call_step_hook(struct pt_regs *regs, unsigned int esr)
  
         return retval;
  }
+NOKPROBE_SYMBOL(call_step_hook);
  
  static int single_step_handler(unsigned long addr, unsigned int esr,
                                struct pt_regs *regs)
@@ -253,6 +261,10 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
                  */
                 user_rewind_single_step(current);
         } else {
+#ifdef CONFIG_KPROBES
+               if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED)
+                       return 0;
+#endif
                 if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
                         return 0;
  
@@ -266,6 +278,7 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
  
         return 0;
  }
+NOKPROBE_SYMBOL(single_step_handler);
  
  /*
   * Breakpoint handler is re-entrant as another breakpoint can
@@ -303,6 +316,7 @@ static int call_break_hook(struct pt_regs *regs, unsigned int esr)
  
         return fn ? fn(regs, esr) : DBG_HOOK_ERROR;
  }
+NOKPROBE_SYMBOL(call_break_hook);
  
  static int brk_handler(unsigned long addr, unsigned int esr,
                        struct pt_regs *regs)
@@ -318,13 +332,21 @@ static int brk_handler(unsigned long addr, unsigned int esr,
                 };
  
                 force_sig_info(SIGTRAP, &info, current);
-       } else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
-               pr_warning("Unexpected kernel BRK exception at EL1\n");
+       }
+#ifdef CONFIG_KPROBES
+       else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) {
+               if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED)
+                       return -EFAULT;
+       }
+#endif
+       else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) {
+               pr_warn("Unexpected kernel BRK exception at EL1\n");
                 return -EFAULT;
         }
  
         return 0;
  }
+NOKPROBE_SYMBOL(brk_handler);
  
  int aarch32_break_handler(struct pt_regs *regs)
  {
@@ -369,6 +391,7 @@ int aarch32_break_handler(struct pt_regs *regs)
         force_sig_info(SIGTRAP, &info, current);
         return 0;
  }
+NOKPROBE_SYMBOL(aarch32_break_handler);
  
  static int __init debug_traps_init(void)
  {
@@ -390,6 +413,7 @@ void user_rewind_single_step(struct task_struct *task)
         if (test_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP))
                 set_regs_spsr_ss(task_pt_regs(task));
  }
+NOKPROBE_SYMBOL(user_rewind_single_step);
  
  void user_fastforward_single_step(struct task_struct *task)
  {
@@ -405,6 +429,7 @@ void kernel_enable_single_step(struct pt_regs *regs)
         mdscr_write(mdscr_read() | DBG_MDSCR_SS);
         enable_debug_monitors(DBG_ACTIVE_EL1);
  }
+NOKPROBE_SYMBOL(kernel_enable_single_step);
  
  void kernel_disable_single_step(void)
  {
@@ -412,12 +437,14 @@ void kernel_disable_single_step(void)
         mdscr_write(mdscr_read() & ~DBG_MDSCR_SS);
         disable_debug_monitors(DBG_ACTIVE_EL1);
  }
+NOKPROBE_SYMBOL(kernel_disable_single_step);
  
  int kernel_active_single_step(void)
  {
         WARN_ON(!irqs_disabled());
         return mdscr_read() & DBG_MDSCR_SS;
  }
+NOKPROBE_SYMBOL(kernel_active_single_step);
  
  /* ptrace API */
  void user_enable_single_step(struct task_struct *task)
@@ -427,8 +454,10 @@ void user_enable_single_step(struct task_struct *task)
         if (!test_and_set_ti_thread_flag(ti, TIF_SINGLESTEP))
                 set_regs_spsr_ss(task_pt_regs(task));
  }
+NOKPROBE_SYMBOL(user_enable_single_step);
  
  void user_disable_single_step(struct task_struct *task)
  {
         clear_ti_thread_flag(task_thread_info(task), TIF_SINGLESTEP);
  }
+NOKPROBE_SYMBOL(user_disable_single_step);
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S

index a773db92908b03d325c26dba0dc5ea9c287ef49d..936022f0655e9ff3dce703c46adc2901118c6b82 100644 (file)
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -61,7 +61,7 @@ ENTRY(entry)
          */
         mov     x20, x0         // DTB address
         ldr     x0, [sp, #16]   // relocated _text address
-       ldr     x21, =stext_offset
+       ldr     w21, =stext_offset
         add     x21, x0, x21
  
         /*
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S

index bd14849beb73f547d720fcc70e011f83813083da..a096d0980ade869f124f1e16aba10bc3c83caaff 100644 (file)
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -27,6 +27,7 @@
  #include <asm/cpufeature.h>
  #include <asm/errno.h>
  #include <asm/esr.h>
+#include <asm/irq.h>
  #include <asm/memory.h>
  #include <asm/thread_info.h>
  #include <asm/unistd.h>
@@ -89,9 +90,12 @@
  
         .if     \el == 0
         mrs     x21, sp_el0
-       get_thread_info tsk                     // Ensure MDSCR_EL1.SS is clear,
+       mov     tsk, sp
+       and     tsk, tsk, #~(THREAD_SIZE - 1)   // Ensure MDSCR_EL1.SS is clear,
         ldr     x19, [tsk, #TI_FLAGS]           // since we can unmask debug
         disable_step_tsk x19, x20               // exceptions when scheduling.
+
+       mov     x29, xzr                        // fp pointed to user-space
         .else
         add     x21, sp, #S_FRAME_SIZE
         get_thread_info tsk
@@ -114,6 +118,13 @@
         str     x21, [sp, #S_SYSCALLNO]
         .endif
  
+       /*
+        * Set sp_el0 to current thread_info.
+        */
+       .if     \el == 0
+       msr     sp_el0, tsk
+       .endif
+
         /*
          * Registers that may be useful after this macro is invoked:
          *
@@ -177,8 +188,44 @@ alternative_endif
         .endm
  
         .macro  get_thread_info, rd
-       mov     \rd, sp
-       and     \rd, \rd, #~(THREAD_SIZE - 1)   // top of stack
+       mrs     \rd, sp_el0
+       .endm
+
+       .macro  irq_stack_entry
+       mov     x19, sp                 // preserve the original sp
+
+       /*
+        * Compare sp with the current thread_info, if the top
+        * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
+        * should switch to the irq stack.
+        */
+       and     x25, x19, #~(THREAD_SIZE - 1)
+       cmp     x25, tsk
+       b.ne    9998f
+
+       this_cpu_ptr irq_stack, x25, x26
+       mov     x26, #IRQ_STACK_START_SP
+       add     x26, x25, x26
+
+       /* switch to the irq stack */
+       mov     sp, x26
+
+       /*
+        * Add a dummy stack frame, this non-standard format is fixed up
+        * by unwind_frame()
+        */
+       stp     x29, x19, [sp, #-16]!
+       mov     x29, sp
+
+9998:
+       .endm
+
+       /*
+        * x19 should be preserved between irq_stack_entry and
+        * irq_stack_exit.
+        */
+       .macro  irq_stack_exit
+       mov     sp, x19
         .endm
  
  /*
@@ -196,10 +243,11 @@ tsk       .req    x28             // current thread_info
   * Interrupt handling.
   */
         .macro  irq_handler
-       adrp    x1, handle_arch_irq
-       ldr     x1, [x1, #:lo12:handle_arch_irq]
+       ldr_l   x1, handle_arch_irq
         mov     x0, sp
+       irq_stack_entry
         blr     x1
+       irq_stack_exit
         .endm
  
         .text
@@ -207,6 +255,7 @@ tsk .req    x28             // current thread_info
  /*
   * Exception vectors.
   */
+       .pushsection ".entry.text", "ax"
  
         .align  11
  ENTRY(vectors)
@@ -371,10 +420,10 @@ el1_irq:
         bl      trace_hardirqs_off
  #endif
  
+       get_thread_info tsk
         irq_handler
  
  #ifdef CONFIG_PREEMPT
-       get_thread_info tsk
         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
         cbnz    w24, 1f                         // preempt count != 0
         ldr     x0, [tsk, #TI_FLAGS]            // get flags
@@ -612,6 +661,8 @@ ENTRY(cpu_switch_to)
         ldp     x29, x9, [x8], #16
         ldr     lr, [x8]
         mov     sp, x9
+       and     x9, x9, #~(THREAD_SIZE - 1)
+       msr     sp_el0, x9
         ret
  ENDPROC(cpu_switch_to)
  
@@ -639,14 +690,14 @@ ret_fast_syscall_trace:
  work_pending:
         tbnz    x1, #TIF_NEED_RESCHED, work_resched
         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
-       ldr     x2, [sp, #S_PSTATE]
         mov     x0, sp                          // 'regs'
-       tst     x2, #PSR_MODE_MASK              // user mode regs?
-       b.ne    no_work_pending                 // returning to kernel
         enable_irq                              // enable interrupts for do_notify_resume()
         bl      do_notify_resume
         b       ret_to_user
  work_resched:
+#ifdef CONFIG_TRACE_IRQFLAGS
+       bl      trace_hardirqs_off              // the IRQs are off here, inform the tracing code
+#endif
         bl      schedule
  
  /*
@@ -658,7 +709,6 @@ ret_to_user:
         and     x2, x1, #_TIF_WORK_MASK
         cbnz    x2, work_pending
         enable_step_tsk x1, x2
-no_work_pending:
         kernel_exit 0
  ENDPROC(ret_to_user)
  
@@ -738,6 +788,8 @@ __ni_sys_trace:
         bl      do_ni_syscall
         b       __sys_trace_return
  
+       .popsection                             // .entry.text
+
  /*
   * Special system call wrappers.
   */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c

index 4c46c54a3ad7ad817b8ba410565b8eff47cd3c08..acc1afd5c749a62b7c0bae5a14d2fd7dbc33adf2 100644 (file)
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -289,7 +289,7 @@ static struct notifier_block fpsimd_cpu_pm_notifier_block = {
         .notifier_call = fpsimd_cpu_pm_notifier,
  };
  
-static void fpsimd_pm_init(void)
+static void __init fpsimd_pm_init(void)
  {
         cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
  }
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c

index c851be795080336938f4826cc0608234b0e34bfa..ebecf9aa33d12da8a564ea0314f59a71b89e64a0 100644 (file)
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
  
         /*
          * Note:
-        * Due to modules and __init, code can disappear and change,
-        * we need to protect against faulting as well as code changing.
-        * We do this by aarch64_insn_*() which use the probe_kernel_*().
-        *
-        * No lock is held here because all the modifications are run
-        * through stop_machine().
+        * We are paranoid about modifying text, as if a bug were to happen, it
+        * could cause us to read or write to someplace that could cause harm.
+        * Carefully read and modify the code with aarch64_insn_*() which uses
+        * probe_kernel_*(), and make sure what we read is what we expected it
+        * to be before modifying it.
          */
         if (validate) {
                 if (aarch64_insn_read((void *)pc, &replaced))
@@ -93,6 +92,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
         return ftrace_modify_code(pc, old, new, true);
  }
  
+void arch_ftrace_update_code(int command)
+{
+       ftrace_modify_all_code(command);
+}
+
  int __init ftrace_dyn_arch_init(void)
  {
         return 0;
@@ -125,23 +129,20 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
          * on other archs. It's unlikely on AArch64.
          */
         old = *parent;
-       *parent = return_hooker;
  
         trace.func = self_addr;
         trace.depth = current->curr_ret_stack + 1;
  
         /* Only trace if the calling function expects to */
-       if (!ftrace_graph_entry(&trace)) {
-               *parent = old;
+       if (!ftrace_graph_entry(&trace))
                 return;
-       }
  
         err = ftrace_push_return_trace(old, self_addr, &trace.depth,
                                        frame_pointer);
-       if (err == -EBUSY) {
-               *parent = old;
+       if (err == -EBUSY)
                 return;
-       }
+       else
+               *parent = return_hooker;
  }
  
  #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S

index 20ceb5edf7b89443d13f5f81fe6ebef87d02e6e8..029c466eaa4c68cad67da330be9024ccfbbd82b4 100644 (file)
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -25,10 +25,12 @@
  #include <linux/irqchip/arm-gic-v3.h>
  
  #include <asm/assembler.h>
+#include <asm/boot.h>
  #include <asm/ptrace.h>
  #include <asm/asm-offsets.h>
  #include <asm/cache.h>
  #include <asm/cputype.h>
+#include <asm/elf.h>
  #include <asm/kernel-pgtable.h>
  #include <asm/memory.h>
  #include <asm/pgtable-hwdef.h>
@@ -48,9 +50,6 @@
  #error TEXT_OFFSET must be less than 2MB
  #endif
  
-#define KERNEL_START   _text
-#define KERNEL_END     _end
-
  /*
   * Kernel startup entry point.
   * ---------------------------
@@ -67,12 +66,11 @@
   * in the entry routines.
   */
         __HEAD
-
+_head:
         /*
          * DO NOT MODIFY. Image header expected by Linux boot-loaders.
          */
  #ifdef CONFIG_EFI
-efi_head:
         /*
          * This add instruction has no meaningful effect except that
          * its opcode forms the magic "MZ" signature required by UEFI.
@@ -83,9 +81,9 @@ efi_head:
         b       stext                           // branch to kernel start, magic
         .long   0                               // reserved
  #endif
-       .quad   _kernel_offset_le               // Image load offset from start of RAM, little-endian
-       .quad   _kernel_size_le                 // Effective size of kernel image, little-endian
-       .quad   _kernel_flags_le                // Informative flags, little-endian
+       le64sym _kernel_offset_le               // Image load offset from start of RAM, little-endian
+       le64sym _kernel_size_le                 // Effective size of kernel image, little-endian
+       le64sym _kernel_flags_le                // Informative flags, little-endian
         .quad   0                               // reserved
         .quad   0                               // reserved
         .quad   0                               // reserved
@@ -94,14 +92,12 @@ efi_head:
         .byte   0x4d
         .byte   0x64
  #ifdef CONFIG_EFI
-       .long   pe_header - efi_head            // Offset to the PE header.
+       .long   pe_header - _head               // Offset to the PE header.
  #else
         .word   0                               // reserved
  #endif
  
  #ifdef CONFIG_EFI
-       .globl  __efistub_stext_offset
-       .set    __efistub_stext_offset, stext - efi_head
         .align 3
  pe_header:
         .ascii  "PE"
@@ -121,11 +117,11 @@ optional_header:
         .short  0x20b                           // PE32+ format
         .byte   0x02                            // MajorLinkerVersion
         .byte   0x14                            // MinorLinkerVersion
-       .long   _end - stext                    // SizeOfCode
+       .long   _end - efi_header_end           // SizeOfCode
         .long   0                               // SizeOfInitializedData
         .long   0                               // SizeOfUninitializedData
-       .long   __efistub_entry - efi_head      // AddressOfEntryPoint
-       .long   __efistub_stext_offset          // BaseOfCode
+       .long   __efistub_entry - _head         // AddressOfEntryPoint
+       .long   efi_header_end - _head          // BaseOfCode
  
  extra_header_fields:
         .quad   0                               // ImageBase
@@ -139,10 +135,10 @@ extra_header_fields:
         .short  0                               // MinorSubsystemVersion
         .long   0                               // Win32VersionValue
  
-       .long   _end - efi_head                 // SizeOfImage
+       .long   _end - _head                    // SizeOfImage
  
         // Everything before the kernel image is considered part of the header
-       .long   __efistub_stext_offset          // SizeOfHeaders
+       .long   efi_header_end - _head          // SizeOfHeaders
         .long   0                               // CheckSum
         .short  0xa                             // Subsystem (EFI application)
         .short  0                               // DllCharacteristics
@@ -186,10 +182,10 @@ section_table:
         .byte   0
         .byte   0
         .byte   0                       // end of 0 padding of section name
-       .long   _end - stext            // VirtualSize
-       .long   __efistub_stext_offset  // VirtualAddress
-       .long   _edata - stext          // SizeOfRawData
-       .long   __efistub_stext_offset  // PointerToRawData
+       .long   _end - efi_header_end   // VirtualSize
+       .long   efi_header_end - _head  // VirtualAddress
+       .long   _edata - efi_header_end // SizeOfRawData
+       .long   efi_header_end - _head  // PointerToRawData
  
         .long   0               // PointerToRelocations (0 for executables)
         .long   0               // PointerToLineNumbers (0 for executables)
@@ -198,19 +194,23 @@ section_table:
         .long   0xe0500020      // Characteristics (section flags)
  
         /*
-        * EFI will load stext onwards at the 4k section alignment
+        * EFI will load .text onwards at the 4k section alignment
          * described in the PE/COFF header. To ensure that instruction
          * sequences using an adrp and a :lo12: immediate will function
-        * correctly at this alignment, we must ensure that stext is
+        * correctly at this alignment, we must ensure that .text is
          * placed at a 4k boundary in the Image to begin with.
          */
         .align 12
+efi_header_end:
  #endif
  
+       __INIT
+
  ENTRY(stext)
         bl      preserve_boot_args
         bl      el2_setup                       // Drop to EL1, w20=cpu_boot_mode
         adrp    x24, __PHYS_OFFSET
+       and     x23, x24, MIN_KIMG_ALIGN - 1    // KASLR offset, defaults to 0
         bl      set_cpu_boot_mode_flag
         bl      __create_page_tables            // x25=TTBR0, x26=TTBR1
         /*
@@ -219,10 +219,10 @@ ENTRY(stext)
          * On return, the CPU will be ready for the MMU to be turned on and
          * the TCR will have been set.
          */
-       ldr     x27, =__mmap_switched           // address to jump to after
+       bl      __cpu_setup                     // initialise processor
+       adr_l   x27, __primary_switch           // address to jump to after
                                                 // MMU has been enabled
-       adr_l   lr, __enable_mmu                // return (PIC) address
-       b       __cpu_setup                     // initialise processor
+       b       __enable_mmu
  ENDPROC(stext)
  
  /*
@@ -311,7 +311,7 @@ ENDPROC(preserve_boot_args)
  __create_page_tables:
         adrp    x25, idmap_pg_dir
         adrp    x26, swapper_pg_dir
-       mov     x27, lr
+       mov     x28, lr
  
         /*
          * Invalidate the idmap and swapper page tables to avoid potential
@@ -333,7 +333,7 @@ __create_page_tables:
         cmp     x0, x6
         b.lo    1b
  
-       ldr     x7, =SWAPPER_MM_MMUFLAGS
+       mov     x7, SWAPPER_MM_MMUFLAGS
  
         /*
          * Create the identity mapping.
@@ -389,10 +389,13 @@ __create_page_tables:
          * Map the kernel image (starting with PHYS_OFFSET).
          */
         mov     x0, x26                         // swapper_pg_dir
-       mov     x5, #PAGE_OFFSET
+       mov_q   x5, KIMAGE_VADDR + TEXT_OFFSET  // compile time __va(_text)
+       add     x5, x5, x23                     // add KASLR displacement
         create_pgd_entry x0, x5, x3, x6
-       ldr     x6, =KERNEL_END                 // __va(KERNEL_END)
-       mov     x3, x24                         // phys offset
+       adrp    x6, _end                        // runtime __pa(_end)
+       adrp    x3, _text                       // runtime __pa(_text)
+       sub     x6, x6, x3                      // _end - _text
+       add     x6, x6, x5                      // runtime __va(_end)
         create_block_map x0, x7, x3, x5, x6
  
         /*
@@ -405,8 +408,7 @@ __create_page_tables:
         dmb     sy
         bl      __inval_cache_range
  
-       mov     lr, x27
-       ret
+       ret     x28
  ENDPROC(__create_page_tables)
         .ltorg
  
@@ -414,30 +416,58 @@ ENDPROC(__create_page_tables)
   * The following fragment of code is executed with the MMU enabled.
   */
         .set    initial_sp, init_thread_union + THREAD_START_SP
-__mmap_switched:
-       adr_l   x6, __bss_start
-       adr_l   x7, __bss_stop
-
-1:     cmp     x6, x7
-       b.hs    2f
-       str     xzr, [x6], #8                   // Clear BSS
-       b       1b
-2:
+__primary_switched:
+       mov     x28, lr                         // preserve LR
+       adr_l   x8, vectors                     // load VBAR_EL1 with virtual
+       msr     vbar_el1, x8                    // vector table address
+       isb
+
+       // Clear BSS
+       adr_l   x0, __bss_start
+       mov     x1, xzr
+       adr_l   x2, __bss_stop
+       sub     x2, x2, x0
+       bl      __pi_memset
+       dsb     ishst                           // Make zero page visible to PTW
+
         adr_l   sp, initial_sp, x4
+       mov     x4, sp
+       and     x4, x4, #~(THREAD_SIZE - 1)
+       msr     sp_el0, x4                      // Save thread_info
         str_l   x21, __fdt_pointer, x5          // Save FDT pointer
-       str_l   x24, memstart_addr, x6          // Save PHYS_OFFSET
+
+       ldr_l   x4, kimage_vaddr                // Save the offset between
+       sub     x4, x4, x24                     // the kernel virtual and
+       str_l   x4, kimage_voffset, x5          // physical mappings
+
         mov     x29, #0
  #ifdef CONFIG_KASAN
         bl      kasan_early_init
+#endif
+#ifdef CONFIG_RANDOMIZE_BASE
+       tst     x23, ~(MIN_KIMG_ALIGN - 1)      // already running randomized?
+       b.ne    0f
+       mov     x0, x21                         // pass FDT address in x0
+       mov     x1, x23                         // pass modulo offset in x1
+       bl      kaslr_early_init                // parse FDT for KASLR options
+       cbz     x0, 0f                          // KASLR disabled? just proceed
+       orr     x23, x23, x0                    // record KASLR offset
+       ret     x28                             // we must enable KASLR, return
+                                               // to __enable_mmu()
+0:
  #endif
         b       start_kernel
-ENDPROC(__mmap_switched)
+ENDPROC(__primary_switched)
  
  /*
   * end early head section, begin head code that is also used for
   * hotplug and needs to have the same protections as the text region
   */
         .section ".text","ax"
+
+ENTRY(kimage_vaddr)
+       .quad           _text - TEXT_OFFSET
+
  /*
   * If we're fortunate enough to boot at EL2, ensure that the world is
   * sane before dropping to EL1.
@@ -543,7 +573,7 @@ ENDPROC(el2_setup)
   * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
   * in x20. See arch/arm64/include/asm/virt.h for more info.
   */
-ENTRY(set_cpu_boot_mode_flag)
+set_cpu_boot_mode_flag:
         adr_l   x1, __boot_cpu_mode
         cmp     w20, #BOOT_CPU_MODE_EL2
         b.ne    1f
@@ -576,7 +606,7 @@ ENTRY(secondary_holding_pen)
         bl      el2_setup                       // Drop to EL1, w20=cpu_boot_mode
         bl      set_cpu_boot_mode_flag
         mrs     x0, mpidr_el1
-       ldr     x1, =MPIDR_HWID_BITMASK
+       mov_q   x1, MPIDR_HWID_BITMASK
         and     x0, x0, x1
         adr_l   x3, secondary_holding_pen_release
  pen:   ldr     x4, [x3]
@@ -596,7 +626,7 @@ ENTRY(secondary_entry)
         b       secondary_startup
  ENDPROC(secondary_entry)
  
-ENTRY(secondary_startup)
+secondary_startup:
         /*
          * Common entry point for secondary CPUs.
          */
@@ -604,14 +634,19 @@ ENTRY(secondary_startup)
         adrp    x26, swapper_pg_dir
         bl      __cpu_setup                     // initialise processor
  
-       ldr     x21, =secondary_data
-       ldr     x27, =__secondary_switched      // address to jump to after enabling the MMU
+       adr_l   x27, __secondary_switch         // address to jump to after enabling the MMU
         b       __enable_mmu
  ENDPROC(secondary_startup)
  
-ENTRY(__secondary_switched)
-       ldr     x0, [x21]                       // get secondary_data.stack
+__secondary_switched:
+       adr_l   x5, vectors
+       msr     vbar_el1, x5
+       isb
+
+       ldr_l   x0, secondary_data              // get secondary_data.stack
         mov     sp, x0
+       and     x0, x0, #~(THREAD_SIZE - 1)
+       msr     sp_el0, x0                      // save thread_info
         mov     x29, #0
         b       secondary_start_kernel
  ENDPROC(__secondary_switched)
@@ -628,13 +663,12 @@ ENDPROC(__secondary_switched)
   * If it isn't, park the CPU
   */
         .section        ".idmap.text", "ax"
-__enable_mmu:
+ENTRY(__enable_mmu)
+       mrs     x18, sctlr_el1                  // preserve old SCTLR_EL1 value
         mrs     x1, ID_AA64MMFR0_EL1
         ubfx    x2, x1, #ID_AA64MMFR0_TGRAN_SHIFT, 4
         cmp     x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
         b.ne    __no_granule_support
-       ldr     x5, =vectors
-       msr     vbar_el1, x5
         msr     ttbr0_el1, x25                  // load TTBR0
         msr     ttbr1_el1, x26                  // load TTBR1
         isb
@@ -648,6 +682,25 @@ __enable_mmu:
         ic      iallu
         dsb     nsh
         isb
+#ifdef CONFIG_RANDOMIZE_BASE
+       mov     x19, x0                         // preserve new SCTLR_EL1 value
+       blr     x27
+
+       /*
+        * If we return here, we have a KASLR displacement in x23 which we need
+        * to take into account by discarding the current kernel mapping and
+        * creating a new one.
+        */
+       msr     sctlr_el1, x18                  // disable the MMU
+       isb
+       bl      __create_page_tables            // recreate kernel mapping
+
+       msr     sctlr_el1, x19                  // re-enable the MMU
+       isb
+       ic      iallu                           // flush instructions fetched
+       dsb     nsh                             // via old mapping
+       isb
+#endif
         br      x27
  ENDPROC(__enable_mmu)
  
@@ -655,3 +708,38 @@ __no_granule_support:
         wfe
         b __no_granule_support
  ENDPROC(__no_granule_support)
+
+__primary_switch:
+#ifdef CONFIG_RELOCATABLE
+       /*
+        * Iterate over each entry in the relocation table, and apply the
+        * relocations in place.
+        *
+        * Each entry is read as three 64-bit words (24 bytes). Only entries
+        * whose second word matches R_AARCH64_RELATIVE in its low 32 bits are
+        * applied; every other entry is skipped. x23 is the displacement that
+        * is added to KIMAGE_VADDR below.
+        */
+       ldr     w9, =__rela_offset              // offset to reloc table
+       ldr     w10, =__rela_size               // size of reloc table
+
+       mov_q   x11, KIMAGE_VADDR               // default virtual offset
+       add     x11, x11, x23                   // actual virtual offset
+       add     x9, x9, x11                     // __va(.rela)
+       add     x10, x9, x10                    // __va(.rela) + sizeof(.rela)
+
+0:     cmp     x9, x10                         // reached the end of the table?
+       b.hs    1f                              // yes: all entries processed
+       ldp     x11, x12, [x9], #24             // words 0 and 1; advance x9 to next entry
+       ldr     x13, [x9, #-8]                  // word 2 of the entry just read
+       cmp     w12, #R_AARCH64_RELATIVE        // is this a relative relocation?
+       b.ne    0b                              // no: skip this entry
+       add     x13, x13, x23                   // relocate
+       str     x13, [x11, x23]                 // store the result at address x11 + x23
+       b       0b
+
+1:
+#endif
+       ldr     x8, =__primary_switched         // address of __primary_switched (literal load)
+       br      x8                              // continue there; does not return here
+ENDPROC(__primary_switch)
+
+__secondary_switch:
+       ldr     x8, =__secondary_switched       // address of __secondary_switched (literal load)
+       br      x8                              // continue there; does not return here
+ENDPROC(__secondary_switch)
diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S

new file mode 100644 (file)

index 0000000..46f29b6
--- /dev/null
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -0,0 +1,176 @@
+/*
+ * Hibernate low-level support
+ *
+ * Copyright (C) 2016 ARM Ltd.
+ * Author:     James Morse <james.morse@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/linkage.h>
+#include <linux/errno.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+#include <asm/cputype.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+#include <asm/virt.h>
+
+/*
+ * To prevent the possibility of old and new partial table walks being visible
+ * in the tlb, switch the ttbr to a zero page when we invalidate the old
+ * records. D4.7.1 'General TLB maintenance requirements' in ARM DDI 0487A.i
+ * Even switching to our copied tables will cause a changed output address at
+ * each stage of the walk.
+ */
+.macro break_before_make_ttbr_switch zero_page, page_table
+       msr     ttbr1_el1, \zero_page           /* "break": point ttbr1 at the zero page first */
+       isb
+       tlbi    vmalle1is                       /* invalidate the old records */
+       dsb     ish                             /* wait for the invalidation to finish */
+       msr     ttbr1_el1, \page_table          /* "make": install the new tables */
+       isb
+.endm
+
+
+/*
+ * Resume from hibernate
+ *
+ * Loads temporary page tables then restores the memory image.
+ * Finally branches to cpu_resume() to restore the state saved by
+ * swsusp_arch_suspend().
+ *
+ * Because this code has to be copied to a 'safe' page, it can't call out to
+ * other functions by PC-relative address. Also remember that it may be
+ * mid-way through over-writing other functions. For this reason it contains
+ * code from flush_icache_range() and uses the copy_page() macro.
+ *
+ * This 'safe' page is mapped via ttbr0, and executed from there. This function
+ * switches to a copy of the linear map in ttbr1, performs the restore, then
+ * switches ttbr1 to the original kernel's swapper_pg_dir.
+ *
+ * All of memory gets written to, including code. We need to clean the kernel
+ * text to the Point of Coherence (PoC) before secondary cores can be booted.
+ * Because the kernel modules and executable pages mapped to user space are
+ * also written as data, we clean all pages we touch to the Point of
+ * Unification (PoU).
+ *
+ * x0: physical address of temporary page tables
+ * x1: physical address of swapper page tables
+ * x2: address of cpu_resume
+ * x3: linear map address of restore_pblist in the current kernel
+ * x4: physical address of __hyp_stub_vectors, or 0
+ * x5: physical address of a zero page that remains zero after resume
+ */
+.pushsection    ".hibernate_exit.text", "ax"
+ENTRY(swsusp_arch_suspend_exit)
+       /*
+        * We execute from ttbr0, change ttbr1 to our copied linear map tables
+        * with a break-before-make via the zero page
+        */
+       break_before_make_ttbr_switch   x5, x0
+
+       mov     x21, x1         /* x21 = physical address of swapper page tables */
+       mov     x30, x2         /* the final 'ret' will branch to x2 (cpu_resume) */
+       mov     x24, x4         /* x24 = physical address of __hyp_stub_vectors, or 0 */
+       mov     x25, x5         /* x25 = physical address of the zero page */
+
+       /* walk the restore_pblist and use copy_page() to over-write memory */
+       mov     x19, x3         /* x19 = current restore_pblist entry */
+
+1:     ldr     x10, [x19, #HIBERN_PBE_ORIG]    /* x10 = page that gets written and cleaned below */
+       mov     x0, x10
+       ldr     x1, [x19, #HIBERN_PBE_ADDR]     /* second copy_page operand: presumably the source */
+
+       copy_page       x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
+
+       add     x1, x10, #PAGE_SIZE     /* x1 = end of the page in x10 */
+       /* Clean the copied page to PoU - based on flush_icache_range() */
+       dcache_line_size x2, x3
+       sub     x3, x2, #1
+       bic     x4, x10, x3     /* x4 = x10 with the bits set in x3 cleared */
+2:     dc      cvau, x4        /* clean D line / unified line */
+       add     x4, x4, x2
+       cmp     x4, x1
+       b.lo    2b              /* loop until x4 reaches the end of the page */
+
+       ldr     x19, [x19, #HIBERN_PBE_NEXT]
+       cbnz    x19, 1b         /* a zero next pointer ends the list */
+       dsb     ish             /* wait for PoU cleaning to finish */
+
+       /* switch to the restored kernel's page tables */
+       break_before_make_ttbr_switch   x25, x21
+
+       ic      ialluis
+       dsb     ish
+       isb
+
+       cbz     x24, 3f         /* Do we need to re-initialise EL2? */
+       hvc     #0              /* expected to reach el1_sync below, which loads vbar_el2 from x24 */
+3:     ret
+
+       .ltorg
+ENDPROC(swsusp_arch_suspend_exit)
+
+/*
+ * Restore the hyp stub.
+ * This must be done before the hibernate page is unmapped by _cpu_resume(),
+ * but happens before any of the hyp-stub's code is cleaned to PoC.
+ *
+ * x24: The physical address of __hyp_stub_vectors
+ */
+el1_sync:
+       msr     vbar_el2, x24   /* point the EL2 vector base at the address in x24 */
+       eret                    /* exception return to the interrupted context */
+ENDPROC(el1_sync)
+
+.macro invalid_vector  label
+\label:
+       b \label                /* branch to self: spin here forever */
+ENDPROC(\label)
+.endm
+
+       invalid_vector  el2_sync_invalid
+       invalid_vector  el2_irq_invalid
+       invalid_vector  el2_fiq_invalid
+       invalid_vector  el2_error_invalid
+       invalid_vector  el1_sync_invalid
+       invalid_vector  el1_irq_invalid
+       invalid_vector  el1_fiq_invalid
+       invalid_vector  el1_error_invalid
+
+/*
+ * el2 vectors - switch el2 here while we restore the memory image.
+ * Only the "Synchronous 64-bit EL1" slot has a real handler (el1_sync);
+ * every other slot points at one of the spinning *_invalid stubs.
+ */
+       .align 11               /* 2^11 = 2048-byte alignment */
+ENTRY(hibernate_el2_vectors)
+       ventry  el2_sync_invalid                // Synchronous EL2t
+       ventry  el2_irq_invalid                 // IRQ EL2t
+       ventry  el2_fiq_invalid                 // FIQ EL2t
+       ventry  el2_error_invalid               // Error EL2t
+
+       ventry  el2_sync_invalid                // Synchronous EL2h
+       ventry  el2_irq_invalid                 // IRQ EL2h
+       ventry  el2_fiq_invalid                 // FIQ EL2h
+       ventry  el2_error_invalid               // Error EL2h
+
+       ventry  el1_sync                        // Synchronous 64-bit EL1
+       ventry  el1_irq_invalid                 // IRQ 64-bit EL1
+       ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
+       ventry  el1_error_invalid               // Error 64-bit EL1
+
+       ventry  el1_sync_invalid                // Synchronous 32-bit EL1
+       ventry  el1_irq_invalid                 // IRQ 32-bit EL1
+       ventry  el1_fiq_invalid                 // FIQ 32-bit EL1
+       ventry  el1_error_invalid               // Error 32-bit EL1
+END(hibernate_el2_vectors)
+
+.popsection
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c

new file mode 100644 (file)

index 0000000..f8df75d
--- /dev/null
+++ b/arch/arm64/kernel/hibernate.c
@@ -0,0 +1,487 @@
+/*
+ * Hibernate support specific for ARM64
+ *
+ * Derived from work on ARM hibernation support by:
+ *
+ * Ubuntu project, hibernation support for mach-dove
+ * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu)
+ * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.)
+ *  https://lkml.org/lkml/2010/6/18/4
+ *  https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html
+ *  https://patchwork.kernel.org/patch/96442/
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#define pr_fmt(x) "hibernate: " x
+#include <linux/kvm_host.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/pm.h>
+#include <linux/sched.h>
+#include <linux/suspend.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+#include <asm/barrier.h>
+#include <asm/cacheflush.h>
+#include <asm/irqflags.h>
+#include <asm/memory.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/pgtable-hwdef.h>
+#include <asm/sections.h>
+#include <asm/suspend.h>
+#include <asm/virt.h>
+
+/*
+ * Hibernate core relies on this value being 0 on resume, and marks it
+ * __nosavedata assuming it will keep the resume kernel's '0' value. This
+ * doesn't happen with KASLR.
+ *
+ * defined as "__visible int in_suspend __nosavedata" in
+ * kernel/power/hibernate.c
+ */
+extern int in_suspend;
+
+/* Find a symbol's alias in the linear map */
+#define LMADDR(x)      phys_to_virt(virt_to_phys(x))
+
+/* Do we need to reset el2? */
+#define el2_reset_needed() (is_hyp_mode_available() && !is_kernel_in_hyp_mode())
+
+/*
+ * Start/end of the hibernate exit code, this must be copied to a 'safe'
+ * location in memory, and executed from there.
+ */
+extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[];
+
+/* temporary el2 vectors in the __hibernate_exit_text section. */
+extern char hibernate_el2_vectors[];
+
+/* hyp-stub vectors, used to restore el2 during resume from hibernate. */
+extern char __hyp_stub_vectors[];
+
+/*
+ * Values that may not change over hibernate/resume. We put the build number
+ * and date in here so that we guarantee not to resume with a different
+ * kernel.
+ */
+struct arch_hibernate_hdr_invariants {
+       char            uts_version[__NEW_UTS_LEN + 1]; /* filled from init_utsname()->version */
+};
+
+/* These values need to be known across a hibernate/restore. */
+static struct arch_hibernate_hdr {
+       struct arch_hibernate_hdr_invariants invariants;        /* must match the resuming kernel */
+
+       /* These are needed to find the relocated kernel if built with kaslr */
+       phys_addr_t     ttbr1_el1;              /* physical address of swapper_pg_dir */
+       void            (*reenter_kernel)(void);        /* set to _cpu_resume when saving */
+
+       /*
+        * We need to know where the __hyp_stub_vectors are after restore to
+        * re-configure el2.
+        */
+       phys_addr_t     __hyp_stub_vectors;     /* physical address; 0 when el2 needs no reset */
+} resume_hdr;
+
+static inline void arch_hdr_invariants(struct arch_hibernate_hdr_invariants *i)
+{      /* fill *i with the running kernel's invariant values */
+       memset(i, 0, sizeof(*i));       /* zero first: the struct is later compared with memcmp() */
+       memcpy(i->uts_version, init_utsname()->version, sizeof(i->uts_version));
+}
+
+/*
+ * Report whether @pfn lies inside the range delimited by the __nosave_begin
+ * and __nosave_end symbols.
+ *
+ * Returns 1 when @pfn is between virt_to_pfn(&__nosave_begin) and
+ * virt_to_pfn(&__nosave_end - 1), both inclusive, and 0 otherwise.
+ */
+int pfn_is_nosave(unsigned long pfn)
+{
+       unsigned long first_pfn = virt_to_pfn(&__nosave_begin);
+       unsigned long last_pfn = virt_to_pfn(&__nosave_end - 1);
+
+       if (pfn < first_pfn)
+               return 0;
+
+       return pfn <= last_pfn;
+}
+
+void notrace save_processor_state(void)
+{      /* saves nothing; only checks the CPU count */
+       WARN_ON(num_online_cpus() != 1);        /* warn unless exactly one CPU is online */
+}
+
+void notrace restore_processor_state(void)
+{      /* empty body: nothing is done here */
+}
+
+/*
+ * Fill in the architecture header that is stored with the hibernate image.
+ *
+ * @addr:     buffer that receives a struct arch_hibernate_hdr
+ * @max_size: number of bytes available at @addr
+ *
+ * Returns 0 on success, or -EOVERFLOW if @max_size is smaller than the header.
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+       struct arch_hibernate_hdr *hdr = addr;
+       phys_addr_t stub_vectors = 0;
+
+       if (sizeof(*hdr) > max_size)
+               return -EOVERFLOW;
+
+       arch_hdr_invariants(&hdr->invariants);
+       hdr->ttbr1_el1          = virt_to_phys(swapper_pg_dir);
+       hdr->reenter_kernel     = _cpu_resume;
+
+       /* We can't use __hyp_get_vectors() because kvm may still be loaded */
+       if (el2_reset_needed())
+               stub_vectors = virt_to_phys(__hyp_stub_vectors);
+       hdr->__hyp_stub_vectors = stub_vectors;
+
+       return 0;
+}
+EXPORT_SYMBOL(arch_hibernation_header_save);
+
+/*
+ * Validate the architecture header read back from a hibernate image and keep
+ * a copy of it in resume_hdr.
+ *
+ * @addr: points at the struct arch_hibernate_hdr taken from the image
+ *
+ * Returns 0 on success, or -EINVAL if the header's invariants differ from
+ * the ones computed for the running kernel.
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+       struct arch_hibernate_hdr *hdr = addr;
+       struct arch_hibernate_hdr_invariants expected;
+
+       arch_hdr_invariants(&expected);
+       if (memcmp(&hdr->invariants, &expected, sizeof(expected)) != 0) {
+               pr_crit("Hibernate image not generated by this kernel!\n");
+               return -EINVAL;
+       }
+
+       resume_hdr = *hdr;
+
+       return 0;
+}
+EXPORT_SYMBOL(arch_hibernation_header_restore);
+
+/*
+ * Copies length bytes, starting at src_start, into a new page,
+ * performs cache maintenance, then maps it at the specified low address
+ * as executable.
+ *
+ * This is used by hibernate to copy the code it needs to execute when
+ * overwriting the kernel text. This function generates a new set of page
+ * tables, which it loads into ttbr0.
+ *
+ * Length is provided as we probably only want 4K of data, even on a 64K
+ * page system.
+ *
+ * @src_start:     start of the code to copy
+ * @length:        number of bytes to copy
+ * @dst_addr:      virtual address at which the copy gets mapped
+ * @phys_dst_addr: on success, receives the physical address of the copy
+ * @allocator:     called once for the copy and once for each page-table
+ *                 level that has to be created; NULL means failure
+ * @mask:          gfp flags handed to every @allocator call
+ *
+ * Returns 0 on success, or -ENOMEM as soon as any allocation fails. Nothing
+ * obtained from @allocator is released here on failure, and ttbr0 is only
+ * written on success.
+ */
+static int create_safe_exec_page(void *src_start, size_t length,
+                                unsigned long dst_addr,
+                                phys_addr_t *phys_dst_addr,
+                                void *(*allocator)(gfp_t mask),
+                                gfp_t mask)
+{
+       int rc = 0;
+       pgd_t *trans_pgd;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned long dst = (unsigned long)allocator(mask);
+
+       if (!dst) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       memcpy((void *)dst, src_start, length);
+       flush_icache_range(dst, dst + length);
+
+       /*
+        * The top-level table has to be checked like every other allocation:
+        * pgd_none() below dereferences an entry inside it.
+        */
+       trans_pgd = allocator(mask);
+       if (!trans_pgd) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       pgd = pgd_offset_raw(trans_pgd, dst_addr);
+       if (pgd_none(*pgd)) {
+               pud = allocator(mask);
+               if (!pud) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+               pgd_populate(&init_mm, pgd, pud);
+       }
+
+       pud = pud_offset(pgd, dst_addr);
+       if (pud_none(*pud)) {
+               pmd = allocator(mask);
+               if (!pmd) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+               pud_populate(&init_mm, pud, pmd);
+       }
+
+       pmd = pmd_offset(pud, dst_addr);
+       if (pmd_none(*pmd)) {
+               pte = allocator(mask);
+               if (!pte) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+               pmd_populate_kernel(&init_mm, pmd, pte);
+       }
+
+       pte = pte_offset_kernel(pmd, dst_addr);
+       set_pte(pte, __pte(virt_to_phys((void *)dst) |
+                        pgprot_val(PAGE_KERNEL_EXEC)));
+
+       /*
+        * Load our new page tables. ttbr0 takes the base of the top-level
+        * table, not the address of the entry that covers dst_addr; the two
+        * only coincide when dst_addr falls in the first pgd slot.
+        */
+       asm volatile("msr       ttbr0_el1, %0;"
+                    "isb;"
+                    "tlbi      vmalle1is;"
+                    "dsb       ish;"
+                    "isb" : : "r"(virt_to_phys(trans_pgd)));
+
+       *phys_dst_addr = virt_to_phys((void *)dst);
+
+out:
+       return rc;
+}
+
+
+int swsusp_arch_suspend(void)
+{      /* create the hibernate image; the else branch handles the restored-memory case */
+       int ret = 0;
+       unsigned long flags;
+       struct sleep_stack_data state;
+
+       local_dbg_save(flags);
+
+       if (__cpu_suspend_enter(&state)) {      /* non-zero: go on and save the image */
+               ret = swsusp_save();
+       } else {        /* zero: memory has just been restored (see below) */
+               /* Clean kernel to PoC for secondary core startup */
+               __flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START);
+
+               /*
+                * Tell the hibernation core that we've just restored
+                * the memory
+                */
+               in_suspend = 0;
+
+               __cpu_suspend_exit();
+       }
+
+       local_dbg_restore(flags);
+
+       return ret;     /* swsusp_save() result on the save path, 0 otherwise */
+}
+
+static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
+                   unsigned long end)
+{      /* copy the ptes for [start, end) into a fresh table; 0 or -ENOMEM */
+       pte_t *src_pte;
+       pte_t *dst_pte;
+       unsigned long addr = start;
+
+       dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC);   /* new pte table for the copy */
+       if (!dst_pte)
+               return -ENOMEM;
+       pmd_populate_kernel(&init_mm, dst_pmd, dst_pte);        /* hook it into dst_pmd */
+       dst_pte = pte_offset_kernel(dst_pmd, start);
+
+       src_pte = pte_offset_kernel(src_pmd, start);
+       do {
+               if (!pte_none(*src_pte))
+                       /*
+                        * Resume will overwrite areas that may be marked
+                        * read only (code, rodata). Clear the RDONLY bit from
+                        * the temporary mappings we use during restore.
+                        */
+                       set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY));
+       } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); /* one page per step */
+
+       return 0;
+}
+
+static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start,
+                   unsigned long end)
+{      /* copy the pmd level for [start, end), descending into pte tables; 0 or -ENOMEM */
+       pmd_t *src_pmd;
+       pmd_t *dst_pmd;
+       unsigned long next;
+       unsigned long addr = start;
+
+       if (pud_none(*dst_pud)) {       /* allocate the destination pmd table on demand */
+               dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+               if (!dst_pmd)
+                       return -ENOMEM;
+               pud_populate(&init_mm, dst_pud, dst_pmd);
+       }
+       dst_pmd = pmd_offset(dst_pud, start);
+
+       src_pmd = pmd_offset(src_pud, start);
+       do {
+               next = pmd_addr_end(addr, end);
+               if (pmd_none(*src_pmd))
+                       continue;       /* jumps to the while() below, so both pointers still advance */
+               if (pmd_table(*src_pmd)) {      /* entry points to a pte table: copy it */
+                       if (copy_pte(dst_pmd, src_pmd, addr, next))
+                               return -ENOMEM;
+               } else {        /* not a table: copy the entry itself with RDONLY cleared */
+                       set_pmd(dst_pmd,
+                               __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY));
+               }
+       } while (dst_pmd++, src_pmd++, addr = next, addr != end);
+
+       return 0;
+}
+
+static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start,
+                   unsigned long end)
+{      /* copy the pud level for [start, end), descending into pmd tables; 0 or -ENOMEM */
+       pud_t *dst_pud;
+       pud_t *src_pud;
+       unsigned long next;
+       unsigned long addr = start;
+
+       if (pgd_none(*dst_pgd)) {       /* allocate the destination pud table on demand */
+               dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+               if (!dst_pud)
+                       return -ENOMEM;
+               pgd_populate(&init_mm, dst_pgd, dst_pud);
+       }
+       dst_pud = pud_offset(dst_pgd, start);
+
+       src_pud = pud_offset(src_pgd, start);
+       do {
+               next = pud_addr_end(addr, end);
+               if (pud_none(*src_pud))
+                       continue;       /* jumps to the while() below, so both pointers still advance */
+               if (pud_table(*(src_pud))) {    /* entry points to a pmd table: copy it */
+                       if (copy_pmd(dst_pud, src_pud, addr, next))
+                               return -ENOMEM;
+               } else {        /* NOTE(review): PMD_SECT_RDONLY used on a pud value - presumably the same bit; confirm */
+                       set_pud(dst_pud,
+                               __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY));
+               }
+       } while (dst_pud++, src_pud++, addr = next, addr != end);
+
+       return 0;
+}
+
+static int copy_page_tables(pgd_t *dst_pgd, unsigned long start,
+                           unsigned long end)
+{      /* copy the kernel's tables for [start, end) into dst_pgd; 0 or -ENOMEM */
+       unsigned long next;
+       unsigned long addr = start;
+       pgd_t *src_pgd = pgd_offset_k(start);   /* source: the kernel's own tables */
+
+       dst_pgd = pgd_offset_raw(dst_pgd, start);
+       do {
+               next = pgd_addr_end(addr, end);
+               if (pgd_none(*src_pgd))
+                       continue;       /* jumps to the while() below, so both pointers still advance */
+               if (copy_pud(dst_pgd, src_pgd, addr, next))
+                       return -ENOMEM;
+       } while (dst_pgd++, src_pgd++, addr = next, addr != end);       /* the caller in this file passes end == 0 */
+
+       return 0;
+}
+
+/*
+ * Setup then Resume from the hibernate image using swsusp_arch_suspend_exit().
+ *
+ * Memory allocated by get_safe_page() will be dealt with by the hibernate code,
+ * we don't need to free it here.
+ *
+ * Returns a negative errno if any part of the setup fails. On success control
+ * is handed to the relocated exit code, which is declared __noreturn.
+ */
+int swsusp_arch_resume(void)
+{
+       int rc = 0;
+       void *zero_page;
+       size_t exit_size;
+       pgd_t *tmp_pg_dir;
+       void *lm_restore_pblist;
+       phys_addr_t phys_hibernate_exit;
+       void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
+                                         void *, phys_addr_t, phys_addr_t);
+
+       /*
+        * Locate the exit code in the bottom-but-one page, so that *NULL
+        * still has disastrous effects.
+        */
+       hibernate_exit = (void *)PAGE_SIZE;
+       exit_size = __hibernate_exit_text_end - __hibernate_exit_text_start;
+       /*
+        * Copy swsusp_arch_suspend_exit() to a safe page. This will generate
+        * a new set of ttbr0 page tables and load them.
+        */
+       rc = create_safe_exec_page(__hibernate_exit_text_start, exit_size,
+                                  (unsigned long)hibernate_exit,
+                                  &phys_hibernate_exit,
+                                  (void *)get_safe_page, GFP_ATOMIC);
+       if (rc) {
+               pr_err("Failed to create safe executable page for hibernate_exit code.\n");
+               goto out;
+       }
+
+       /*
+        * The hibernate exit text contains a set of el2 vectors, that will
+        * be executed at el2 with the mmu off in order to reload hyp-stub.
+        */
+       __flush_dcache_area(hibernate_exit, exit_size);
+
+       /*
+        * Restoring the memory image will overwrite the ttbr1 page tables.
+        * Create a second copy of just the linear map, and use this when
+        * restoring.
+        */
+       tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+       if (!tmp_pg_dir) {
+               pr_err("Failed to allocate memory for temporary page tables.\n");
+               rc = -ENOMEM;
+               goto out;
+       }
+       rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
+       if (rc)
+               goto out;
+
+       /*
+        * We need a zero page that is zero before & after resume in order
+        * to break before make on the ttbr1 page tables.
+        *
+        * Allocate it before the el2 vectors are touched, so that a failure
+        * here leaves el2 as it was, and never pass virt_to_phys(NULL) on
+        * to the exit code.
+        */
+       zero_page = (void *)get_safe_page(GFP_ATOMIC);
+       if (!zero_page) {
+               pr_err("Failed to allocate zero page.\n");
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       /*
+        * Since we only copied the linear map, we need to find restore_pblist's
+        * linear map address.
+        */
+       lm_restore_pblist = LMADDR(restore_pblist);
+
+       /*
+        * KASLR will cause the el2 vectors to be in a different location in
+        * the resumed kernel. Load hibernate's temporary copy into el2.
+        *
+        * We can skip this step if we booted at EL1, or are running with VHE.
+        */
+       if (el2_reset_needed()) {
+               phys_addr_t el2_vectors = phys_hibernate_exit;  /* base */
+               el2_vectors += hibernate_el2_vectors -
+                              __hibernate_exit_text_start;     /* offset */
+
+               __hyp_set_vectors(el2_vectors);
+       }
+
+       hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1,
+                      resume_hdr.reenter_kernel, lm_restore_pblist,
+                      resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page));
+
+out:
+       return rc;
+}
+
+/*
+ * PM notifier callback: refuse to prepare for hibernation while CPU0 is
+ * offline.
+ *
+ * Returns notifier_from_errno(-ENODEV) when @action is PM_HIBERNATION_PREPARE
+ * and the first CPU in cpu_online_mask is not CPU0; NOTIFY_OK in every other
+ * case. @nb and @ptr are not used.
+ */
+static int check_boot_cpu_online_pm_callback(struct notifier_block *nb,
+                                            unsigned long action, void *ptr)
+{
+       if (action != PM_HIBERNATION_PREPARE)
+               return NOTIFY_OK;
+
+       if (cpumask_first(cpu_online_mask) == 0)
+               return NOTIFY_OK;
+
+       pr_warn("CPU0 is offline.\n");
+       return notifier_from_errno(-ENODEV);
+}
+
+static int __init check_boot_cpu_online_init(void)
+{      /* hook check_boot_cpu_online_pm_callback() up as a pm notifier */
+       /*
+        * Set this pm_notifier callback with a lower priority than
+        * cpu_hotplug_pm_callback, so that cpu_hotplug_pm_callback will be
+        * called earlier to disable cpu hotplug before the cpu online check.
+        */
+       pm_notifier(check_boot_cpu_online_pm_callback, -INT_MAX);       /* -INT_MAX is the priority */
+
+       return 0;       /* unconditionally reports success */
+}
+core_initcall(check_boot_cpu_online_init);
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c

index b45c95d34b8323e74992e0a4a56e6da0e1257c60..367a954f9937979c574a805bab2e3cbad706d202 100644 (file)
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -24,6 +24,7 @@
  #include <linux/cpu_pm.h>
  #include <linux/errno.h>
  #include <linux/hw_breakpoint.h>
+#include <linux/kprobes.h>
  #include <linux/perf_event.h>
  #include <linux/ptrace.h>
  #include <linux/smp.h>
@@ -127,6 +128,7 @@ static u64 read_wb_reg(int reg, int n)
  
         return val;
  }
+NOKPROBE_SYMBOL(read_wb_reg);
  
  static void write_wb_reg(int reg, int n, u64 val)
  {
@@ -140,6 +142,7 @@ static void write_wb_reg(int reg, int n, u64 val)
         }
         isb();
  }
+NOKPROBE_SYMBOL(write_wb_reg);
  
  /*
   * Convert a breakpoint privilege level to the corresponding exception
@@ -157,6 +160,7 @@ static enum dbg_active_el debug_exception_level(int privilege)
                 return -EINVAL;
         }
  }
+NOKPROBE_SYMBOL(debug_exception_level);
  
  enum hw_breakpoint_ops {
         HW_BREAKPOINT_INSTALL,
@@ -575,6 +579,7 @@ static void toggle_bp_registers(int reg, enum dbg_active_el el, int enable)
                 write_wb_reg(reg, i, ctrl);
         }
  }
+NOKPROBE_SYMBOL(toggle_bp_registers);
  
  /*
   * Debug exception handlers.
@@ -654,6 +659,7 @@ unlock:
  
         return 0;
  }
+NOKPROBE_SYMBOL(breakpoint_handler);
  
  static int watchpoint_handler(unsigned long addr, unsigned int esr,
                               struct pt_regs *regs)
@@ -756,6 +762,7 @@ unlock:
  
         return 0;
  }
+NOKPROBE_SYMBOL(watchpoint_handler);
  
  /*
   * Handle single-step exception.
@@ -813,6 +820,7 @@ int reinstall_suspended_bps(struct pt_regs *regs)
  
         return !handled_exception;
  }
+NOKPROBE_SYMBOL(reinstall_suspended_bps);
  
  /*
   * Context-switcher for restoring suspended breakpoints.
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S

index a272f335c289dcb5f52144c815edf6938757a218..8727f44907725445efd25735b9638672a2599cb0 100644 (file)
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -22,6 +22,8 @@
  #include <linux/irqchip/arm-gic-v3.h>
  
  #include <asm/assembler.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
  #include <asm/ptrace.h>
  #include <asm/virt.h>
  
@@ -53,15 +55,26 @@ ENDPROC(__hyp_stub_vectors)
         .align 11
  
  el1_sync:
-       mrs     x1, esr_el2
-       lsr     x1, x1, #26
-       cmp     x1, #0x16
-       b.ne    2f                              // Not an HVC trap
-       cbz     x0, 1f
-       msr     vbar_el2, x0                    // Set vbar_el2
-       b       2f
-1:     mrs     x0, vbar_el2                    // Return vbar_el2
-2:     eret
+       mrs     x30, esr_el2
+       lsr     x30, x30, #ESR_ELx_EC_SHIFT
+
+       cmp     x30, #ESR_ELx_EC_HVC64
+       b.ne    9f                              // Not an HVC trap
+
+       cmp     x0, #HVC_GET_VECTORS
+       b.ne    1f
+       mrs     x0, vbar_el2
+       b       9f
+
+1:     cmp     x0, #HVC_SET_VECTORS
+       b.ne    2f
+       msr     vbar_el2, x1
+       b       9f
+
+       /* Someone called kvm_call_hyp() against the hyp-stub... */
+2:     mov     x0, #ARM_EXCEPTION_HYP_GONE
+
+9:     eret
  ENDPROC(el1_sync)
  
  .macro invalid_vector  label
@@ -101,10 +114,18 @@ ENDPROC(\label)
   */
  
  ENTRY(__hyp_get_vectors)
-       mov     x0, xzr
-       // fall through
-ENTRY(__hyp_set_vectors)
+       str     lr, [sp, #-16]!
+       mov     x0, #HVC_GET_VECTORS
         hvc     #0
+       ldr     lr, [sp], #16
         ret
  ENDPROC(__hyp_get_vectors)
+
+ENTRY(__hyp_set_vectors)
+       str     lr, [sp, #-16]!
+       mov     x1, x0
+       mov     x0, #HVC_SET_VECTORS
+       hvc     #0
+       ldr     lr, [sp], #16
+       ret
  ENDPROC(__hyp_set_vectors)
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h

index bc2abb8b1599576ae2dec02bce0c46c48fc707dd..f0be31f1dd4515a2195c153556afae1b2c002d06 100644 (file)
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -26,31 +26,40 @@
   * There aren't any ELF relocations we can use to endian-swap values known only
   * at link time (e.g. the subtraction of two symbol addresses), so we must get
   * the linker to endian-swap certain values before emitting them.
+ *
+ * Note that, in order for this to work when building the ELF64 PIE executable
+ * (for KASLR), these values should not be referenced via R_AARCH64_ABS64
+ * relocations, since these are fixed up at runtime rather than at build time
+ * when PIE is in effect. So we need to split them up in 32-bit high and low
+ * words.
   */
  #ifdef CONFIG_CPU_BIG_ENDIAN
-#define DATA_LE64(data)                                        \
-       ((((data) & 0x00000000000000ff) << 56) |        \
-        (((data) & 0x000000000000ff00) << 40) |        \
-        (((data) & 0x0000000000ff0000) << 24) |        \
-        (((data) & 0x00000000ff000000) << 8)  |        \
-        (((data) & 0x000000ff00000000) >> 8)  |        \
-        (((data) & 0x0000ff0000000000) >> 24) |        \
-        (((data) & 0x00ff000000000000) >> 40) |        \
-        (((data) & 0xff00000000000000) >> 56))
+#define DATA_LE32(data)                                \
+       ((((data) & 0x000000ff) << 24) |        \
+        (((data) & 0x0000ff00) << 8)  |        \
+        (((data) & 0x00ff0000) >> 8)  |        \
+        (((data) & 0xff000000) >> 24))
  #else
-#define DATA_LE64(data) ((data) & 0xffffffffffffffff)
+#define DATA_LE32(data) ((data) & 0xffffffff)
  #endif
  
+#define DEFINE_IMAGE_LE64(sym, data)                           \
+       sym##_lo32 = DATA_LE32((data) & 0xffffffff);            \
+       sym##_hi32 = DATA_LE32((data) >> 32)
+
  #ifdef CONFIG_CPU_BIG_ENDIAN
-#define __HEAD_FLAG_BE 1
+#define __HEAD_FLAG_BE         1
  #else
-#define __HEAD_FLAG_BE 0
+#define __HEAD_FLAG_BE         0
  #endif
  
-#define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2)
+#define __HEAD_FLAG_PAGE_SIZE  ((PAGE_SHIFT - 10) / 2)
  
-#define __HEAD_FLAGS   ((__HEAD_FLAG_BE << 0) |        \
-                        (__HEAD_FLAG_PAGE_SIZE << 1))
+#define __HEAD_FLAG_PHYS_BASE  1
+
+#define __HEAD_FLAGS           ((__HEAD_FLAG_BE << 0) |        \
+                                (__HEAD_FLAG_PAGE_SIZE << 1) | \
+                                (__HEAD_FLAG_PHYS_BASE << 3))
  
  /*
   * These will output as part of the Image header, which should be little-endian
@@ -58,12 +67,24 @@
   * endian swapped in head.S, all are done here for consistency.
   */
  #define HEAD_SYMBOLS                                           \
-       _kernel_size_le         = DATA_LE64(_end - _text);      \
-       _kernel_offset_le       = DATA_LE64(TEXT_OFFSET);       \
-       _kernel_flags_le        = DATA_LE64(__HEAD_FLAGS);
+       DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text);       \
+       DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET);      \
+       DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS);
  
  #ifdef CONFIG_EFI
  
+__efistub_stext_offset = stext - _text;
+
+/*
+ * Prevent the symbol aliases below from being emitted into the kallsyms
+ * table, by forcing them to be absolute symbols (which are conveniently
+ * ignored by scripts/kallsyms) rather than section relative symbols.
+ * The distinction is only relevant for partial linking, and only for symbols
+ * that are defined within a section declaration (which is not the case for
+ * the definitions below) so the resulting values will be identical.
+ */
+#define KALLSYMS_HIDE(sym)     ABSOLUTE(sym)
+
  /*
   * The EFI stub has its own symbol namespace prefixed by __efistub_, to
   * isolate it from the kernel proper. The following symbols are legally
@@ -73,25 +94,25 @@
   * linked at. The routines below are all implemented in assembler in a
   * position independent manner
   */
-__efistub_memcmp               = __pi_memcmp;
-__efistub_memchr               = __pi_memchr;
-__efistub_memcpy               = __pi_memcpy;
-__efistub_memmove              = __pi_memmove;
-__efistub_memset               = __pi_memset;
-__efistub_strlen               = __pi_strlen;
-__efistub_strcmp               = __pi_strcmp;
-__efistub_strncmp              = __pi_strncmp;
-__efistub___flush_dcache_area  = __pi___flush_dcache_area;
+__efistub_memcmp               = KALLSYMS_HIDE(__pi_memcmp);
+__efistub_memchr               = KALLSYMS_HIDE(__pi_memchr);
+__efistub_memcpy               = KALLSYMS_HIDE(__pi_memcpy);
+__efistub_memmove              = KALLSYMS_HIDE(__pi_memmove);
+__efistub_memset               = KALLSYMS_HIDE(__pi_memset);
+__efistub_strlen               = KALLSYMS_HIDE(__pi_strlen);
+__efistub_strcmp               = KALLSYMS_HIDE(__pi_strcmp);
+__efistub_strncmp              = KALLSYMS_HIDE(__pi_strncmp);
+__efistub___flush_dcache_area  = KALLSYMS_HIDE(__pi___flush_dcache_area);
  
  #ifdef CONFIG_KASAN
-__efistub___memcpy             = __pi_memcpy;
-__efistub___memmove            = __pi_memmove;
-__efistub___memset             = __pi_memset;
+__efistub___memcpy             = KALLSYMS_HIDE(__pi_memcpy);
+__efistub___memmove            = KALLSYMS_HIDE(__pi_memmove);
+__efistub___memset             = KALLSYMS_HIDE(__pi_memset);
  #endif
  
-__efistub__text                        = _text;
-__efistub__end                 = _end;
-__efistub__edata               = _edata;
+__efistub__text                        = KALLSYMS_HIDE(_text);
+__efistub__end                 = KALLSYMS_HIDE(_end);
+__efistub__edata               = KALLSYMS_HIDE(_edata);
  
  #endif
  
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c

index c08b9ad6f42931e8766d0186daa51a6cce8dbe39..750f422f3f2c263a4a13cc59d20baa12b304a835 100644 (file)
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -30,6 +30,7 @@
  #include <asm/cacheflush.h>
  #include <asm/debug-monitors.h>
  #include <asm/fixmap.h>
+#include <asm/opcodes.h>
  #include <asm/insn.h>
  
  #define AARCH64_INSN_SF_BIT    BIT(31)
@@ -162,6 +163,32 @@ static bool __kprobes __aarch64_insn_hotpatch_safe(u32 insn)
                 aarch64_insn_is_nop(insn);
  }
  
+bool __kprobes aarch64_insn_uses_literal(u32 insn)
+{
+       /* ldr/ldrsw (literal), adr/adrp, prfm (literal) */
+
+       return aarch64_insn_is_ldr_lit(insn) ||
+               aarch64_insn_is_ldrsw_lit(insn) ||
+               aarch64_insn_is_adr_adrp(insn) ||
+               aarch64_insn_is_prfm_lit(insn);
+}
+
+bool __kprobes aarch64_insn_is_branch(u32 insn)
+{
+       /* b, bl, cb*, tb*, ret, br, blr, b.cond */
+
+       return aarch64_insn_is_b(insn) ||
+               aarch64_insn_is_bl(insn) ||
+               aarch64_insn_is_cbz(insn) ||
+               aarch64_insn_is_cbnz(insn) ||
+               aarch64_insn_is_tbz(insn) ||
+               aarch64_insn_is_tbnz(insn) ||
+               aarch64_insn_is_ret(insn) ||
+               aarch64_insn_is_br(insn) ||
+               aarch64_insn_is_blr(insn) ||
+               aarch64_insn_is_bcond(insn);
+}
+
  /*
   * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a
   * Section B2.6.5 "Concurrent modification and execution of instructions":
@@ -1116,6 +1143,14 @@ u32 aarch64_set_branch_offset(u32 insn, s32 offset)
         BUG();
  }
  
+/*
+ * Extract the Op/CR data from a msr/mrs instruction.
+ */
+u32 aarch64_insn_extract_system_reg(u32 insn)
+{
+       return (insn & 0x1FFFE0) >> 5;
+}
+
  bool aarch32_insn_is_wide(u32 insn)
  {
         return insn >= 0xe800;
@@ -1141,3 +1176,101 @@ u32 aarch32_insn_mcr_extract_crm(u32 insn)
  {
         return insn & CRM_MASK;
  }
+
+static bool __kprobes __check_eq(unsigned long pstate)
+{
+       return (pstate & PSR_Z_BIT) != 0;
+}
+
+static bool __kprobes __check_ne(unsigned long pstate)
+{
+       return (pstate & PSR_Z_BIT) == 0;
+}
+
+static bool __kprobes __check_cs(unsigned long pstate)
+{
+       return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_cc(unsigned long pstate)
+{
+       return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_mi(unsigned long pstate)
+{
+       return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_pl(unsigned long pstate)
+{
+       return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_vs(unsigned long pstate)
+{
+       return (pstate & PSR_V_BIT) != 0;
+}
+
+static bool __kprobes __check_vc(unsigned long pstate)
+{
+       return (pstate & PSR_V_BIT) == 0;
+}
+
+static bool __kprobes __check_hi(unsigned long pstate)
+{
+       pstate &= ~(pstate >> 1);       /* PSR_C_BIT &= ~PSR_Z_BIT */
+       return (pstate & PSR_C_BIT) != 0;
+}
+
+static bool __kprobes __check_ls(unsigned long pstate)
+{
+       pstate &= ~(pstate >> 1);       /* PSR_C_BIT &= ~PSR_Z_BIT */
+       return (pstate & PSR_C_BIT) == 0;
+}
+
+static bool __kprobes __check_ge(unsigned long pstate)
+{
+       pstate ^= (pstate << 3);        /* PSR_N_BIT ^= PSR_V_BIT */
+       return (pstate & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_lt(unsigned long pstate)
+{
+       pstate ^= (pstate << 3);        /* PSR_N_BIT ^= PSR_V_BIT */
+       return (pstate & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_gt(unsigned long pstate)
+{
+       /*PSR_N_BIT ^= PSR_V_BIT */
+       unsigned long temp = pstate ^ (pstate << 3);
+
+       temp |= (pstate << 1);  /*PSR_N_BIT |= PSR_Z_BIT */
+       return (temp & PSR_N_BIT) == 0;
+}
+
+static bool __kprobes __check_le(unsigned long pstate)
+{
+       /*PSR_N_BIT ^= PSR_V_BIT */
+       unsigned long temp = pstate ^ (pstate << 3);
+
+       temp |= (pstate << 1);  /*PSR_N_BIT |= PSR_Z_BIT */
+       return (temp & PSR_N_BIT) != 0;
+}
+
+static bool __kprobes __check_al(unsigned long pstate)
+{
+       return true;
+}
+
+/*
+ * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that
+ * it behaves identically to 0b1110 ("al").
+ */
+pstate_check_t * const aarch32_opcode_cond_checks[16] = {
+       __check_eq, __check_ne, __check_cs, __check_cc,
+       __check_mi, __check_pl, __check_vs, __check_vc,
+       __check_hi, __check_ls, __check_ge, __check_lt,
+       __check_gt, __check_le, __check_al, __check_al
+};
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c

index 9f17ec071ee0e8a8b1380133cf319a9ad1c80de5..2386b26c071274d4d563a4fdc5e864aedda37204 100644 (file)
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -30,6 +30,9 @@
  
  unsigned long irq_err_count;
  
+/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */
+DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16);
+
  int arch_show_interrupts(struct seq_file *p, int prec)
  {
         show_ipi_list(p, prec);
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c

new file mode 100644 (file)

index 0000000..b054691
--- /dev/null
+++ b/arch/arm64/kernel/kaslr.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/crc32.h>
+#include <linux/init.h>
+#include <linux/libfdt.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/fixmap.h>
+#include <asm/kernel-pgtable.h>
+#include <asm/memory.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+
+u64 __read_mostly module_alloc_base;
+u16 __initdata memstart_offset_seed;
+
+static __init u64 get_kaslr_seed(void *fdt)
+{
+       int node, len;
+       u64 *prop;
+       u64 ret;
+
+       node = fdt_path_offset(fdt, "/chosen");
+       if (node < 0)
+               return 0;
+
+       prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
+       if (!prop || len != sizeof(u64))
+               return 0;
+
+       ret = fdt64_to_cpu(*prop);
+       *prop = 0;
+       return ret;
+}
+
+static __init const u8 *get_cmdline(void *fdt)
+{
+       static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE;
+
+       if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) {
+               int node;
+               const u8 *prop;
+
+               node = fdt_path_offset(fdt, "/chosen");
+               if (node < 0)
+                       goto out;
+
+               prop = fdt_getprop(fdt, node, "bootargs", NULL);
+               if (!prop)
+                       goto out;
+               return prop;
+       }
+out:
+       return default_cmdline;
+}
+
+extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size,
+                                      pgprot_t prot);
+
+/*
+ * This routine will be executed with the kernel mapped at its default virtual
+ * address, and if it returns successfully, the kernel will be remapped, and
+ * start_kernel() will be executed from a randomized virtual offset. The
+ * relocation will result in all absolute references (e.g., static variables
+ * containing function pointers) to be reinitialized, and zero-initialized
+ * .bss variables will be reset to 0.
+ */
+u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
+{
+       void *fdt;
+       u64 seed, offset, mask, module_range;
+       const u8 *cmdline, *str;
+       int size;
+
+       /*
+        * Set a reasonable default for module_alloc_base in case
+        * we end up running with module randomization disabled.
+        */
+       module_alloc_base = (u64)_etext - MODULES_VSIZE;
+
+       /*
+        * Try to map the FDT early. If this fails, we simply bail,
+        * and proceed with KASLR disabled. We will make another
+        * attempt at mapping the FDT in setup_machine()
+        */
+       early_fixmap_init();
+       fdt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
+       if (!fdt)
+               return 0;
+
+       /*
+        * Retrieve (and wipe) the seed from the FDT
+        */
+       seed = get_kaslr_seed(fdt);
+       if (!seed)
+               return 0;
+
+       /*
+        * Check if 'nokaslr' appears on the command line, and
+        * return 0 if that is the case.
+        */
+       cmdline = get_cmdline(fdt);
+       str = strstr(cmdline, "nokaslr");
+       if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
+               return 0;
+
+       /*
+        * OK, so we are proceeding with KASLR enabled. Calculate a suitable
+        * kernel image offset from the seed. Let's place the kernel in the
+        * lower half of the VMALLOC area (VA_BITS - 2).
+        * Even if we could randomize at page granularity for 16k and 64k pages,
+        * let's always round to 2 MB so we don't interfere with the ability to
+        * map using contiguous PTEs
+        */
+       mask = ((1UL << (VA_BITS - 2)) - 1) & ~(SZ_2M - 1);
+       offset = seed & mask;
+
+       /* use the top 16 bits to randomize the linear region */
+       memstart_offset_seed = seed >> 48;
+
+       /*
+        * The kernel Image should not extend across a 1GB/32MB/512MB alignment
+        * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
+        * happens, increase the KASLR offset by the size of the kernel image.
+        */
+       if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) !=
+           (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT))
+               offset = (offset + (u64)(_end - _text)) & mask;
+
+       if (IS_ENABLED(CONFIG_KASAN))
+               /*
+                * KASAN does not expect the module region to intersect the
+                * vmalloc region, since shadow memory is allocated for each
+                * module at load time, whereas the vmalloc region is shadowed
+                * by KASAN zero pages. So keep modules out of the vmalloc
+                * region if KASAN is enabled.
+                */
+               return offset;
+
+       if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
+               /*
+                * Randomize the module region independently from the core
+                * kernel. This prevents modules from leaking any information
+                * about the address of the kernel itself, but results in
+                * branches between modules and the core kernel that are
+                * resolved via PLTs. (Branches between modules will be
+                * resolved normally.)
+                */
+               module_range = VMALLOC_END - VMALLOC_START - MODULES_VSIZE;
+               module_alloc_base = VMALLOC_START;
+       } else {
+               /*
+                * Randomize the module region by setting module_alloc_base to
+                * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
+                * _stext). This guarantees that the resulting region still
+                * covers [_stext, _etext], and that all relative branches can
+                * be resolved without veneers.
+                */
+               module_range = MODULES_VSIZE - (u64)(_etext - _stext);
+               module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
+       }
+
+       /* use the lower 21 bits to randomize the base of the module region */
+       module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
+       module_alloc_base &= PAGE_MASK;
+
+       return offset;
+}
diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c

index bcac81e600b9af09341cffdd80d83880626f77b2..814d0c51b2f91b2e2e696ad81ec8066f3ed0d5be 100644 (file)
--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
@@ -22,6 +22,7 @@
  #include <linux/irq.h>
  #include <linux/kdebug.h>
  #include <linux/kgdb.h>
+#include <linux/kprobes.h>
  #include <asm/traps.h>
  
  struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
@@ -218,6 +219,7 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr)
         kgdb_handle_exception(1, SIGTRAP, 0, regs);
         return 0;
  }
+NOKPROBE_SYMBOL(kgdb_brk_fn);
  
  static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr)
  {
@@ -226,12 +228,14 @@ static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr)
  
         return 0;
  }
+NOKPROBE_SYMBOL(kgdb_compiled_brk_fn);
  
  static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr)
  {
         kgdb_handle_exception(1, SIGTRAP, 0, regs);
         return 0;
  }
+NOKPROBE_SYMBOL(kgdb_step_brk_fn);
  
  static struct break_hook kgdb_brkpt_hook = {
         .esr_mask       = 0xffffffff,
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c

new file mode 100644 (file)

index 0000000..1ce90d8
--- /dev/null
+++ b/arch/arm64/kernel/module-plts.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2014-2016 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+
+struct plt_entry {
+       /*
+        * A program that conforms to the AArch64 Procedure Call Standard
+        * (AAPCS64) must assume that a veneer that alters IP0 (x16) and/or
+        * IP1 (x17) may be inserted at any branch instruction that is
+        * exposed to a relocation that supports long branches. Since that
+        * is exactly what we are dealing with here, we are free to use x16
+        * as a scratch register in the PLT veneers.
+        */
+       __le32  mov0;   /* movn x16, #0x....                    */
+       __le32  mov1;   /* movk x16, #0x...., lsl #16           */
+       __le32  mov2;   /* movk x16, #0x...., lsl #32           */
+       __le32  br;     /* br   x16                             */
+};
+
+u64 module_emit_plt_entry(struct module *mod, const Elf64_Rela *rela,
+                         Elf64_Sym *sym)
+{
+       struct plt_entry *plt = (struct plt_entry *)mod->arch.plt->sh_addr;
+       int i = mod->arch.plt_num_entries;
+       u64 val = sym->st_value + rela->r_addend;
+
+       /*
+        * We only emit PLT entries against undefined (SHN_UNDEF) symbols,
+        * which are listed in the ELF symtab section, but without a type
+        * or a size.
+        * So, similar to how the module loader uses the Elf64_Sym::st_value
+        * field to store the resolved addresses of undefined symbols, let's
+        * borrow the Elf64_Sym::st_size field (whose value is never used by
+        * the module loader, even for symbols that are defined) to record
+        * the address of a symbol's associated PLT entry as we emit it for a
+        * zero addend relocation (which is the only kind we have to deal with
+        * in practice). This allows us to find duplicates without having to
+        * go through the table every time.
+        */
+       if (rela->r_addend == 0 && sym->st_size != 0) {
+               BUG_ON(sym->st_size < (u64)plt || sym->st_size >= (u64)&plt[i]);
+               return sym->st_size;
+       }
+
+       mod->arch.plt_num_entries++;
+       BUG_ON(mod->arch.plt_num_entries > mod->arch.plt_max_entries);
+
+       /*
+        * MOVK/MOVN/MOVZ opcode:
+        * +--------+------------+--------+-----------+-------------+---------+
+        * | sf[31] | opc[30:29] | 100101 | hw[22:21] | imm16[20:5] | Rd[4:0] |
+        * +--------+------------+--------+-----------+-------------+---------+
+        *
+        * Rd     := 0x10 (x16)
+        * hw     := 0b00 (no shift), 0b01 (lsl #16), 0b10 (lsl #32)
+        * opc    := 0b11 (MOVK), 0b00 (MOVN), 0b10 (MOVZ)
+        * sf     := 1 (64-bit variant)
+        */
+       plt[i] = (struct plt_entry){
+               cpu_to_le32(0x92800010 | (((~val      ) & 0xffff)) << 5),
+               cpu_to_le32(0xf2a00010 | ((( val >> 16) & 0xffff)) << 5),
+               cpu_to_le32(0xf2c00010 | ((( val >> 32) & 0xffff)) << 5),
+               cpu_to_le32(0xd61f0200)
+       };
+
+       if (rela->r_addend == 0)
+               sym->st_size = (u64)&plt[i];
+
+       return (u64)&plt[i];
+}
+
+#define cmp_3way(a,b)  ((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
+{
+       const Elf64_Rela *x = a, *y = b;
+       int i;
+
+       /* sort by type, symbol index and addend */
+       i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info));
+       if (i == 0)
+               i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info));
+       if (i == 0)
+               i = cmp_3way(x->r_addend, y->r_addend);
+       return i;
+}
+
+static bool duplicate_rel(const Elf64_Rela *rela, int num)
+{
+       /*
+        * Entries are sorted by type, symbol index and addend. That means
+        * that, if a duplicate entry exists, it must be in the preceding
+        * slot.
+        */
+       return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0;
+}
+
+static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num)
+{
+       unsigned int ret = 0;
+       Elf64_Sym *s;
+       int i;
+
+       for (i = 0; i < num; i++) {
+               switch (ELF64_R_TYPE(rela[i].r_info)) {
+               case R_AARCH64_JUMP26:
+               case R_AARCH64_CALL26:
+                       /*
+                        * We only have to consider branch targets that resolve
+                        * to undefined symbols. This is not simply a heuristic,
+                        * it is a fundamental limitation, since the PLT itself
+                        * is part of the module, and needs to be within 128 MB
+                        * as well, so modules can never grow beyond that limit.
+                        */
+                       s = syms + ELF64_R_SYM(rela[i].r_info);
+                       if (s->st_shndx != SHN_UNDEF)
+                               break;
+
+                       /*
+                        * Jump relocations with non-zero addends against
+                        * undefined symbols are supported by the ELF spec, but
+                        * do not occur in practice (e.g., 'jump n bytes past
+                        * the entry point of undefined function symbol f').
+                        * So we need to support them, but there is no need to
+                        * take them into consideration when trying to optimize
+                        * this code. So let's only check for duplicates when
+                        * the addend is zero: this allows us to record the PLT
+                        * entry address in the symbol table itself, rather than
+                        * having to search the list for duplicates each time we
+                        * emit one.
+                        */
+                       if (rela[i].r_addend != 0 || !duplicate_rel(rela, i))
+                               ret++;
+                       break;
+               }
+       }
+       return ret;
+}
+
+int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                             char *secstrings, struct module *mod)
+{
+       unsigned long plt_max_entries = 0;
+       Elf64_Sym *syms = NULL;
+       int i;
+
+       /*
+        * Find the empty .plt section so we can expand it to store the PLT
+        * entries. Record the symtab address as well.
+        */
+       for (i = 0; i < ehdr->e_shnum; i++) {
+               if (strcmp(".plt", secstrings + sechdrs[i].sh_name) == 0)
+                       mod->arch.plt = sechdrs + i;
+               else if (sechdrs[i].sh_type == SHT_SYMTAB)
+                       syms = (Elf64_Sym *)sechdrs[i].sh_addr;
+       }
+
+       if (!mod->arch.plt) {
+               pr_err("%s: module PLT section missing\n", mod->name);
+               return -ENOEXEC;
+       }
+       if (!syms) {
+               pr_err("%s: module symtab section missing\n", mod->name);
+               return -ENOEXEC;
+       }
+
+       for (i = 0; i < ehdr->e_shnum; i++) {
+               Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
+               int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+               Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info;
+
+               if (sechdrs[i].sh_type != SHT_RELA)
+                       continue;
+
+               /* ignore relocations that operate on non-exec sections */
+               if (!(dstsec->sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               /* sort by type, symbol index and addend */
+               sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);
+
+               plt_max_entries += count_plts(syms, rels, numrels);
+       }
+
+       mod->arch.plt->sh_type = SHT_NOBITS;
+       mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+       mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+       mod->arch.plt->sh_size = plt_max_entries * sizeof(struct plt_entry);
+       mod->arch.plt_num_entries = 0;
+       mod->arch.plt_max_entries = plt_max_entries;
+       return 0;
+}
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c

index f4bc779e62e887547b7a17b7672487f0851e1479..7f316982ce00186262728518f3a03f7871fb7dd7 100644 (file)
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -30,17 +30,30 @@
  #include <asm/insn.h>
  #include <asm/sections.h>
  
-#define        AARCH64_INSN_IMM_MOVNZ          AARCH64_INSN_IMM_MAX
-#define        AARCH64_INSN_IMM_MOVK           AARCH64_INSN_IMM_16
-
  void *module_alloc(unsigned long size)
  {
         void *p;
  
-       p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+       p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+                               module_alloc_base + MODULES_VSIZE,
                                 GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
                                 NUMA_NO_NODE, __builtin_return_address(0));
  
+       if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+           !IS_ENABLED(CONFIG_KASAN))
+               /*
+                * KASAN can only deal with module allocations being served
+                * from the reserved module region, since the remainder of
+                * the vmalloc region is already backed by zero shadow pages,
+                * and punching holes into it is non-trivial. Since the module
+                * region is not randomized when KASAN is enabled, it is even
+                * less likely that the module region gets exhausted, so we
+                * can simply omit this fallback in that case.
+                */
+               p = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START,
+                               VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+                               NUMA_NO_NODE, __builtin_return_address(0));
+
         if (p && (kasan_module_alloc(p, size) < 0)) {
                 vfree(p);
                 return NULL;
@@ -75,15 +88,18 @@ static u64 do_reloc(enum aarch64_reloc_op reloc_op, void *place, u64 val)
  
  static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
  {
-       u64 imm_mask = (1 << len) - 1;
         s64 sval = do_reloc(op, place, val);
  
         switch (len) {
         case 16:
                 *(s16 *)place = sval;
+               if (sval < S16_MIN || sval > U16_MAX)
+                       return -ERANGE;
                 break;
         case 32:
                 *(s32 *)place = sval;
+               if (sval < S32_MIN || sval > U32_MAX)
+                       return -ERANGE;
                 break;
         case 64:
                 *(s64 *)place = sval;
@@ -92,34 +108,23 @@ static int reloc_data(enum aarch64_reloc_op op, void *place, u64 val, int len)
                 pr_err("Invalid length (%d) for data relocation\n", len);
                 return 0;
         }
-
-       /*
-        * Extract the upper value bits (including the sign bit) and
-        * shift them to bit 0.
-        */
-       sval = (s64)(sval & ~(imm_mask >> 1)) >> (len - 1);
-
-       /*
-        * Overflow has occurred if the value is not representable in
-        * len bits (i.e the bottom len bits are not sign-extended and
-        * the top bits are not all zero).
-        */
-       if ((u64)(sval + 1) > 2)
-               return -ERANGE;
-
         return 0;
  }
  
+enum aarch64_insn_movw_imm_type {
+       AARCH64_INSN_IMM_MOVNZ,
+       AARCH64_INSN_IMM_MOVKZ,
+};
+
  static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
-                          int lsb, enum aarch64_insn_imm_type imm_type)
+                          int lsb, enum aarch64_insn_movw_imm_type imm_type)
  {
-       u64 imm, limit = 0;
+       u64 imm;
         s64 sval;
         u32 insn = le32_to_cpu(*(u32 *)place);
  
         sval = do_reloc(op, place, val);
-       sval >>= lsb;
-       imm = sval & 0xffff;
+       imm = sval >> lsb;
  
         if (imm_type == AARCH64_INSN_IMM_MOVNZ) {
                 /*
@@ -128,7 +133,7 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
                  * immediate is less than zero.
                  */
                 insn &= ~(3 << 29);
-               if ((s64)imm >= 0) {
+               if (sval >= 0) {
                         /* >=0: Set the instruction to MOVZ (opcode 10b). */
                         insn |= 2 << 29;
                 } else {
@@ -140,29 +145,13 @@ static int reloc_insn_movw(enum aarch64_reloc_op op, void *place, u64 val,
                          */
                         imm = ~imm;
                 }
-               imm_type = AARCH64_INSN_IMM_MOVK;
         }
  
         /* Update the instruction with the new encoding. */
-       insn = aarch64_insn_encode_immediate(imm_type, insn, imm);
+       insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
         *(u32 *)place = cpu_to_le32(insn);
  
-       /* Shift out the immediate field. */
-       sval >>= 16;
-
-       /*
-        * For unsigned immediates, the overflow check is straightforward.
-        * For signed immediates, the sign bit is actually the bit past the
-        * most significant bit of the field.
-        * The AARCH64_INSN_IMM_16 immediate type is unsigned.
-        */
-       if (imm_type != AARCH64_INSN_IMM_16) {
-               sval++;
-               limit++;
-       }
-
-       /* Check the upper bits depending on the sign of the immediate. */
-       if ((u64)sval > limit)
+       if (imm > U16_MAX)
                 return -ERANGE;
  
         return 0;
@@ -267,25 +256,25 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                         overflow_check = false;
                 case R_AARCH64_MOVW_UABS_G0:
                         ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
-                                             AARCH64_INSN_IMM_16);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_UABS_G1_NC:
                         overflow_check = false;
                 case R_AARCH64_MOVW_UABS_G1:
                         ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16,
-                                             AARCH64_INSN_IMM_16);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_UABS_G2_NC:
                         overflow_check = false;
                 case R_AARCH64_MOVW_UABS_G2:
                         ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32,
-                                             AARCH64_INSN_IMM_16);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_UABS_G3:
                         /* We're using the top bits so we can't overflow. */
                         overflow_check = false;
                         ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48,
-                                             AARCH64_INSN_IMM_16);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_SABS_G0:
                         ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0,
@@ -302,7 +291,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                 case R_AARCH64_MOVW_PREL_G0_NC:
                         overflow_check = false;
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
-                                             AARCH64_INSN_IMM_MOVK);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_PREL_G0:
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0,
@@ -311,7 +300,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                 case R_AARCH64_MOVW_PREL_G1_NC:
                         overflow_check = false;
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
-                                             AARCH64_INSN_IMM_MOVK);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_PREL_G1:
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16,
@@ -320,7 +309,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                 case R_AARCH64_MOVW_PREL_G2_NC:
                         overflow_check = false;
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
-                                             AARCH64_INSN_IMM_MOVK);
+                                             AARCH64_INSN_IMM_MOVKZ);
                         break;
                 case R_AARCH64_MOVW_PREL_G2:
                         ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 32,
@@ -388,6 +377,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                 case R_AARCH64_CALL26:
                         ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26,
                                              AARCH64_INSN_IMM_26);
+
+                       if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+                           ovf == -ERANGE) {
+                               val = module_emit_plt_entry(me, &rel[i], sym);
+                               ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2,
+                                                    26, AARCH64_INSN_IMM_26);
+                       }
                         break;
  
                 default:
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/kernel/module.lds

new file mode 100644 (file)

index 0000000..8949f6c
--- /dev/null
+++ b/arch/arm64/kernel/module.lds
@@ -0,0 +1,3 @@
+SECTIONS {
+       .plt (NOLOAD) : { BYTE(0) }
+}
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c

index 3aa74830cc69af0053efb77c72e9c26ae3bd6c02..ff4665462a025d4ec2655ca30d49732a63194e53 100644 (file)
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -164,8 +164,11 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
         frame.fp = regs->regs[29];
         frame.sp = regs->sp;
         frame.pc = regs->pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = current->curr_ret_stack;
+#endif
  
-       walk_stackframe(&frame, callchain_trace, entry);
+       walk_stackframe(current, &frame, callchain_trace, entry);
  }
  
  unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile

new file mode 100644 (file)

index 0000000..ce06312
--- /dev/null
+++ b/arch/arm64/kernel/probes/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_KPROBES)          += kprobes.o decode-insn.o      \
+                                  kprobes_trampoline.o         \
+                                  simulate-insn.o
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c

new file mode 100644 (file)

index 0000000..f7931d9
--- /dev/null
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -0,0 +1,174 @@
+/*
+ * arch/arm64/kernel/probes/decode-insn.c
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <asm/kprobes.h>
+#include <asm/insn.h>
+#include <asm/sections.h>
+
+#include "decode-insn.h"
+#include "simulate-insn.h"
+
+static bool __kprobes aarch64_insn_is_steppable(u32 insn)
+{
+       /*
+        * Branch instructions will write a new value into the PC which is
+        * likely to be relative to the XOL address and therefore invalid.
+        * Deliberate generation of an exception during stepping is also not
+        * currently safe. Lastly, MSR instructions can do any number of nasty
+        * things we can't handle during single-stepping.
+        */
+       if (aarch64_get_insn_class(insn) == AARCH64_INSN_CLS_BR_SYS) {
+               if (aarch64_insn_is_branch(insn) ||
+                   aarch64_insn_is_msr_imm(insn) ||
+                   aarch64_insn_is_msr_reg(insn) ||
+                   aarch64_insn_is_exception(insn) ||
+                   aarch64_insn_is_eret(insn))
+                       return false;
+
+               /*
+                * The MRS instruction can be stepped, with one exception:
+                * reading the DAIF bits is rejected, as it may not return
+                * a correct value in the single-stepping environment.
+                */
+               if (aarch64_insn_is_mrs(insn))
+                       return aarch64_insn_extract_system_reg(insn)
+                            != AARCH64_INSN_SPCLREG_DAIF;
+
+               /*
+                * The HINT instruction is problematic when single-stepping,
+                * except for the NOP case.
+                */
+               if (aarch64_insn_is_hint(insn))
+                       return aarch64_insn_is_nop(insn);
+
+               return true;
+       }
+
+       /*
+        * Instructions which load PC relative literals are not going to work
+        * when executed from an XOL slot. Instructions doing an exclusive
+        * load/store are not going to complete successfully when single-step
+        * exception handling happens in the middle of the sequence.
+        */
+       if (aarch64_insn_uses_literal(insn) ||
+           aarch64_insn_is_exclusive(insn))
+               return false;
+
+       return true;
+}
+
+/* Return:
+ *   INSN_REJECTED     If instruction is one not allowed to kprobe,
+ *   INSN_GOOD         If instruction is supported and uses instruction slot,
+ *   INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
+ */
+static enum kprobe_insn __kprobes
+arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi)
+{
+       /*
+        * Instructions reading or modifying the PC won't work from the XOL
+        * slot.
+        */
+       if (aarch64_insn_is_steppable(insn))
+               return INSN_GOOD;
+
+       if (aarch64_insn_is_bcond(insn)) {
+               asi->handler = simulate_b_cond;
+       } else if (aarch64_insn_is_cbz(insn) ||
+           aarch64_insn_is_cbnz(insn)) {
+               asi->handler = simulate_cbz_cbnz;
+       } else if (aarch64_insn_is_tbz(insn) ||
+           aarch64_insn_is_tbnz(insn)) {
+               asi->handler = simulate_tbz_tbnz;
+       } else if (aarch64_insn_is_adr_adrp(insn)) {
+               asi->handler = simulate_adr_adrp;
+       } else if (aarch64_insn_is_b(insn) ||
+           aarch64_insn_is_bl(insn)) {
+               asi->handler = simulate_b_bl;
+       } else if (aarch64_insn_is_br(insn) ||
+           aarch64_insn_is_blr(insn) ||
+           aarch64_insn_is_ret(insn)) {
+               asi->handler = simulate_br_blr_ret;
+       } else if (aarch64_insn_is_ldr_lit(insn)) {
+               asi->handler = simulate_ldr_literal;
+       } else if (aarch64_insn_is_ldrsw_lit(insn)) {
+               asi->handler = simulate_ldrsw_literal;
+       } else {
+               /*
+                * Instruction cannot be stepped out-of-line and we don't
+                * (yet) simulate it.
+                */
+               return INSN_REJECTED;
+       }
+
+       return INSN_GOOD_NO_SLOT;
+}
+
+static bool __kprobes
+is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end)
+{
+       while (scan_start > scan_end) {
+               /*
+                * atomic region starts from exclusive load and ends with
+                * exclusive store.
+                */
+               if (aarch64_insn_is_store_ex(le32_to_cpu(*scan_start)))
+                       return false;
+               else if (aarch64_insn_is_load_ex(le32_to_cpu(*scan_start)))
+                       return true;
+               scan_start--;
+       }
+
+       return false;
+}
+
+enum kprobe_insn __kprobes
+arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
+{
+       enum kprobe_insn decoded;
+       kprobe_opcode_t insn = le32_to_cpu(*addr);
+       kprobe_opcode_t *scan_start = addr - 1;
+       kprobe_opcode_t *scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE;
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+       struct module *mod;
+#endif
+
+       if (addr >= (kprobe_opcode_t *)_text &&
+           scan_end < (kprobe_opcode_t *)_text)
+               scan_end = (kprobe_opcode_t *)_text;
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+       else {
+               preempt_disable();
+               mod = __module_address((unsigned long)addr);
+               if (mod && within_module_init((unsigned long)addr, mod) &&
+                       !within_module_init((unsigned long)scan_end, mod))
+                       scan_end = (kprobe_opcode_t *)mod->module_init;
+               else if (mod && within_module_core((unsigned long)addr, mod) &&
+                       !within_module_core((unsigned long)scan_end, mod))
+                       scan_end = (kprobe_opcode_t *)mod->module_core;
+               preempt_enable();
+       }
+#endif
+       decoded = arm_probe_decode_insn(insn, asi);
+
+       if (decoded == INSN_REJECTED ||
+                       is_probed_address_atomic(scan_start, scan_end))
+               return INSN_REJECTED;
+
+       return decoded;
+}
diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h

new file mode 100644 (file)

index 0000000..d438289
--- /dev/null
+++ b/arch/arm64/kernel/probes/decode-insn.h
@@ -0,0 +1,35 @@
+/*
+ * arch/arm64/kernel/probes/decode-insn.h
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KERNEL_KPROBES_ARM64_H
+#define _ARM_KERNEL_KPROBES_ARM64_H
+
+/*
+ * ARM strongly recommends a limit of 128 bytes between LoadExcl and
+ * StoreExcl instructions in a single thread of execution. So keep the
+ * max atomic context size as 32.
+ */
+#define MAX_ATOMIC_CONTEXT_SIZE        (128 / sizeof(kprobe_opcode_t))
+
+enum kprobe_insn {
+       INSN_REJECTED,
+       INSN_GOOD_NO_SLOT,
+       INSN_GOOD,
+};
+
+enum kprobe_insn __kprobes
+arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi);
+
+#endif /* _ARM_KERNEL_KPROBES_ARM64_H */
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c

new file mode 100644 (file)

index 0000000..1ee93c7
--- /dev/null
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -0,0 +1,657 @@
+/*
+ * arch/arm64/kernel/probes/kprobes.c
+ *
+ * Kprobes support for ARM64
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ * Author: Sandeepa Prabhu <sandeepa.prabhu@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/stringify.h>
+#include <asm/traps.h>
+#include <asm/ptrace.h>
+#include <asm/cacheflush.h>
+#include <asm/debug-monitors.h>
+#include <asm/system_misc.h>
+#include <asm/insn.h>
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+#include <asm-generic/sections.h>
+
+#include "decode-insn.h"
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+static void __kprobes
+post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *);
+
+static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
+{
+       /* prepare insn slot */
+       p->ainsn.insn[0] = cpu_to_le32(p->opcode);
+
+       flush_icache_range((uintptr_t) (p->ainsn.insn),
+                          (uintptr_t) (p->ainsn.insn) +
+                          MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+
+       /*
+        * Needs restoring of return address after stepping xol.
+        */
+       p->ainsn.restore = (unsigned long) p->addr +
+         sizeof(kprobe_opcode_t);
+}
+
+static void __kprobes arch_prepare_simulate(struct kprobe *p)
+{
+       /* This instruction is not executed xol. No need to adjust the PC */
+       p->ainsn.restore = 0;
+}
+
+static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       if (p->ainsn.handler)
+               p->ainsn.handler((u32)p->opcode, (long)p->addr, regs);
+
+       /* single step simulated, now go for post processing */
+       post_kprobe_handler(kcb, regs);
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+       unsigned long probe_addr = (unsigned long)p->addr;
+       extern char __start_rodata[];
+       extern char __end_rodata[];
+
+       if (probe_addr & 0x3)
+               return -EINVAL;
+
+       /* copy instruction */
+       p->opcode = le32_to_cpu(*p->addr);
+
+       if (in_exception_text(probe_addr))
+               return -EINVAL;
+       if (probe_addr >= (unsigned long) __start_rodata &&
+           probe_addr <= (unsigned long) __end_rodata)
+               return -EINVAL;
+
+       /* decode instruction */
+       switch (arm_kprobe_decode_insn(p->addr, &p->ainsn)) {
+       case INSN_REJECTED:     /* insn not supported */
+               return -EINVAL;
+
+       case INSN_GOOD_NO_SLOT: /* insn need simulation */
+               p->ainsn.insn = NULL;
+               break;
+
+       case INSN_GOOD: /* instruction uses slot */
+               p->ainsn.insn = get_insn_slot();
+               if (!p->ainsn.insn)
+                       return -ENOMEM;
+               break;
+       };
+
+       /* prepare the instruction */
+       if (p->ainsn.insn)
+               arch_prepare_ss_slot(p);
+       else
+               arch_prepare_simulate(p);
+
+       return 0;
+}
+
+static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode)
+{
+       void *addrs[1];
+       u32 insns[1];
+
+       addrs[0] = (void *)addr;
+       insns[0] = (u32)opcode;
+
+       return aarch64_insn_patch_text(addrs, insns, 1);
+}
+
+/* arm kprobe: install breakpoint in text */
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+       patch_text(p->addr, BRK64_OPCODE_KPROBES);
+}
+
+/* disarm kprobe: remove breakpoint from text */
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+       patch_text(p->addr, p->opcode);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+       if (p->ainsn.insn) {
+               free_insn_slot(p->ainsn.insn, 0);
+               p->ainsn.insn = NULL;
+       }
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       kcb->prev_kprobe.kp = kprobe_running();
+       kcb->prev_kprobe.status = kcb->kprobe_status;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+       kcb->kprobe_status = kcb->prev_kprobe.status;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p)
+{
+       __this_cpu_write(current_kprobe, p);
+}
+
+/*
+ * The D-flag (Debug mask) is set (masked) upon debug exception entry.
+ * Kprobes needs to clear (unmask) D-flag -ONLY- in case of recursive
+ * probe i.e. when probe hit from kprobe handler context upon
+ * executing the pre/post handlers. In this case we return with
+ * D-flag clear so that single-stepping can be carried-out.
+ *
+ * Leave D-flag set in all other cases.
+ */
+static void __kprobes
+spsr_set_debug_flag(struct pt_regs *regs, int mask)
+{
+       unsigned long spsr = regs->pstate;
+
+       if (mask)
+               spsr |= PSR_D_BIT;
+       else
+               spsr &= ~PSR_D_BIT;
+
+       regs->pstate = spsr;
+}
+
+/*
+ * Interrupts need to be disabled before single-step mode is set, and not
+ * reenabled until after single-step mode ends.
+ * Without disabling interrupts on the local CPU, there is a chance of an
+ * interrupt occurring in the period between exception return and the start
+ * of out-of-line single-step, which results in wrongly single stepping
+ * into the interrupt handler.
+ */
+static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb,
+                                               struct pt_regs *regs)
+{
+       kcb->saved_irqflag = regs->pstate;
+       regs->pstate |= PSR_I_BIT;
+}
+
+static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb,
+                                               struct pt_regs *regs)
+{
+       if (kcb->saved_irqflag & PSR_I_BIT)
+               regs->pstate |= PSR_I_BIT;
+       else
+               regs->pstate &= ~PSR_I_BIT;
+}
+
+static void __kprobes
+set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr)
+{
+       kcb->ss_ctx.ss_pending = true;
+       kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t);
+}
+
+static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb)
+{
+       kcb->ss_ctx.ss_pending = false;
+       kcb->ss_ctx.match_addr = 0;
+}
+
+static void __kprobes setup_singlestep(struct kprobe *p,
+                                      struct pt_regs *regs,
+                                      struct kprobe_ctlblk *kcb, int reenter)
+{
+       unsigned long slot;
+
+       if (reenter) {
+               save_previous_kprobe(kcb);
+               set_current_kprobe(p);
+               kcb->kprobe_status = KPROBE_REENTER;
+       } else {
+               kcb->kprobe_status = KPROBE_HIT_SS;
+       }
+
+
+       if (p->ainsn.insn) {
+               /* prepare for single stepping */
+               slot = (unsigned long)p->ainsn.insn;
+
+               set_ss_context(kcb, slot);      /* mark pending ss */
+
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       spsr_set_debug_flag(regs, 0);
+               else
+                       WARN_ON(regs->pstate & PSR_D_BIT);
+
+               /* IRQs and single stepping do not mix well. */
+               kprobes_save_local_irqflag(kcb, regs);
+               kernel_enable_single_step(regs);
+               instruction_pointer_set(regs, slot);
+       } else {
+               /* insn simulation */
+               arch_simulate_insn(p, regs);
+       }
+}
+
+static int __kprobes reenter_kprobe(struct kprobe *p,
+                                   struct pt_regs *regs,
+                                   struct kprobe_ctlblk *kcb)
+{
+       switch (kcb->kprobe_status) {
+       case KPROBE_HIT_SSDONE:
+       case KPROBE_HIT_ACTIVE:
+               kprobes_inc_nmissed_count(p);
+               setup_singlestep(p, regs, kcb, 1);
+               break;
+       case KPROBE_HIT_SS:
+       case KPROBE_REENTER:
+               pr_warn("Unrecoverable kprobe detected at %p.\n", p->addr);
+               dump_kprobe(p);
+               BUG();
+               break;
+       default:
+               WARN_ON(1);
+               return 0;
+       }
+
+       return 1;
+}
+
+static void __kprobes
+post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs)
+{
+       struct kprobe *cur = kprobe_running();
+
+       if (!cur)
+               return;
+
+       /* return addr restore if non-branching insn */
+       if (cur->ainsn.restore != 0)
+               instruction_pointer_set(regs, cur->ainsn.restore);
+
+       /* restore back original saved kprobe variables and continue */
+       if (kcb->kprobe_status == KPROBE_REENTER) {
+               restore_previous_kprobe(kcb);
+               return;
+       }
+       /* call post handler */
+       kcb->kprobe_status = KPROBE_HIT_SSDONE;
+       if (cur->post_handler)  {
+               /* post_handler can hit breakpoint and single step
+                * again, so we enable D-flag for recursive exception.
+                */
+               cur->post_handler(cur, regs, 0);
+       }
+
+       reset_current_kprobe();
+}
+
+int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       switch (kcb->kprobe_status) {
+       case KPROBE_HIT_SS:
+       case KPROBE_REENTER:
+               /*
+                * We are here because the instruction being single
+                * stepped caused a page fault. We reset the current
+                * kprobe and the ip points back to the probe address
+                * and allow the page fault handler to continue as a
+                * normal page fault.
+                */
+               instruction_pointer_set(regs, (unsigned long) cur->addr);
+               if (!instruction_pointer(regs))
+                       BUG();
+
+               kernel_disable_single_step();
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       spsr_set_debug_flag(regs, 1);
+
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       restore_previous_kprobe(kcb);
+               else
+                       reset_current_kprobe();
+
+               break;
+       case KPROBE_HIT_ACTIVE:
+       case KPROBE_HIT_SSDONE:
+               /*
+                * We increment the nmissed count for accounting,
+                * we can also use npre/npostfault count for accounting
+                * these specific fault cases.
+                */
+               kprobes_inc_nmissed_count(cur);
+
+               /*
+                * We come here because instructions in the pre/post
+                * handler caused the page_fault, this could happen
+                * if handler tries to access user space by
+                * copy_from_user(), get_user() etc. Let the
+                * user-specified handler try to fix it first.
+                */
+               if (cur->fault_handler && cur->fault_handler(cur, regs, fsr))
+                       return 1;
+
+               /*
+                * In case the user-specified fault handler returned
+                * zero, try to fix up.
+                */
+               if (fixup_exception(regs))
+                       return 1;
+       }
+       return 0;
+}
+
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+                                      unsigned long val, void *data)
+{
+       return NOTIFY_DONE;
+}
+
+static void __kprobes kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *p, *cur_kprobe;
+       struct kprobe_ctlblk *kcb;
+       unsigned long addr = instruction_pointer(regs);
+
+       kcb = get_kprobe_ctlblk();
+       cur_kprobe = kprobe_running();
+
+       p = get_kprobe((kprobe_opcode_t *) addr);
+
+       if (p) {
+               if (cur_kprobe) {
+                       if (reenter_kprobe(p, regs, kcb))
+                               return;
+               } else {
+                       /* Probe hit */
+                       set_current_kprobe(p);
+                       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+                       /*
+                        * If we have no pre-handler or it returned 0, we
+                        * continue with normal processing.  If we have a
+                        * pre-handler and it returned non-zero, it prepped
+                        * for calling the break_handler below on re-entry,
+                        * so get out doing nothing more here.
+                        *
+                        * pre_handler can hit a breakpoint and can step thru
+                        * before return, keep PSTATE D-flag enabled until
+                        * pre_handler return back.
+                        */
+                       if (!p->pre_handler || !p->pre_handler(p, regs)) {
+                               setup_singlestep(p, regs, kcb, 0);
+                               return;
+                       }
+               }
+       } else if ((le32_to_cpu(*(kprobe_opcode_t *) addr) ==
+           BRK64_OPCODE_KPROBES) && cur_kprobe) {
+               /* We probably hit a jprobe.  Call its break handler. */
+               if (cur_kprobe->break_handler  &&
+                    cur_kprobe->break_handler(cur_kprobe, regs)) {
+                       setup_singlestep(cur_kprobe, regs, kcb, 0);
+                       return;
+               }
+       }
+       /*
+        * The breakpoint instruction was removed right
+        * after we hit it.  Another cpu has removed
+        * either a probepoint or a debugger breakpoint
+        * at this address.  In either case, no further
+        * handling of this interrupt is appropriate.
+        * Return back to original instruction, and continue.
+        */
+}
+
+/*
+ * Decide whether a single-step exception at @addr belongs to kprobes.
+ *
+ * The step is ours only when a step is pending in @kcb and @addr equals the
+ * recorded match address; the pending context is then cleared and
+ * DBG_HOOK_HANDLED is returned.  In every other case @kcb is left untouched
+ * and DBG_HOOK_ERROR is returned so that kprobes ignores the exception.
+ */
+static int __kprobes
+kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr)
+{
+       /* no step outstanding: not ours */
+       if (!kcb->ss_ctx.ss_pending)
+               return DBG_HOOK_ERROR;
+
+       /* a step is outstanding, but for a different address: not ours */
+       if (kcb->ss_ctx.match_addr != addr)
+               return DBG_HOOK_ERROR;
+
+       clear_ss_context(kcb);
+       return DBG_HOOK_HANDLED;
+}
+
+/*
+ * Single-step exception hook for kprobes.
+ *
+ * Returns the verdict of kprobe_ss_hit() for the current instruction
+ * pointer: DBG_HOOK_HANDLED when the step was one kprobes had pending,
+ * DBG_HOOK_ERROR otherwise.  @esr is not used.
+ */
+int __kprobes
+kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       int retval;
+
+       /* return error if this is not our step */
+       retval = kprobe_ss_hit(kcb, instruction_pointer(regs));
+
+       if (retval == DBG_HOOK_HANDLED) {
+               /* our step: undo the single-step setup before post-processing */
+               kprobes_restore_local_irqflag(kcb, regs);
+               kernel_disable_single_step();
+
+               /*
+                * NOTE(review): presumably keeps debug exceptions masked in
+                * the saved PSTATE while a re-entered probe unwinds - confirm
+                * against spsr_set_debug_flag().
+                */
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       spsr_set_debug_flag(regs, 1);
+
+               post_kprobe_handler(kcb, regs);
+       }
+
+       return retval;
+}
+
+/*
+ * Breakpoint exception hook for kprobes.
+ *
+ * Hands @regs to kprobe_handler() and always reports DBG_HOOK_HANDLED,
+ * whatever kprobe_handler() did with the breakpoint.  @esr is not used.
+ */
+int __kprobes
+kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr)
+{
+       kprobe_handler(regs);
+       return DBG_HOOK_HANDLED;
+}
+
+/*
+ * Pre-handler used for jprobes.
+ *
+ * Snapshots @regs into the kprobe control block, redirects the PC to the
+ * jprobe's entry function and returns 1, i.e. the non-zero "pre-handler
+ * prepared for the break handler" result, so the caller does not
+ * single-step the probed instruction.
+ */
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       /* full register snapshot; restored by longjmp_break_handler() */
+       kcb->jprobe_saved_regs = *regs;
+       /*
+        * Since we can't be sure where in the stack frame "stacked"
+        * pass-by-value arguments are stored we just don't try to
+        * duplicate any of the stack. Do not use jprobes on functions that
+        * use more than 64 bytes (after padding each to an 8 byte boundary)
+        * of arguments, or pass individual arguments larger than 16 bytes.
+        */
+
+       instruction_pointer_set(regs, (unsigned long) jp->entry);
+       /*
+        * Both of these are undone in longjmp_break_handler()
+        * (preempt_enable_no_resched() / unpause_graph_tracing()).
+        */
+       preempt_disable();
+       pause_graph_tracing();
+       return 1;
+}
+
+/*
+ * Called by a jprobe entry function to hand control back to kprobes.
+ *
+ * Loads the stack pointer saved in the control block and executes a BRK at
+ * the label jprobe_return_break; longjmp_break_handler() recognises the
+ * exception by that PC.  Control is not expected to continue past the asm
+ * statement, hence unreachable().
+ */
+void __kprobes jprobe_return(void)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       /*
+        * Jprobe handler return by entering break exception,
+        * encoded same as kprobe, but with following conditions
+        * -a special PC to identify it from the other kprobes.
+        * -restore stack addr to original saved pt_regs
+        */
+       asm volatile("                          mov sp, %0      \n"
+                    "jprobe_return_break:      brk %1          \n"
+                    :
+                    : "r" (kcb->jprobe_saved_regs.sp),
+                      "I" (BRK64_ESR_KPROBES)
+                    : "memory");
+
+       unreachable();
+}
+
+/*
+ * Break handler for jprobes.
+ *
+ * Returns 0 when the breakpoint was not raised at jprobe_return_break, i.e.
+ * it did not come from jprobe_return().  Otherwise the register snapshot
+ * taken by setjmp_pre_handler() is copied back into @regs, graph tracing and
+ * preemption are re-enabled, and 1 is returned.
+ *
+ * If the stack pointer at the break does not match the saved one, both
+ * register sets are dumped and the kernel BUG()s.
+ */
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       long stack_addr = kcb->jprobe_saved_regs.sp;
+       long orig_sp = kernel_stack_pointer(regs);
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+       extern const char jprobe_return_break[];
+
+       if (instruction_pointer(regs) != (u64) jprobe_return_break)
+               return 0;
+
+       if (orig_sp != stack_addr) {
+               /*
+                * Dump the snapshot itself.  The saved sp is only a stack
+                * address; treating it as a pt_regs pointer would print
+                * arbitrary stack memory as registers.
+                */
+               struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
+
+               pr_err("current sp %lx does not match saved sp %lx\n",
+                      orig_sp, stack_addr);
+               pr_err("Saved registers for jprobe %p\n", jp);
+               show_regs(saved_regs);
+               pr_err("Current registers\n");
+               show_regs(regs);
+               BUG();
+       }
+       /* undo pause_graph_tracing()/preempt_disable() from setjmp_pre_handler() */
+       unpause_graph_tracing();
+       *regs = kcb->jprobe_saved_regs;
+       preempt_enable_no_resched();
+       return 1;
+}
+
+/*
+ * Report whether @addr lies in code that must not be probed on arm64:
+ * the __kprobes text, the entry text, the idmap text, or any address
+ * for which search_exception_tables() finds an entry.
+ */
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+       extern char __idmap_text_start[], __idmap_text_end[];
+
+       if (addr >= (unsigned long)__kprobes_text_start &&
+           addr < (unsigned long)__kprobes_text_end)
+               return true;
+
+       if (addr >= (unsigned long)__entry_text_start &&
+           addr < (unsigned long)__entry_text_end)
+               return true;
+
+       if (addr >= (unsigned long)__idmap_text_start &&
+           addr < (unsigned long)__idmap_text_end)
+               return true;
+
+       /* last resort: anything covered by an exception table entry */
+       return !!search_exception_tables(addr);
+}
+
+/*
+ * C half of the kretprobe trampoline.
+ *
+ * Called from kretprobe_trampoline with @regs pointing at the register frame
+ * it saved.  Walks the current task's kretprobe instances, runs their
+ * handlers, and returns the original return address, which the trampoline
+ * then installs in lr.
+ */
+void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
+{
+       struct kretprobe_instance *ri = NULL;
+       struct hlist_head *head, empty_rp;
+       struct hlist_node *tmp;
+       unsigned long flags, orig_ret_address = 0;
+       unsigned long trampoline_address =
+               (unsigned long)&kretprobe_trampoline;
+       kprobe_opcode_t *correct_ret_addr = NULL;
+
+       INIT_HLIST_HEAD(&empty_rp);
+       kretprobe_hash_lock(current, &head, &flags);
+
+       /*
+        * It is possible to have multiple instances associated with a given
+        * task either because multiple functions in the call path have
+        * return probes installed on them, and/or more than one
+        * return probe was registered for a target function.
+        *
+        * We can handle this because:
+        *     - instances are always pushed into the head of the list
+        *     - when multiple return probes are registered for the same
+        *       function, the (chronologically) first instance's ret_addr
+        *       will be the real return address, and all the rest will
+        *       point to kretprobe_trampoline.
+        */
+       /* First pass: only locate the instance holding the real return address. */
+       hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+       /*
+        * Second pass: run each handler with ret_addr set to the real return
+        * address found above, then hand the instance to recycle_rp_inst().
+        */
+       correct_ret_addr = ri->ret_addr;
+       hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
+               if (ri->rp && ri->rp->handler) {
+                       /* mark this kprobe as current while its handler runs */
+                       __this_cpu_write(current_kprobe, &ri->rp->kp);
+                       get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+                       ri->ret_addr = correct_ret_addr;
+                       ri->rp->handler(ri, regs);
+                       __this_cpu_write(current_kprobe, NULL);
+               }
+
+               recycle_rp_inst(ri, &empty_rp);
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_hash_unlock(current, &flags);
+
+       /* free whatever ended up on empty_rp, after the hash lock is dropped */
+       hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+               hlist_del(&ri->hlist);
+               kfree(ri);
+       }
+       return (void *)orig_ret_address;
+}
+
+/*
+ * Arm a return probe: remember the caller's return address (x30) in @ri and
+ * make the probed function return into kretprobe_trampoline instead.
+ */
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+                                     struct pt_regs *regs)
+{
+       ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
+
+       /* replace return addr (x30) with trampoline */
+       regs->regs[30] = (long)&kretprobe_trampoline;
+}
+
+/* Always returns 0 on arm64, whatever @p is. */
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+       return 0;
+}
+
+/* Arch-specific kprobes initialisation: nothing to do here, reports success. */
+int __init arch_init_kprobes(void)
+{
+       return 0;
+}
diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S

new file mode 100644 (file)

index 0000000..5d6e7f1
--- /dev/null
+++ b/arch/arm64/kernel/probes/kprobes_trampoline.S
@@ -0,0 +1,81 @@
+/*
+ * trampoline entry and return code for kretprobes.
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+
+       .text
+
+       /*
+        * Fill the frame at sp using the S_* offsets: x0-x29, then lr
+        * together with the value sp had before the frame was allocated
+        * (sp + S_FRAME_SIZE).  The PC slot is written as zero and the
+        * PSTATE slot receives NZCV | DAIF | CurrentEL | SPSel.
+        * x0 and x1 are used as scratch after they have been stored.
+        */
+       .macro  save_all_base_regs
+       stp x0, x1, [sp, #S_X0]
+       stp x2, x3, [sp, #S_X2]
+       stp x4, x5, [sp, #S_X4]
+       stp x6, x7, [sp, #S_X6]
+       stp x8, x9, [sp, #S_X8]
+       stp x10, x11, [sp, #S_X10]
+       stp x12, x13, [sp, #S_X12]
+       stp x14, x15, [sp, #S_X14]
+       stp x16, x17, [sp, #S_X16]
+       stp x18, x19, [sp, #S_X18]
+       stp x20, x21, [sp, #S_X20]
+       stp x22, x23, [sp, #S_X22]
+       stp x24, x25, [sp, #S_X24]
+       stp x26, x27, [sp, #S_X26]
+       stp x28, x29, [sp, #S_X28]
+       add x0, sp, #S_FRAME_SIZE
+       stp lr, x0, [sp, #S_LR]
+       /*
+        * Construct a useful saved PSTATE
+        */
+       mrs x0, nzcv
+       mrs x1, daif
+       orr x0, x0, x1
+       mrs x1, CurrentEL
+       orr x0, x0, x1
+       mrs x1, SPSel
+       orr x0, x0, x1
+       stp xzr, x0, [sp, #S_PC]
+       .endm
+
+       /*
+        * Reload state from the frame at sp: only the N/Z/C/V bits of the
+        * saved PSTATE are written back (to NZCV), followed by x0-x29.
+        * lr and sp are not touched by this macro.
+        */
+       .macro  restore_all_base_regs
+       ldr x0, [sp, #S_PSTATE]
+       and x0, x0, #(PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT)
+       msr nzcv, x0
+       ldp x0, x1, [sp, #S_X0]
+       ldp x2, x3, [sp, #S_X2]
+       ldp x4, x5, [sp, #S_X4]
+       ldp x6, x7, [sp, #S_X6]
+       ldp x8, x9, [sp, #S_X8]
+       ldp x10, x11, [sp, #S_X10]
+       ldp x12, x13, [sp, #S_X12]
+       ldp x14, x15, [sp, #S_X14]
+       ldp x16, x17, [sp, #S_X16]
+       ldp x18, x19, [sp, #S_X18]
+       ldp x20, x21, [sp, #S_X20]
+       ldp x22, x23, [sp, #S_X22]
+       ldp x24, x25, [sp, #S_X24]
+       ldp x26, x27, [sp, #S_X26]
+       ldp x28, x29, [sp, #S_X28]
+       .endm
+
+/*
+ * Landing pad for functions with a return probe: allocates an S_FRAME_SIZE
+ * frame, saves the registers into it, calls trampoline_probe_handler() with
+ * the frame address in x0, and returns to the address that call yields.
+ */
+ENTRY(kretprobe_trampoline)
+       sub sp, sp, #S_FRAME_SIZE
+
+       save_all_base_regs
+
+       mov x0, sp
+       bl trampoline_probe_handler
+       /*
+        * Replace trampoline address in lr with actual orig_ret_addr return
+        * address.
+        */
+       mov lr, x0
+
+       /* lr must be set before this point: the restore overwrites x0 */
+       restore_all_base_regs
+
+       add sp, sp, #S_FRAME_SIZE
+       ret
+
+ENDPROC(kretprobe_trampoline)
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c

new file mode 100644 (file)

index 0000000..8977ce9
--- /dev/null
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -0,0 +1,217 @@
+/*
+ * arch/arm64/kernel/probes/simulate-insn.c
+ *
+ * Copyright (C) 2013 Linaro Limited.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+
+#include "simulate-insn.h"
+
+/*
+ * sign_extend(x, signbit): if bit @signbit of @x is set, set every bit above
+ * it as well; otherwise yield @x unchanged.  @x is evaluated twice, and the
+ * result keeps the (promoted) type of @x.
+ */
+#define sign_extend(x, signbit)                \
+       ((x) | (0 - ((x) & (1 << (signbit)))))
+
+/*
+ * Branch/literal displacements, in bytes.  Each macro extracts the immediate
+ * field of @insn, scales it by 4 and sign-extends it from the scaled field's
+ * top bit.  The simulate_*() callers store the result in an int.
+ */
+
+/* 26-bit immediate in bits [25:0] */
+#define bbl_displacement(insn)         \
+       sign_extend(((insn) & 0x3ffffff) << 2, 27)
+
+/* 19-bit immediate in bits [23:5] */
+#define bcond_displacement(insn)       \
+       sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+
+/* 19-bit immediate in bits [23:5] */
+#define cbz_displacement(insn) \
+       sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+
+/* 14-bit immediate in bits [18:5] */
+#define tbz_displacement(insn) \
+       sign_extend(((insn >> 5) & 0x3fff) << 2, 15)
+
+/* 19-bit immediate in bits [23:5] */
+#define ldr_displacement(insn) \
+       sign_extend(((insn >> 5) & 0x7ffff) << 2, 20)
+
+/* Write @val to 64-bit register @reg; writes to register numbers >= 31 are dropped. */
+static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val)
+{
+       if (reg >= 31)
+               return;
+
+       regs->regs[reg] = val;
+}
+
+/*
+ * Write the low 32 bits of @val to register @reg, clearing its upper half;
+ * writes to register numbers >= 31 are dropped.
+ */
+static inline void set_w_reg(struct pt_regs *regs, int reg, u64 val)
+{
+       if (reg >= 31)
+               return;
+
+       regs->regs[reg] = lower_32_bits(val);
+}
+
+/* Read 64-bit register @reg; register numbers >= 31 read as zero. */
+static inline u64 get_x_reg(struct pt_regs *regs, int reg)
+{
+       return (reg < 31) ? regs->regs[reg] : 0;
+}
+
+/* Read the low 32 bits of register @reg; register numbers >= 31 read as zero. */
+static inline u32 get_w_reg(struct pt_regs *regs, int reg)
+{
+       return (reg < 31) ? lower_32_bits(regs->regs[reg]) : 0;
+}
+
+/*
+ * CBZ condition: true when the register named in opcode bits [4:0] is zero.
+ * Opcode bit 31 selects the 64-bit view of the register, otherwise only the
+ * low 32 bits are tested.
+ */
+static bool __kprobes check_cbz(u32 opcode, struct pt_regs *regs)
+{
+       int rt = opcode & 0x1f;
+
+       if (opcode & (1 << 31))
+               return get_x_reg(regs, rt) == 0;
+
+       return get_w_reg(regs, rt) == 0;
+}
+
+/*
+ * CBNZ condition: true when the register named in opcode bits [4:0] is
+ * non-zero.  Opcode bit 31 selects the 64-bit view of the register,
+ * otherwise only the low 32 bits are tested.
+ */
+static bool __kprobes check_cbnz(u32 opcode, struct pt_regs *regs)
+{
+       int rt = opcode & 0x1f;
+
+       if (opcode & (1 << 31))
+               return get_x_reg(regs, rt) != 0;
+
+       return get_w_reg(regs, rt) != 0;
+}
+
+/*
+ * TBZ condition: true when the tested bit of the register named in opcode
+ * bits [4:0] is clear.  The bit number is opcode bit 31 (as bit 5) combined
+ * with opcode bits [23:19] (as bits 4:0).
+ */
+static bool __kprobes check_tbz(u32 opcode, struct pt_regs *regs)
+{
+       int rt = opcode & 0x1f;
+       int bit_hi = (opcode & (1 << 31)) >> 26;
+       int bit_lo = (opcode >> 19) & 0x1f;
+       int bit_pos = bit_hi | bit_lo;
+
+       return ((get_x_reg(regs, rt) >> bit_pos) & 0x1) == 0;
+}
+
+/*
+ * TBNZ condition: true when the tested bit of the register named in opcode
+ * bits [4:0] is set.  The bit number is opcode bit 31 (as bit 5) combined
+ * with opcode bits [23:19] (as bits 4:0).
+ */
+static bool __kprobes check_tbnz(u32 opcode, struct pt_regs *regs)
+{
+       int rt = opcode & 0x1f;
+       int bit_hi = (opcode & (1 << 31)) >> 26;
+       int bit_lo = (opcode >> 19) & 0x1f;
+       int bit_pos = bit_hi | bit_lo;
+
+       return ((get_x_reg(regs, rt) >> bit_pos) & 0x1) != 0;
+}
+
+/*
+ * instruction simulation functions
+ */
+
+/*
+ * Simulate ADR/ADRP located at @addr.
+ *
+ * The 21-bit signed immediate is rebuilt from opcode bits [23:5] (upper
+ * part) and [30:29] (low two bits).  With opcode bit 31 set the immediate is
+ * scaled by 4096 and added to @addr with its low 12 bits cleared; otherwise
+ * it is added to @addr directly.  The result goes to the register named in
+ * opcode bits [4:0] and the saved PC advances by one instruction.
+ */
+void __kprobes
+simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs)
+{
+       long imm, xn, val;
+
+       xn = opcode & 0x1f;
+       imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3);
+       imm = sign_extend(imm, 20);
+       if (opcode & 0x80000000)
+               val = (imm<<12) + (addr & 0xfffffffffffff000);
+       else
+               val = imm + addr;
+
+       set_x_reg(regs, xn, val);
+
+       instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
+
+/*
+ * Simulate B/BL located at @addr: the saved PC becomes @addr plus the
+ * encoded displacement.  When opcode bit 31 is set, x30 additionally
+ * receives the address of the following instruction.
+ */
+void __kprobes
+simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs)
+{
+       /* keep this an int: the displacement must be sign-extended when added */
+       int offset = bbl_displacement(opcode);
+       bool link = opcode & (1 << 31);
+
+       /* Link register is x30 */
+       if (link)
+               set_x_reg(regs, 30, addr + 4);
+
+       instruction_pointer_set(regs, addr + offset);
+}
+
+/*
+ * Simulate B.cond located at @addr: the condition in opcode bits [3:0] is
+ * evaluated against the low 32 bits of the saved PSTATE.  If it holds the
+ * saved PC becomes @addr plus the encoded displacement, otherwise it moves
+ * to the next instruction.
+ */
+void __kprobes
+simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs)
+{
+       int offset;
+
+       if (aarch32_opcode_cond_checks[opcode & 0xf](regs->pstate & 0xffffffff))
+               offset = bcond_displacement(opcode);
+       else
+               offset = 4;
+
+       instruction_pointer_set(regs, addr + offset);
+}
+
+/*
+ * Simulate BR/BLR/RET located at @addr: the saved PC becomes the value of
+ * the register named in opcode bits [9:5].  When opcode bits [22:21] equal 1
+ * (the linking form), x30 receives the address of the next instruction.
+ */
+void __kprobes
+simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs)
+{
+       int xn = (opcode >> 5) & 0x1f;
+
+       /* update pc first in case we're doing a "blr lr" */
+       instruction_pointer_set(regs, get_x_reg(regs, xn));
+
+       /* Link register is x30 */
+       if (((opcode >> 21) & 0x3) == 1)
+               set_x_reg(regs, 30, addr + 4);
+}
+
+/*
+ * Simulate CBZ/CBNZ located at @addr.  Opcode bit 24 selects CBNZ; when the
+ * selected condition holds the saved PC becomes @addr plus the encoded
+ * displacement, otherwise it moves to the next instruction.
+ */
+void __kprobes
+simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs)
+{
+       int offset = 4;
+       bool taken;
+
+       if (opcode & (1 << 24))
+               taken = check_cbnz(opcode, regs);
+       else
+               taken = check_cbz(opcode, regs);
+
+       if (taken)
+               offset = cbz_displacement(opcode);
+
+       instruction_pointer_set(regs, addr + offset);
+}
+
+/*
+ * Simulate TBZ/TBNZ located at @addr.  Opcode bit 24 selects TBNZ; when the
+ * selected condition holds the saved PC becomes @addr plus the encoded
+ * displacement, otherwise it moves to the next instruction.
+ */
+void __kprobes
+simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs)
+{
+       int offset = 4;
+       bool taken;
+
+       if (opcode & (1 << 24))
+               taken = check_tbnz(opcode, regs);
+       else
+               taken = check_tbz(opcode, regs);
+
+       if (taken)
+               offset = tbz_displacement(opcode);
+
+       instruction_pointer_set(regs, addr + offset);
+}
+
+/*
+ * Simulate LDR (literal) located at @addr.
+ *
+ * The literal lives at @addr plus the encoded displacement.  Opcode bit 30
+ * selects the access size: set means a 64-bit load into an X register,
+ * clear means a 32-bit load into a W register (upper half zeroed).  The
+ * load is performed with the matching width so that a W load neither reads
+ * past the 4-byte literal nor picks the wrong word on big-endian kernels.
+ * The saved PC advances by one instruction.
+ */
+void __kprobes
+simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs)
+{
+       unsigned long load_addr;
+       int xn = opcode & 0x1f;
+       int disp;
+
+       disp = ldr_displacement(opcode);
+       load_addr = addr + disp;
+
+       if (opcode & (1 << 30)) /* x0-x30 */
+               set_x_reg(regs, xn, *(u64 *)load_addr);
+       else                    /* w0-w30 */
+               set_w_reg(regs, xn, *(u32 *)load_addr);
+
+       instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
+
+/*
+ * Simulate LDRSW (literal) located at @addr: load the signed 32-bit literal
+ * at @addr plus the encoded displacement, sign-extend it into the X register
+ * named in opcode bits [4:0], and advance the saved PC by one instruction.
+ */
+void __kprobes
+simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs)
+{
+       int xn = opcode & 0x1f;
+       int disp = ldr_displacement(opcode);
+       s32 literal = *(s32 *)(addr + disp);
+
+       /* s32 -> u64 conversion performs the sign extension */
+       set_x_reg(regs, xn, literal);
+
+       instruction_pointer_set(regs, instruction_pointer(regs) + 4);
+}
diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h

new file mode 100644 (file)

index 0000000..050bde6
--- /dev/null
+++ b/arch/arm64/kernel/probes/simulate-insn.h
@@ -0,0 +1,28 @@
+/*
+ * arch/arm64/kernel/probes/simulate-insn.h
+ *
+ * Copyright (C) 2013 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _ARM_KERNEL_KPROBES_SIMULATE_INSN_H
+#define _ARM_KERNEL_KPROBES_SIMULATE_INSN_H
+
+void simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_b_bl(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_b_cond(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_br_blr_ret(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs);
+
+#endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c

index f75b540bc3b4b0daae4a8773cd0eddf1a86a90aa..80624829db613961b7a088ce18d8591361b448c7 100644 (file)
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -46,6 +46,7 @@
  #include <linux/notifier.h>
  #include <trace/events/power.h>
  
+#include <asm/alternative.h>
  #include <asm/compat.h>
  #include <asm/cacheflush.h>
  #include <asm/fpsimd.h>
@@ -280,6 +281,9 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
         } else {
                 memset(childregs, 0, sizeof(struct pt_regs));
                 childregs->pstate = PSR_MODE_EL1h;
+               if (IS_ENABLED(CONFIG_ARM64_UAO) &&
+                   cpus_have_cap(ARM64_HAS_UAO))
+                       childregs->pstate |= PSR_UAO_BIT;
                 p->thread.cpu_context.x19 = stack_start;
                 p->thread.cpu_context.x20 = stk_sz;
         }
@@ -308,6 +312,17 @@ static void tls_thread_switch(struct task_struct *next)
         : : "r" (tpidr), "r" (tpidrro));
  }
  
+/*
+ * Restore the UAO state depending on next's addr_limit
+ *
+ * Compiled out unless CONFIG_ARM64_UAO is enabled.  PSTATE.UAO is set to 1
+ * when @next runs with addr_limit == KERNEL_DS and to 0 otherwise; each
+ * instruction is an ALTERNATIVE that stays a nop unless the ARM64_HAS_UAO
+ * capability is present.
+ */
+static void uao_thread_switch(struct task_struct *next)
+{
+       if (IS_ENABLED(CONFIG_ARM64_UAO)) {
+               if (task_thread_info(next)->addr_limit == KERNEL_DS)
+                       asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO));
+               else
+                       asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO));
+       }
+}
+
  /*
   * Thread switching.
   */
@@ -320,6 +335,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
         tls_thread_switch(next);
         hw_breakpoint_thread_switch(next);
         contextidr_thread_switch(next);
+       uao_thread_switch(next);
  
         /*
          * Complete any pending TLB or cache maintenance on this CPU in case
@@ -344,11 +360,14 @@ unsigned long get_wchan(struct task_struct *p)
         frame.fp = thread_saved_fp(p);
         frame.sp = thread_saved_sp(p);
         frame.pc = thread_saved_pc(p);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = p->curr_ret_stack;
+#endif
         stack_page = (unsigned long)task_stack_page(p);
         do {
                 if (frame.sp < stack_page ||
                     frame.sp >= stack_page + THREAD_SIZE ||
-                   unwind_frame(&frame))
+                   unwind_frame(p, &frame))
                         return 0;
                 if (!in_sched_functions(frame.pc))
                         return frame.pc;
diff --git a/arch/arm64/kernel/psci-call.S b/arch/arm64/kernel/psci-call.S

deleted file mode 100644 (file)

index cf83e61..0000000
--- a/arch/arm64/kernel/psci-call.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * Copyright (C) 2015 ARM Limited
- *
- * Author: Will Deacon <will.deacon@arm.com>
- */
-
-#include <linux/linkage.h>
-
-/* int __invoke_psci_fn_hvc(u64 function_id, u64 arg0, u64 arg1, u64 arg2) */
-ENTRY(__invoke_psci_fn_hvc)
-       hvc     #0
-       ret
-ENDPROC(__invoke_psci_fn_hvc)
-
-/* int __invoke_psci_fn_smc(u64 function_id, u64 arg0, u64 arg1, u64 arg2) */
-ENTRY(__invoke_psci_fn_smc)
-       smc     #0
-       ret
-ENDPROC(__invoke_psci_fn_smc)
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c

index f67f35b6edb12e4d34e1db17750b07a0bec72e39..42816bebb1e0f732d788780fde431028f202c31a 100644 (file)
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -20,7 +20,6 @@
  #include <linux/smp.h>
  #include <linux/delay.h>
  #include <linux/psci.h>
-#include <linux/slab.h>
  
  #include <uapi/linux/psci.h>
  
@@ -28,73 +27,6 @@
  #include <asm/cpu_ops.h>
  #include <asm/errno.h>
  #include <asm/smp_plat.h>
-#include <asm/suspend.h>
-
-static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
-
-static int __maybe_unused cpu_psci_cpu_init_idle(unsigned int cpu)
-{
-       int i, ret, count = 0;
-       u32 *psci_states;
-       struct device_node *state_node, *cpu_node;
-
-       cpu_node = of_get_cpu_node(cpu, NULL);
-       if (!cpu_node)
-               return -ENODEV;
-
-       /*
-        * If the PSCI cpu_suspend function hook has not been initialized
-        * idle states must not be enabled, so bail out
-        */
-       if (!psci_ops.cpu_suspend)
-               return -EOPNOTSUPP;
-
-       /* Count idle states */
-       while ((state_node = of_parse_phandle(cpu_node, "cpu-idle-states",
-                                             count))) {
-               count++;
-               of_node_put(state_node);
-       }
-
-       if (!count)
-               return -ENODEV;
-
-       psci_states = kcalloc(count, sizeof(*psci_states), GFP_KERNEL);
-       if (!psci_states)
-               return -ENOMEM;
-
-       for (i = 0; i < count; i++) {
-               u32 state;
-
-               state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
-
-               ret = of_property_read_u32(state_node,
-                                          "arm,psci-suspend-param",
-                                          &state);
-               if (ret) {
-                       pr_warn(" * %s missing arm,psci-suspend-param property\n",
-                               state_node->full_name);
-                       of_node_put(state_node);
-                       goto free_mem;
-               }
-
-               of_node_put(state_node);
-               pr_debug("psci-power-state %#x index %d\n", state, i);
-               if (!psci_power_state_is_valid(state)) {
-                       pr_warn("Invalid PSCI power state %#x\n", state);
-                       ret = -EINVAL;
-                       goto free_mem;
-               }
-               psci_states[i] = state;
-       }
-       /* Idle states parsed correctly, initialize per-cpu pointer */
-       per_cpu(psci_power_state, cpu) = psci_states;
-       return 0;
-
-free_mem:
-       kfree(psci_states);
-       return ret;
-}
  
  static int __init cpu_psci_cpu_init(unsigned int cpu)
  {
@@ -178,38 +110,11 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
  }
  #endif
  
-static int psci_suspend_finisher(unsigned long index)
-{
-       u32 *state = __this_cpu_read(psci_power_state);
-
-       return psci_ops.cpu_suspend(state[index - 1],
-                                   virt_to_phys(cpu_resume));
-}
-
-static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
-{
-       int ret;
-       u32 *state = __this_cpu_read(psci_power_state);
-       /*
-        * idle state index 0 corresponds to wfi, should never be called
-        * from the cpu_suspend operations
-        */
-       if (WARN_ON_ONCE(!index))
-               return -EINVAL;
-
-       if (!psci_power_state_loses_context(state[index - 1]))
-               ret = psci_ops.cpu_suspend(state[index - 1], 0);
-       else
-               ret = cpu_suspend(index, psci_suspend_finisher);
-
-       return ret;
-}
-
  const struct cpu_operations cpu_psci_ops = {
         .name           = "psci",
  #ifdef CONFIG_CPU_IDLE
-       .cpu_init_idle  = cpu_psci_cpu_init_idle,
-       .cpu_suspend    = cpu_psci_cpu_suspend,
+       .cpu_init_idle  = psci_cpu_init_idle,
+       .cpu_suspend    = psci_cpu_suspend_enter,
  #endif
         .cpu_init       = cpu_psci_cpu_init,
         .cpu_prepare    = cpu_psci_cpu_prepare,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c

index 55909b2208cc08ce43e415f13dbc5e60836b90b8..c5ef059598135eee7ee8894330e269d1d2b1bec2 100644 (file)
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -49,6 +49,106 @@
  #define CREATE_TRACE_POINTS
  #include <trace/events/syscalls.h>
  
+/* One register-name lookup entry: @name maps to a byte @offset in struct pt_regs. */
+struct pt_regs_offset {
+       const char *name;
+       int offset;
+};
+
+/* Entry for a named struct pt_regs member; the member name doubles as the register name. */
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+/* Table terminator: lookups stop at the first entry whose name is NULL. */
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+/* Entry for general-purpose register number r, named "x<r>" and stored in regs[r]. */
+#define GPR_OFFSET_NAME(r) \
+       {.name = "x" #r, .offset = offsetof(struct pt_regs, regs[r])}
+
+/*
+ * Register name -> struct pt_regs offset table used by
+ * regs_query_register_offset().  "lr" is an alias for x30 (same offset).
+ * The table is terminated by REG_OFFSET_END.
+ */
+static const struct pt_regs_offset regoffset_table[] = {
+       GPR_OFFSET_NAME(0),
+       GPR_OFFSET_NAME(1),
+       GPR_OFFSET_NAME(2),
+       GPR_OFFSET_NAME(3),
+       GPR_OFFSET_NAME(4),
+       GPR_OFFSET_NAME(5),
+       GPR_OFFSET_NAME(6),
+       GPR_OFFSET_NAME(7),
+       GPR_OFFSET_NAME(8),
+       GPR_OFFSET_NAME(9),
+       GPR_OFFSET_NAME(10),
+       GPR_OFFSET_NAME(11),
+       GPR_OFFSET_NAME(12),
+       GPR_OFFSET_NAME(13),
+       GPR_OFFSET_NAME(14),
+       GPR_OFFSET_NAME(15),
+       GPR_OFFSET_NAME(16),
+       GPR_OFFSET_NAME(17),
+       GPR_OFFSET_NAME(18),
+       GPR_OFFSET_NAME(19),
+       GPR_OFFSET_NAME(20),
+       GPR_OFFSET_NAME(21),
+       GPR_OFFSET_NAME(22),
+       GPR_OFFSET_NAME(23),
+       GPR_OFFSET_NAME(24),
+       GPR_OFFSET_NAME(25),
+       GPR_OFFSET_NAME(26),
+       GPR_OFFSET_NAME(27),
+       GPR_OFFSET_NAME(28),
+       GPR_OFFSET_NAME(29),
+       GPR_OFFSET_NAME(30),
+       /* alias: same slot as x30 */
+       {.name = "lr", .offset = offsetof(struct pt_regs, regs[30])},
+       REG_OFFSET_NAME(sp),
+       REG_OFFSET_NAME(pc),
+       REG_OFFSET_NAME(pstate),
+       REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - look up a register's offset by name
+ * @name:      register name to search for (e.g. "x0", "lr", "sp")
+ *
+ * Scans regoffset_table for an entry whose name matches @name exactly and
+ * returns that register's byte offset within struct pt_regs.  Returns
+ * -EINVAL when no entry matches.
+ */
+int regs_query_register_offset(const char *name)
+{
+       const struct pt_regs_offset *entry = regoffset_table;
+
+       while (entry->name != NULL) {
+               if (strcmp(entry->name, name) == 0)
+                       return entry->offset;
+               entry++;
+       }
+
+       return -EINVAL;
+}
+
+/**
+ * regs_within_kernel_stack() - test whether an address is on the kernel stack
+ * @regs:      pt_regs supplying the kernel stack pointer.
+ * @addr:      address to test.
+ *
+ * Returns true when @addr falls in the same THREAD_SIZE-aligned region as
+ * the kernel stack pointer of @regs, false otherwise.
+ */
+static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
+{
+       unsigned long stack_mask = ~(THREAD_SIZE - 1);
+       unsigned long stack_base = kernel_stack_pointer(regs) & stack_mask;
+
+       return (addr & stack_mask) == stack_base;
+}
+
+/**
+ * regs_get_kernel_stack_nth() - read the Nth word of the kernel stack
+ * @regs:      pt_regs supplying the kernel stack pointer.
+ * @n:         index of the stack word, counted from the stack pointer.
+ *
+ * Returns the @n th unsigned long above the kernel stack pointer of @regs.
+ * If that slot is not within the kernel stack, returns 0 without reading it.
+ */
+unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
+{
+       unsigned long *slot = (unsigned long *)kernel_stack_pointer(regs) + n;
+
+       if (!regs_within_kernel_stack(regs, (unsigned long)slot))
+               return 0;
+
+       return *slot;
+}
+
  /*
   * TODO: does not yet catch signals sent when the child dies.
   * in exit.c or in signal.c.
diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c

index 6c4fd2810ecb35b3e648db18923d21f19a50f422..1718706fde83604f78d81d850bf8827705338f1a 100644 (file)
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -43,8 +43,11 @@ void *return_address(unsigned int level)
         frame.fp = (unsigned long)__builtin_frame_address(0);
         frame.sp = current_stack_pointer;
         frame.pc = (unsigned long)return_address; /* dummy */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = current->curr_ret_stack;
+#endif
  
-       walk_stackframe(&frame, save_return_addr, &data);
+       walk_stackframe(current, &frame, save_return_addr, &data);
  
         if (!data.level)
                 return data.addr;
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c

index 8119479147db147c33800f76aa0d07c6072e8559..1e33d967c0ae5ce731a1980e1a41e6d7939ce568 100644 (file)
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -62,6 +62,7 @@
  #include <asm/memblock.h>
  #include <asm/efi.h>
  #include <asm/xen/hypervisor.h>
+#include <asm/mmu_context.h>
  
  phys_addr_t __fdt_pointer __initdata;
  
@@ -174,7 +175,6 @@ static void __init smp_build_mpidr_hash(void)
          */
         if (mpidr_hash_size() > 4 * num_possible_cpus())
                 pr_warn("Large number of MPIDR hash buckets detected\n");
-       __flush_dcache_area(&mpidr_hash, sizeof(struct mpidr_hash));
  }
  
  static void __init setup_machine_fdt(phys_addr_t dt_phys)
@@ -313,6 +313,12 @@ void __init setup_arch(char **cmdline_p)
          */
         local_async_enable();
  
+       /*
+        * TTBR0 is only used for the identity mapping at this stage. Make it
+        * point to zero page to avoid speculatively fetching new entries.
+        */
+       cpu_uninstall_idmap();
+
         efi_init();
         arm64_memblock_init();
  
@@ -381,3 +387,32 @@ static int __init topology_init(void)
         return 0;
  }
  subsys_initcall(topology_init);
+
+/*
+ * Panic notifier callback: print how far the kernel image was moved from
+ * KIMAGE_VADDR.  Reports "disabled" when CONFIG_RANDOMIZE_BASE is off or
+ * the offset is zero.  Always returns 0; @self, @v and @p are unused.
+ */
+static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
+                             void *p)
+{
+       u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+
+       if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE) || kaslr_offset == 0) {
+               pr_emerg("Kernel Offset: disabled\n");
+               return 0;
+       }
+
+       pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
+                kaslr_offset, KIMAGE_VADDR);
+       return 0;
+}
+
+/* Notifier block that routes panic notifications to dump_kernel_offset(). */
+static struct notifier_block kernel_offset_notifier = {
+       .notifier_call = dump_kernel_offset
+};
+
+/*
+ * Initcall: hook kernel_offset_notifier into panic_notifier_list so the
+ * kernel offset is printed on panic.  Always returns 0; the registration
+ * result is not checked.
+ */
+static int __init register_kernel_offset_dumper(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list,
+                                      &kernel_offset_notifier);
+       return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S

index f586f7c875e29295b6094efd4967e26cb6d01a99..c2bf5a58039f1b7f749912bc18f58bf5007e80fb 100644 (file)
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -49,39 +49,32 @@
         orr     \dst, \dst, \mask               // dst|=(aff3>>rs3)
         .endm
  /*
- * Save CPU state for a suspend and execute the suspend finisher.
- * On success it will return 0 through cpu_resume - ie through a CPU
- * soft/hard reboot from the reset vector.
- * On failure it returns the suspend finisher return value or force
- * -EOPNOTSUPP if the finisher erroneously returns 0 (the suspend finisher
- * is not allowed to return, if it does this must be considered failure).
- * It saves callee registers, and allocates space on the kernel stack
- * to save the CPU specific registers + some other data for resume.
+ * Save CPU state in the provided sleep_stack_data area, and publish its
+ * location for cpu_resume()'s use in sleep_save_stash.
   *
- *  x0 = suspend finisher argument
- *  x1 = suspend finisher function pointer
+ * cpu_resume() will restore this saved state, and return. Because the
+ * link-register is saved and restored, it will appear to return from this
+ * function. So that the caller can tell the suspend/resume paths apart,
+ * __cpu_suspend_enter() will always return a non-zero value, whereas the
+ * path through cpu_resume() will return 0.
+ *
+ *  x0 = struct sleep_stack_data area
   */
  ENTRY(__cpu_suspend_enter)
-       stp     x29, lr, [sp, #-96]!
-       stp     x19, x20, [sp,#16]
-       stp     x21, x22, [sp,#32]
-       stp     x23, x24, [sp,#48]
-       stp     x25, x26, [sp,#64]
-       stp     x27, x28, [sp,#80]
-       /*
-        * Stash suspend finisher and its argument in x20 and x19
-        */
-       mov     x19, x0
-       mov     x20, x1
+       stp     x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS]
+       stp     x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16]
+       stp     x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32]
+       stp     x23, x24, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+48]
+       stp     x25, x26, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+64]
+       stp     x27, x28, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+80]
+
+       /* save the sp in cpu_suspend_ctx */
         mov     x2, sp
-       sub     sp, sp, #CPU_SUSPEND_SZ // allocate cpu_suspend_ctx
-       mov     x0, sp
-       /*
-        * x0 now points to struct cpu_suspend_ctx allocated on the stack
-        */
-       str     x2, [x0, #CPU_CTX_SP]
-       ldr     x1, =sleep_save_sp
-       ldr     x1, [x1, #SLEEP_SAVE_SP_VIRT]
+       str     x2, [x0, #SLEEP_STACK_DATA_SYSTEM_REGS + CPU_CTX_SP]
+
+       /* find the mpidr_hash */
+       ldr     x1, =sleep_save_stash
+       ldr     x1, [x1]
         mrs     x7, mpidr_el1
         ldr     x9, =mpidr_hash
         ldr     x10, [x9, #MPIDR_HASH_MASK]
@@ -93,70 +86,28 @@ ENTRY(__cpu_suspend_enter)
         ldp     w5, w6, [x9, #(MPIDR_HASH_SHIFTS + 8)]
         compute_mpidr_hash x8, x3, x4, x5, x6, x7, x10
         add     x1, x1, x8, lsl #3
-       bl      __cpu_suspend_save
-       /*
-        * Grab suspend finisher in x20 and its argument in x19
-        */
-       mov     x0, x19
-       mov     x1, x20
-       /*
-        * We are ready for power down, fire off the suspend finisher
-        * in x1, with argument in x0
-        */
-       blr     x1
-        /*
-        * Never gets here, unless suspend finisher fails.
-        * Successful cpu_suspend should return from cpu_resume, returning
-        * through this code path is considered an error
-        * If the return value is set to 0 force x0 = -EOPNOTSUPP
-        * to make sure a proper error condition is propagated
-        */
-       cmp     x0, #0
-       mov     x3, #-EOPNOTSUPP
-       csel    x0, x3, x0, eq
-       add     sp, sp, #CPU_SUSPEND_SZ // rewind stack pointer
-       ldp     x19, x20, [sp, #16]
-       ldp     x21, x22, [sp, #32]
-       ldp     x23, x24, [sp, #48]
-       ldp     x25, x26, [sp, #64]
-       ldp     x27, x28, [sp, #80]
-       ldp     x29, lr, [sp], #96
+
+       str     x0, [x1]
+       add     x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS
+       stp     x29, lr, [sp, #-16]!
+       bl      cpu_do_suspend
+       ldp     x29, lr, [sp], #16
+       mov     x0, #1
         ret
  ENDPROC(__cpu_suspend_enter)
         .ltorg
  
-/*
- * x0 must contain the sctlr value retrieved from restored context
- */
-       .pushsection    ".idmap.text", "ax"
-ENTRY(cpu_resume_mmu)
-       ldr     x3, =cpu_resume_after_mmu
-       msr     sctlr_el1, x0           // restore sctlr_el1
-       isb
-       /*
-        * Invalidate the local I-cache so that any instructions fetched
-        * speculatively from the PoC are discarded, since they may have
-        * been dynamically patched at the PoU.
-        */
-       ic      iallu
-       dsb     nsh
-       isb
-       br      x3                      // global jump to virtual address
-ENDPROC(cpu_resume_mmu)
-       .popsection
-cpu_resume_after_mmu:
-       mov     x0, #0                  // return zero on success
-       ldp     x19, x20, [sp, #16]
-       ldp     x21, x22, [sp, #32]
-       ldp     x23, x24, [sp, #48]
-       ldp     x25, x26, [sp, #64]
-       ldp     x27, x28, [sp, #80]
-       ldp     x29, lr, [sp], #96
-       ret
-ENDPROC(cpu_resume_after_mmu)
-
  ENTRY(cpu_resume)
         bl      el2_setup               // if in EL2 drop to EL1 cleanly
+       /* enable the MMU early - so we can access sleep_save_stash by va */
+       adr_l   lr, __enable_mmu        /* __cpu_setup will return here */
+       ldr     x27, =_cpu_resume       /* __enable_mmu will branch here */
+       adrp    x25, idmap_pg_dir
+       adrp    x26, swapper_pg_dir
+       b       __cpu_setup
+ENDPROC(cpu_resume)
+
+ENTRY(_cpu_resume)
         mrs     x1, mpidr_el1
         adrp    x8, mpidr_hash
         add x8, x8, #:lo12:mpidr_hash // x8 = struct mpidr_hash phys address
@@ -166,17 +117,27 @@ ENTRY(cpu_resume)
         ldp     w5, w6, [x8, #(MPIDR_HASH_SHIFTS + 8)]
         compute_mpidr_hash x7, x3, x4, x5, x6, x1, x2
          /* x7 contains hash index, let's use it to grab context pointer */
-       ldr_l   x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS
+       ldr_l   x0, sleep_save_stash
         ldr     x0, [x0, x7, lsl #3]
+       add     x29, x0, #SLEEP_STACK_DATA_CALLEE_REGS
+       add     x0, x0, #SLEEP_STACK_DATA_SYSTEM_REGS
         /* load sp from context */
         ldr     x2, [x0, #CPU_CTX_SP]
-       /* load physical address of identity map page table in x1 */
-       adrp    x1, idmap_pg_dir
         mov     sp, x2
+       /* save thread_info */
+       and     x2, x2, #~(THREAD_SIZE - 1)
+       msr     sp_el0, x2
         /*
-        * cpu_do_resume expects x0 to contain context physical address
-        * pointer and x1 to contain physical address of 1:1 page tables
+        * cpu_do_resume expects x0 to contain context address pointer
          */
-       bl      cpu_do_resume           // PC relative jump, MMU off
-       b       cpu_resume_mmu          // Resume MMU, never returns
-ENDPROC(cpu_resume)
+       bl      cpu_do_resume
+
+       ldp     x19, x20, [x29, #16]
+       ldp     x21, x22, [x29, #32]
+       ldp     x23, x24, [x29, #48]
+       ldp     x25, x26, [x29, #64]
+       ldp     x27, x28, [x29, #80]
+       ldp     x29, lr, [x29]
+       mov     x0, #0
+       ret
+ENDPROC(_cpu_resume)
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S

new file mode 100644 (file)

index 0000000..ae0496f
--- /dev/null
+++ b/arch/arm64/kernel/smccc-call.S
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License Version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+       .macro SMCCC instr
+       .cfi_startproc
+       \instr  #0
+       ldr     x4, [sp]
+       stp     x0, x1, [x4, #ARM_SMCCC_RES_X0_OFFS]
+       stp     x2, x3, [x4, #ARM_SMCCC_RES_X2_OFFS]
+       ret
+       .cfi_endproc
+       .endm
+
+/*
+ * void arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2,
+ *               unsigned long a3, unsigned long a4, unsigned long a5,
+ *               unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_smc)
+       SMCCC   smc
+ENDPROC(arm_smccc_smc)
+
+/*
+ * void arm_smccc_hvc(unsigned long a0, unsigned long a1, unsigned long a2,
+ *               unsigned long a3, unsigned long a4, unsigned long a5,
+ *               unsigned long a6, unsigned long a7, struct arm_smccc_res *res)
+ */
+ENTRY(arm_smccc_hvc)
+       SMCCC   hvc
+ENDPROC(arm_smccc_hvc)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c

index f3c3d8fee5bab2dbeec427266c23ae90badc9e2d..a84623d91410d17daf444995c6c8ec1baceae939 100644 (file)
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -70,6 +70,7 @@ enum ipi_msg_type {
         IPI_CPU_STOP,
         IPI_TIMER,
         IPI_IRQ_WORK,
+       IPI_WAKEUP
  };
  
  /*
@@ -149,9 +150,7 @@ asmlinkage void secondary_start_kernel(void)
          * TTBR0 is only used for the identity mapping at this stage. Make it
          * point to zero page to avoid speculatively fetching new entries.
          */
-       cpu_set_reserved_ttbr0();
-       local_flush_tlb_all();
-       cpu_set_default_tcr_t0sz();
+       cpu_uninstall_idmap();
  
         preempt_disable();
         trace_hardirqs_off();
@@ -444,6 +443,17 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
         /* map the logical cpu id to cpu MPIDR */
         cpu_logical_map(cpu_count) = hwid;
  
+       /*
+        * Set-up the ACPI parking protocol cpu entries
+        * while initializing the cpu_logical_map to
+        * avoid parsing MADT entries multiple times for
+        * nothing (ie a valid cpu_logical_map entry should
+        * contain a valid parking protocol data set to
+        * initialize the cpu if the parking protocol is
+        * the only available enable method).
+        */
+       acpi_set_mailbox_entry(cpu_count, processor);
+
         cpu_count++;
  }
  
@@ -626,6 +636,7 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
         S(IPI_CPU_STOP, "CPU stop interrupts"),
         S(IPI_TIMER, "Timer broadcast interrupts"),
         S(IPI_IRQ_WORK, "IRQ work interrupts"),
+       S(IPI_WAKEUP, "CPU wake-up interrupts"),
  };
  
  static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
@@ -669,6 +680,13 @@ void arch_send_call_function_single_ipi(int cpu)
         smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC);
  }
  
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+{
+       smp_cross_call(mask, IPI_WAKEUP);
+}
+#endif
+
  #ifdef CONFIG_IRQ_WORK
  void arch_irq_work_raise(void)
  {
@@ -746,6 +764,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
                 break;
  #endif
  
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+       case IPI_WAKEUP:
+               WARN_ONCE(!acpi_parking_protocol_valid(cpu),
+                         "CPU%u: Wake-up IPI outside the ACPI parking protocol\n",
+                         cpu);
+               break;
+#endif
+
         default:
                 pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
                 break;
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c

index ccb6078ed9f20fb55132deb48504df3a3134a784..cfd46c227c8cbd7c57c9b88c1d6189404f0bdcfa 100644 (file)
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -17,9 +17,11 @@
   */
  #include <linux/kernel.h>
  #include <linux/export.h>
+#include <linux/ftrace.h>
  #include <linux/sched.h>
  #include <linux/stacktrace.h>
  
+#include <asm/irq.h>
  #include <asm/stacktrace.h>
  
  /*
@@ -35,25 +37,82 @@
   *     ldp     x29, x30, [sp]
   *     add     sp, sp, #0x10
   */
-int notrace unwind_frame(struct stackframe *frame)
+int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
  {
         unsigned long high, low;
         unsigned long fp = frame->fp;
+       unsigned long irq_stack_ptr;
+
+       /*
+        * Switching between stacks is valid when tracing current and in
+        * non-preemptible context.
+        */
+       if (tsk == current && !preemptible())
+               irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+       else
+               irq_stack_ptr = 0;
  
         low  = frame->sp;
-       high = ALIGN(low, THREAD_SIZE);
+       /* irq stacks are not THREAD_SIZE aligned */
+       if (on_irq_stack(frame->sp, raw_smp_processor_id()))
+               high = irq_stack_ptr;
+       else
+               high = ALIGN(low, THREAD_SIZE) - 0x20;
  
-       if (fp < low || fp > high - 0x18 || fp & 0xf)
+       if (fp < low || fp > high || fp & 0xf)
                 return -EINVAL;
  
         frame->sp = fp + 0x10;
         frame->fp = *(unsigned long *)(fp);
         frame->pc = *(unsigned long *)(fp + 8);
  
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       if (tsk && tsk->ret_stack &&
+                       (frame->pc == (unsigned long)return_to_handler)) {
+               /*
+                * This is a case where function graph tracer has
+                * modified a return address (LR) in a stack frame
+                * to hook a function return.
+                * So replace it with the original value.
+                */
+               frame->pc = tsk->ret_stack[frame->graph--].ret;
+       }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+       /*
+        * Check whether we are going to walk through from interrupt stack
+        * to task stack.
+        * If we reach the end of the stack - and it's an interrupt stack,
+        * unpack the dummy frame to find the original elr.
+        *
+        * Check the frame->fp we read from the bottom of the irq_stack,
+        * and the original task stack pointer are both in current->stack.
+        */
+       if (frame->sp == irq_stack_ptr) {
+               struct pt_regs *irq_args;
+               unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr);
+
+               if (object_is_on_stack((void *)orig_sp) &&
+                  object_is_on_stack((void *)frame->fp)) {
+                       frame->sp = orig_sp;
+
+                       /* orig_sp is the saved pt_regs, find the elr */
+                       irq_args = (struct pt_regs *)orig_sp;
+                       frame->pc = irq_args->pc;
+               } else {
+                       /*
+                        * This frame has a non-standard format, and we
+                        * didn't fix it, because the data looked wrong.
+                        * Refuse to output this frame.
+                        */
+                       return -EINVAL;
+               }
+       }
+
         return 0;
  }
  
-void notrace walk_stackframe(struct stackframe *frame,
+void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
                      int (*fn)(struct stackframe *, void *), void *data)
  {
         while (1) {
@@ -61,7 +120,7 @@ void notrace walk_stackframe(struct stackframe *frame,
  
                 if (fn(frame, data))
                         break;
-               ret = unwind_frame(frame);
+               ret = unwind_frame(tsk, frame);
                 if (ret < 0)
                         break;
         }
@@ -112,8 +171,11 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
                 frame.sp = current_stack_pointer;
                 frame.pc = (unsigned long)save_stack_trace_tsk;
         }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = tsk->curr_ret_stack;
+#endif
  
-       walk_stackframe(&frame, save_trace, &data);
+       walk_stackframe(tsk, &frame, save_trace, &data);
         if (trace->nr_entries < trace->max_entries)
                 trace->entries[trace->nr_entries++] = ULONG_MAX;
  }
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c

index 00c1372bf57ba19e74d6bb87cc0fdc8e6ead8837..5a0b1088c17c0fa0d9998ee03a67791d111b2e75 100644 (file)
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -12,30 +12,11 @@
  #include <asm/suspend.h>
  #include <asm/tlbflush.h>
  
-extern int __cpu_suspend_enter(unsigned long arg, int (*fn)(unsigned long));
  /*
- * This is called by __cpu_suspend_enter() to save the state, and do whatever
- * flushing is required to ensure that when the CPU goes to sleep we have
- * the necessary data available when the caches are not searched.
- *
- * ptr: CPU context virtual address
- * save_ptr: address of the location where the context physical address
- *           must be saved
+ * This is allocated by cpu_suspend_init(), and used to store a pointer to
+ * the 'struct sleep_stack_data' that contains a particular CPU's state.
   */
-void notrace __cpu_suspend_save(struct cpu_suspend_ctx *ptr,
-                               phys_addr_t *save_ptr)
-{
-       *save_ptr = virt_to_phys(ptr);
-
-       cpu_do_suspend(ptr);
-       /*
-        * Only flush the context that must be retrieved with the MMU
-        * off. VA primitives ensure the flush is applied to all
-        * cache levels so context is pushed to DRAM.
-        */
-       __flush_dcache_area(ptr, sizeof(*ptr));
-       __flush_dcache_area(save_ptr, sizeof(*save_ptr));
-}
+unsigned long *sleep_save_stash;
  
  /*
   * This hook is provided so that cpu_suspend code can restore HW
@@ -53,6 +34,30 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
         hw_breakpoint_restore = hw_bp_restore;
  }
  
+void notrace __cpu_suspend_exit(void)
+{
+       /*
+        * We are resuming from reset with the idmap active in TTBR0_EL1.
+        * We must uninstall the idmap and restore the expected MMU
+        * state before we can possibly return to userspace.
+        */
+       cpu_uninstall_idmap();
+
+       /*
+        * Restore per-cpu offset before any kernel
+        * subsystem relying on it has a chance to run.
+        */
+       set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+
+       /*
+        * Restore HW breakpoint registers to sane values
+        * before debug exceptions are possibly reenabled
+        * through local_dbg_restore.
+        */
+       if (hw_breakpoint_restore)
+               hw_breakpoint_restore(NULL);
+}
+
  /*
   * cpu_suspend
   *
@@ -62,9 +67,9 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
   */
  int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
  {
-       struct mm_struct *mm = current->active_mm;
-       int ret;
+       int ret = 0;
         unsigned long flags;
+       struct sleep_stack_data state;
  
         /*
          * From this point debug exceptions are disabled to prevent
@@ -80,37 +85,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
          */
         pause_graph_tracing();
  
-       /*
-        * mm context saved on the stack, it will be restored when
-        * the cpu comes out of reset through the identity mapped
-        * page tables, so that the thread address space is properly
-        * set-up on function return.
-        */
-       ret = __cpu_suspend_enter(arg, fn);
-       if (ret == 0) {
-               /*
-                * We are resuming from reset with TTBR0_EL1 set to the
-                * idmap to enable the MMU; set the TTBR0 to the reserved
-                * page tables to prevent speculative TLB allocations, flush
-                * the local tlb and set the default tcr_el1.t0sz so that
-                * the TTBR0 address space set-up is properly restored.
-                * If the current active_mm != &init_mm we entered cpu_suspend
-                * with mappings in TTBR0 that must be restored, so we switch
-                * them back to complete the address space configuration
-                * restoration before returning.
-                */
-               cpu_set_reserved_ttbr0();
-               local_flush_tlb_all();
-               cpu_set_default_tcr_t0sz();
-
-               if (mm != &init_mm)
-                       cpu_switch_mm(mm->pgd, mm);
-
-               /*
-                * Restore per-cpu offset before any kernel
-                * subsystem relying on it has a chance to run.
-                */
-               set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
+       if (__cpu_suspend_enter(&state)) {
+               /* Call the suspend finisher */
+               ret = fn(arg);
  
                 /*
                  * PSTATE was not saved over suspend/resume, re-enable any
@@ -124,8 +101,10 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
                  * before debug exceptions are possibly reenabled
                  * through local_dbg_restore.
                  */
-               if (hw_breakpoint_restore)
-                       hw_breakpoint_restore(NULL);
+               if (!ret)
+                       ret = -EOPNOTSUPP;
+       } else {
+               __cpu_suspend_exit();
         }
  
         unpause_graph_tracing();
@@ -140,22 +119,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
         return ret;
  }
  
-struct sleep_save_sp sleep_save_sp;
-
  static int __init cpu_suspend_init(void)
  {
-       void *ctx_ptr;
-
         /* ctx_ptr is an array of physical addresses */
-       ctx_ptr = kcalloc(mpidr_hash_size(), sizeof(phys_addr_t), GFP_KERNEL);
+       sleep_save_stash = kcalloc(mpidr_hash_size(), sizeof(*sleep_save_stash),
+                                  GFP_KERNEL);
  
-       if (WARN_ON(!ctx_ptr))
+       if (WARN_ON(!sleep_save_stash))
                 return -ENOMEM;
  
-       sleep_save_sp.save_ptr_stash = ctx_ptr;
-       sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr);
-       __flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp));
-
         return 0;
  }
  early_initcall(cpu_suspend_init);
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c

index 13339b6ffc1a07839103fca328f1eccd6d185c12..59779699a1a40ef3a1940aa0d878c16164ea5398 100644 (file)
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -52,8 +52,11 @@ unsigned long profile_pc(struct pt_regs *regs)
         frame.fp = regs->regs[29];
         frame.sp = regs->sp;
         frame.pc = regs->pc;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = -1; /* no task info */
+#endif
         do {
-               int ret = unwind_frame(&frame);
+               int ret = unwind_frame(NULL, &frame);
                 if (ret < 0)
                         return 0;
         } while (in_lock_functions(frame.pc));
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c

index ca7f0ac5f70807ca2770ba646770e7cb82f235af..a036ff290d69d4687477bfd5f4c42f385474bac2 100644 (file)
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -146,17 +146,24 @@ static void dump_instr(const char *lvl, struct pt_regs *regs)
  static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
  {
         struct stackframe frame;
+       unsigned long irq_stack_ptr;
+       int skip;
+
+       /*
+        * Switching between stacks is valid when tracing current and in
+        * non-preemptible context.
+        */
+       if (tsk == current && !preemptible())
+               irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id());
+       else
+               irq_stack_ptr = 0;
  
         pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
  
         if (!tsk)
                 tsk = current;
  
-       if (regs) {
-               frame.fp = regs->regs[29];
-               frame.sp = regs->sp;
-               frame.pc = regs->pc;
-       } else if (tsk == current) {
+       if (tsk == current) {
                 frame.fp = (unsigned long)__builtin_frame_address(0);
                 frame.sp = current_stack_pointer;
                 frame.pc = (unsigned long)dump_backtrace;
@@ -168,21 +175,49 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
                 frame.sp = thread_saved_sp(tsk);
                 frame.pc = thread_saved_pc(tsk);
         }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       frame.graph = tsk->curr_ret_stack;
+#endif
  
-       pr_emerg("Call trace:\n");
+       skip = !!regs;
+       printk("Call trace:\n");
         while (1) {
                 unsigned long where = frame.pc;
                 unsigned long stack;
                 int ret;
  
-               dump_backtrace_entry(where);
-               ret = unwind_frame(&frame);
+               /* skip until specified stack frame */
+               if (!skip) {
+                       dump_backtrace_entry(where);
+               } else if (frame.fp == regs->regs[29]) {
+                       skip = 0;
+                       /*
+                        * Mostly, this is the case where this function is
+                        * called in panic/abort. As exception handler's
+                        * stack frame does not contain the corresponding pc
+                        * at which an exception has taken place, use regs->pc
+                        * instead.
+                        */
+                       dump_backtrace_entry(regs->pc);
+               }
+               ret = unwind_frame(tsk, &frame);
                 if (ret < 0)
                         break;
                 stack = frame.sp;
-               if (in_exception_text(where))
+               if (in_exception_text(where)) {
+                       /*
+                        * If we switched to the irq_stack before calling this
+                        * exception handler, then the pt_regs will be on the
+                        * task stack. The easiest way to tell is if the large
+                        * pt_regs would overlap with the end of the irq_stack.
+                        */
+                       if (stack < irq_stack_ptr &&
+                           (stack + sizeof(struct pt_regs)) > irq_stack_ptr)
+                               stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr);
+
                         dump_mem("", "Exception stack", stack,
                                  stack + sizeof(struct pt_regs), false);
+               }
         }
  }
  
@@ -476,22 +511,22 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr)
  
  void __pte_error(const char *file, int line, unsigned long val)
  {
-       pr_crit("%s:%d: bad pte %016lx.\n", file, line, val);
+       pr_err("%s:%d: bad pte %016lx.\n", file, line, val);
  }
  
  void __pmd_error(const char *file, int line, unsigned long val)
  {
-       pr_crit("%s:%d: bad pmd %016lx.\n", file, line, val);
+       pr_err("%s:%d: bad pmd %016lx.\n", file, line, val);
  }
  
  void __pud_error(const char *file, int line, unsigned long val)
  {
-       pr_crit("%s:%d: bad pud %016lx.\n", file, line, val);
+       pr_err("%s:%d: bad pud %016lx.\n", file, line, val);
  }
  
  void __pgd_error(const char *file, int line, unsigned long val)
  {
-       pr_crit("%s:%d: bad pgd %016lx.\n", file, line, val);
+       pr_err("%s:%d: bad pgd %016lx.\n", file, line, val);
  }
  
  /* GENERIC_BUG traps */
diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S

index 60c1db54b41a251b4d006c17a696aa9d6837067b..82379a70ef03ff4296cb8d7fc04226a3d68e1392 100644 (file)
--- a/arch/arm64/kernel/vdso/vdso.S
+++ b/arch/arm64/kernel/vdso/vdso.S
@@ -21,9 +21,8 @@
  #include <linux/const.h>
  #include <asm/page.h>
  
-       __PAGE_ALIGNED_DATA
-
         .globl vdso_start, vdso_end
+       .section .rodata
         .balign PAGE_SIZE
  vdso_start:
         .incbin "arch/arm64/kernel/vdso/vdso.so"
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S

index 71426a78db123d13e98acf8659d65155ff342a06..623532f44323026b4c2dfa54015f2e5714466ae0 100644 (file)
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -46,6 +46,16 @@ jiffies = jiffies_64;
         *(.idmap.text)                                  \
         VMLINUX_SYMBOL(__idmap_text_end) = .;
  
+#ifdef CONFIG_HIBERNATION
+#define HIBERNATE_TEXT                                 \
+       . = ALIGN(SZ_4K);                               \
+       VMLINUX_SYMBOL(__hibernate_exit_text_start) = .;\
+       *(.hibernate_exit.text)                         \
+       VMLINUX_SYMBOL(__hibernate_exit_text_end) = .;
+#else
+#define HIBERNATE_TEXT
+#endif
+
  /*
   * The size of the PE/COFF section that covers the kernel image, which
   * runs from stext to _edata, must be a round multiple of the PE/COFF
@@ -63,14 +73,19 @@ PECOFF_FILE_ALIGNMENT = 0x200;
  #endif
  
  #if defined(CONFIG_DEBUG_ALIGN_RODATA)
-#define ALIGN_DEBUG_RO                 . = ALIGN(1<<SECTION_SHIFT);
-#define ALIGN_DEBUG_RO_MIN(min)                ALIGN_DEBUG_RO
-#elif defined(CONFIG_DEBUG_RODATA)
-#define ALIGN_DEBUG_RO                 . = ALIGN(1<<PAGE_SHIFT);
-#define ALIGN_DEBUG_RO_MIN(min)                ALIGN_DEBUG_RO
+/*
+ *  4 KB granule:   1 level 2 entry
+ * 16 KB granule: 128 level 3 entries, with contiguous bit
+ * 64 KB granule:  32 level 3 entries, with contiguous bit
+ */
+#define SEGMENT_ALIGN                  SZ_2M
  #else
-#define ALIGN_DEBUG_RO
-#define ALIGN_DEBUG_RO_MIN(min)                . = ALIGN(min);
+/*
+ *  4 KB granule:  16 level 3 entries, with contiguous bit
+ * 16 KB granule:   4 level 3 entries, without contiguous bit
+ * 64 KB granule:   1 level 3 entry
+ */
+#define SEGMENT_ALIGN                  SZ_64K
  #endif
  
  SECTIONS
@@ -87,40 +102,43 @@ SECTIONS
                 EXIT_CALL
                 *(.discard)
                 *(.discard.*)
+               *(.interp .dynamic)
+               *(.dynsym .dynstr .hash)
         }
  
-       . = PAGE_OFFSET + TEXT_OFFSET;
+       . = KIMAGE_VADDR + TEXT_OFFSET;
  
         .head.text : {
                 _text = .;
                 HEAD_TEXT
         }
-       ALIGN_DEBUG_RO
         .text : {                       /* Real text segment            */
                 _stext = .;             /* Text and read-only data      */
                         __exception_text_start = .;
                         *(.exception.text)
                         __exception_text_end = .;
                         IRQENTRY_TEXT
+                       ENTRY_TEXT
                         TEXT_TEXT
                         SCHED_TEXT
                         LOCK_TEXT
+                       KPROBES_TEXT
                         HYPERVISOR_TEXT
                         IDMAP_TEXT
+                       HIBERNATE_TEXT
                         *(.fixup)
                         *(.gnu.warning)
                 . = ALIGN(16);
                 *(.got)                 /* Global offset table          */
         }
  
-       ALIGN_DEBUG_RO
-       RO_DATA(PAGE_SIZE)
-       EXCEPTION_TABLE(8)
+       . = ALIGN(SEGMENT_ALIGN);
+       RO_DATA(PAGE_SIZE)              /* everything from this point to */
+       EXCEPTION_TABLE(8)              /* _etext will be marked RO NX   */
         NOTES
-       ALIGN_DEBUG_RO
-       _etext = .;                     /* End of text and rodata section */
  
-       ALIGN_DEBUG_RO_MIN(PAGE_SIZE)
+       . = ALIGN(SEGMENT_ALIGN);
+       _etext = .;                     /* End of text and rodata section */
         __init_begin = .;
  
         INIT_TEXT_SECTION(8)
@@ -128,7 +146,6 @@ SECTIONS
                 ARM_EXIT_KEEP(EXIT_TEXT)
         }
  
-       ALIGN_DEBUG_RO_MIN(16)
         .init.data : {
                 INIT_DATA
                 INIT_SETUP(16)
@@ -143,9 +160,6 @@ SECTIONS
  
         PERCPU_SECTION(L1_CACHE_BYTES)
  
-       . = ALIGN(PAGE_SIZE);
-       __init_end = .;
-
         . = ALIGN(4);
         .altinstructions : {
                 __alt_instructions = .;
@@ -155,8 +169,16 @@ SECTIONS
         .altinstr_replacement : {
                 *(.altinstr_replacement)
         }
+       .rela : ALIGN(8) {
+               *(.rela .rela*)
+       }
+
+       __rela_offset   = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR);
+       __rela_size     = SIZEOF(.rela);
+
+       . = ALIGN(SEGMENT_ALIGN);
+       __init_end = .;
  
-       . = ALIGN(PAGE_SIZE);
         _data = .;
         _sdata = .;
         RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
@@ -186,8 +208,12 @@ ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
         "HYP init code too big or misaligned")
  ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
         "ID map text too big or misaligned")
+#ifdef CONFIG_HIBERNATION
+ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
+       <= SZ_4K, "Hibernate exit text too big or misaligned")
+#endif
  
  /*
   * If padding is applied before .head.text, virt<->phys conversions will fail.
   */
-ASSERT(_text == (PAGE_OFFSET + TEXT_OFFSET), "HEAD is misaligned")
+ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned")
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile

index 1949fe5f54246a3f14983753570f4e4d0f978e76..caee9ee8e12af1eef4c13ebd1f750d7a3b5ee7f2 100644 (file)
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -10,6 +10,7 @@ KVM=../../../virt/kvm
  ARM=../../../arch/arm/kvm
  
  obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp/
  
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
@@ -22,8 +23,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generi
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v2-switch.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-v3-switch.o
  kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c

index 3039f080e2d5820ed4446aac0871dee42ffa5ad2..e5ee8880d5d9b2014ea6859ea455daa33beb2ebd 100644 (file)
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -28,7 +28,6 @@
  #include <asm/cputype.h>
  #include <asm/uaccess.h>
  #include <asm/kvm.h>
-#include <asm/kvm_asm.h>
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_coproc.h>
  
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c

index 15f0477b0d2adc53d86573b1733d2fa7f368bbd9..25006a7a5316924d472f0d676b933e8007ca8dfe 100644 (file)
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -23,6 +23,7 @@
  #include <linux/kvm_host.h>
  
  #include <asm/esr.h>
+#include <asm/kvm_asm.h>
  #include <asm/kvm_coproc.h>
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_mmu.h>
@@ -182,6 +183,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                 exit_handler = kvm_get_exit_handler(vcpu);
  
                 return exit_handler(vcpu, run);
+       case ARM_EXCEPTION_HYP_GONE:
+               /*
+                * EL2 has been reset to the hyp-stub. This happens when a guest
+                * is pre-empted by kvm_reboot()'s shutdown call.
+                */
+               run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               return 0;
         default:
                 kvm_pr_unimpl("Unsupported exception type: %d",
                               exception_index);
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S

index 84c338f017b2121b0000484bbca981cbabc49abb..d87635e678b7edf2ea18a6a038d2db1dba802d8c 100644 (file)
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -21,6 +21,7 @@
  #include <asm/kvm_arm.h>
  #include <asm/kvm_mmu.h>
  #include <asm/pgtable-hwdef.h>
+#include <asm/sysreg.h>
  
         .text
         .pushsection    .hyp.idmap.text, "ax"
@@ -96,6 +97,14 @@ __do_hyp_init:
  
         ldr     x4, =VTCR_EL2_FLAGS
         bfi     x4, x5, #16, #3
+       /*
+        * Copy bit 5 of ID_AA64MMFR1_EL1 (from the VMIDBits field) into the VS
+        * bit of VTCR_EL2.
+        */
+       mrs     x5, ID_AA64MMFR1_EL1
+       ubfx    x5, x5, #5, #1
+       lsl     x5, x5, #VTCR_EL2_VS
+       orr     x4, x4, x5
  
         msr     vtcr_el2, x4
  
@@ -108,8 +117,8 @@ __do_hyp_init:
         dsb     sy
  
         mrs     x4, sctlr_el2
-       and     x4, x4, #SCTLR_EL2_EE   // preserve endianness of EL2
-       ldr     x5, =SCTLR_EL2_FLAGS
+       and     x4, x4, #SCTLR_ELx_EE   // preserve endianness of EL2
+       ldr     x5, =SCTLR_ELx_FLAGS
         orr     x4, x4, x5
         msr     sctlr_el2, x4
         isb
@@ -143,6 +152,44 @@ merged:
         eret
  ENDPROC(__kvm_hyp_init)
  
+       /*
+        * x0: HYP boot pgd (installed into TTBR0_EL2 below)
+        * x1: HYP phys_idmap_start (page base of the idmap branch target)
+        */
+ENTRY(__kvm_hyp_reset)
+       /* We're in trampoline code in VA, switch back to boot page tables */
+       msr     ttbr0_el2, x0
+       isb
+
+       /* Ensure the PA branch doesn't find a stale TLB entry or stale code. */
+       ic      iallu
+       tlbi    alle2
+       dsb     sy
+       isb
+
+       /* Branch into PA space: x1 page base + page offset of label 1 */
+       adr     x0, 1f
+       bfi     x1, x0, #0, #PAGE_SHIFT
+       br      x1
+
+       /* We're now in idmap, disable MMU */
+1:     mrs     x0, sctlr_el2
+       ldr     x1, =SCTLR_ELx_FLAGS
+       bic     x0, x0, x1              // Clear SCTLR_ELx_FLAGS (MMU etc.)
+       msr     sctlr_el2, x0
+       isb
+
+       /* Invalidate the old TLBs */
+       tlbi    alle2
+       dsb     sy
+
+       /* Install stub vectors */
+       adr_l   x0, __hyp_stub_vectors
+       msr     vbar_el2, x0
+
+       eret
+ENDPROC(__kvm_hyp_reset)
+
         .ltorg
  
         .popsection
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S

index 86c289832272d71ba48786414bb6e4ecb9b9cb14..7ce9315651518cf88e47d4a08670b9088bfbcfa1 100644 (file)
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -18,912 +18,11 @@
  #include <linux/linkage.h>
  
  #include <asm/alternative.h>
-#include <asm/asm-offsets.h>
  #include <asm/assembler.h>
  #include <asm/cpufeature.h>
-#include <asm/debug-monitors.h>
-#include <asm/esr.h>
-#include <asm/fpsimdmacros.h>
-#include <asm/kvm.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/memory.h>
-
-#define CPU_GP_REG_OFFSET(x)   (CPU_GP_REGS + x)
-#define CPU_XREG_OFFSET(x)     CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
-#define CPU_SPSR_OFFSET(x)     CPU_GP_REG_OFFSET(CPU_SPSR + 8*x)
-#define CPU_SYSREG_OFFSET(x)   (CPU_SYSREGS + 8*x)
-
-       .text
-       .pushsection    .hyp.text, "ax"
-       .align  PAGE_SHIFT
-
-.macro save_common_regs
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       add     x3, x2, #CPU_XREG_OFFSET(19)
-       stp     x19, x20, [x3]
-       stp     x21, x22, [x3, #16]
-       stp     x23, x24, [x3, #32]
-       stp     x25, x26, [x3, #48]
-       stp     x27, x28, [x3, #64]
-       stp     x29, lr, [x3, #80]
-
-       mrs     x19, sp_el0
-       mrs     x20, elr_el2            // pc before entering el2
-       mrs     x21, spsr_el2           // pstate before entering el2
-
-       stp     x19, x20, [x3, #96]
-       str     x21, [x3, #112]
-
-       mrs     x22, sp_el1
-       mrs     x23, elr_el1
-       mrs     x24, spsr_el1
-
-       str     x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
-       str     x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
-       str     x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-.endm
-
-.macro restore_common_regs
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       ldr     x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
-       ldr     x23, [x2, #CPU_GP_REG_OFFSET(CPU_ELR_EL1)]
-       ldr     x24, [x2, #CPU_SPSR_OFFSET(KVM_SPSR_EL1)]
-
-       msr     sp_el1, x22
-       msr     elr_el1, x23
-       msr     spsr_el1, x24
-
-       add     x3, x2, #CPU_XREG_OFFSET(31)    // SP_EL0
-       ldp     x19, x20, [x3]
-       ldr     x21, [x3, #16]
-
-       msr     sp_el0, x19
-       msr     elr_el2, x20            // pc on return from el2
-       msr     spsr_el2, x21           // pstate on return from el2
-
-       add     x3, x2, #CPU_XREG_OFFSET(19)
-       ldp     x19, x20, [x3]
-       ldp     x21, x22, [x3, #16]
-       ldp     x23, x24, [x3, #32]
-       ldp     x25, x26, [x3, #48]
-       ldp     x27, x28, [x3, #64]
-       ldp     x29, lr, [x3, #80]
-.endm
-
-.macro save_host_regs
-       save_common_regs
-.endm
-
-.macro restore_host_regs
-       restore_common_regs
-.endm
-
-.macro save_fpsimd
-       // x2: cpu context address
-       // x3, x4: tmp regs
-       add     x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-       fpsimd_save x3, 4
-.endm
-
-.macro restore_fpsimd
-       // x2: cpu context address
-       // x3, x4: tmp regs
-       add     x3, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
-       fpsimd_restore x3, 4
-.endm
-
-.macro save_guest_regs
-       // x0 is the vcpu address
-       // x1 is the return code, do not corrupt!
-       // x2 is the cpu context
-       // x3 is a tmp register
-       // Guest's x0-x3 are on the stack
-
-       // Compute base to save registers
-       add     x3, x2, #CPU_XREG_OFFSET(4)
-       stp     x4, x5, [x3]
-       stp     x6, x7, [x3, #16]
-       stp     x8, x9, [x3, #32]
-       stp     x10, x11, [x3, #48]
-       stp     x12, x13, [x3, #64]
-       stp     x14, x15, [x3, #80]
-       stp     x16, x17, [x3, #96]
-       str     x18, [x3, #112]
-
-       pop     x6, x7                  // x2, x3
-       pop     x4, x5                  // x0, x1
-
-       add     x3, x2, #CPU_XREG_OFFSET(0)
-       stp     x4, x5, [x3]
-       stp     x6, x7, [x3, #16]
-
-       save_common_regs
-.endm
-
-.macro restore_guest_regs
-       // x0 is the vcpu address.
-       // x2 is the cpu context
-       // x3 is a tmp register
-
-       // Prepare x0-x3 for later restore
-       add     x3, x2, #CPU_XREG_OFFSET(0)
-       ldp     x4, x5, [x3]
-       ldp     x6, x7, [x3, #16]
-       push    x4, x5          // Push x0-x3 on the stack
-       push    x6, x7
-
-       // x4-x18
-       ldp     x4, x5, [x3, #32]
-       ldp     x6, x7, [x3, #48]
-       ldp     x8, x9, [x3, #64]
-       ldp     x10, x11, [x3, #80]
-       ldp     x12, x13, [x3, #96]
-       ldp     x14, x15, [x3, #112]
-       ldp     x16, x17, [x3, #128]
-       ldr     x18, [x3, #144]
-
-       // x19-x29, lr, sp*, elr*, spsr*
-       restore_common_regs
-
-       // Last bits of the 64bit state
-       pop     x2, x3
-       pop     x0, x1
-
-       // Do not touch any register after this!
-.endm
-
-/*
- * Macros to perform system register save/restore.
- *
- * Ordering here is absolutely critical, and must be kept consistent
- * in {save,restore}_sysregs, {save,restore}_guest_32bit_state,
- * and in kvm_asm.h.
- *
- * In other words, don't touch any of these unless you know what
- * you are doing.
- */
-.macro save_sysregs
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1)
-
-       mrs     x4,     vmpidr_el2
-       mrs     x5,     csselr_el1
-       mrs     x6,     sctlr_el1
-       mrs     x7,     actlr_el1
-       mrs     x8,     cpacr_el1
-       mrs     x9,     ttbr0_el1
-       mrs     x10,    ttbr1_el1
-       mrs     x11,    tcr_el1
-       mrs     x12,    esr_el1
-       mrs     x13,    afsr0_el1
-       mrs     x14,    afsr1_el1
-       mrs     x15,    far_el1
-       mrs     x16,    mair_el1
-       mrs     x17,    vbar_el1
-       mrs     x18,    contextidr_el1
-       mrs     x19,    tpidr_el0
-       mrs     x20,    tpidrro_el0
-       mrs     x21,    tpidr_el1
-       mrs     x22,    amair_el1
-       mrs     x23,    cntkctl_el1
-       mrs     x24,    par_el1
-       mrs     x25,    mdscr_el1
-
-       stp     x4, x5, [x3]
-       stp     x6, x7, [x3, #16]
-       stp     x8, x9, [x3, #32]
-       stp     x10, x11, [x3, #48]
-       stp     x12, x13, [x3, #64]
-       stp     x14, x15, [x3, #80]
-       stp     x16, x17, [x3, #96]
-       stp     x18, x19, [x3, #112]
-       stp     x20, x21, [x3, #128]
-       stp     x22, x23, [x3, #144]
-       stp     x24, x25, [x3, #160]
-.endm
-
-.macro save_debug type
-       // x4: pointer to register set
-       // x5: number of registers to skip
-       // x6..x22 trashed
-
-       adr     x22, 1f
-       add     x22, x22, x5, lsl #2
-       br      x22
-1:
-       mrs     x21, \type\()15_el1
-       mrs     x20, \type\()14_el1
-       mrs     x19, \type\()13_el1
-       mrs     x18, \type\()12_el1
-       mrs     x17, \type\()11_el1
-       mrs     x16, \type\()10_el1
-       mrs     x15, \type\()9_el1
-       mrs     x14, \type\()8_el1
-       mrs     x13, \type\()7_el1
-       mrs     x12, \type\()6_el1
-       mrs     x11, \type\()5_el1
-       mrs     x10, \type\()4_el1
-       mrs     x9, \type\()3_el1
-       mrs     x8, \type\()2_el1
-       mrs     x7, \type\()1_el1
-       mrs     x6, \type\()0_el1
-
-       adr     x22, 1f
-       add     x22, x22, x5, lsl #2
-       br      x22
-1:
-       str     x21, [x4, #(15 * 8)]
-       str     x20, [x4, #(14 * 8)]
-       str     x19, [x4, #(13 * 8)]
-       str     x18, [x4, #(12 * 8)]
-       str     x17, [x4, #(11 * 8)]
-       str     x16, [x4, #(10 * 8)]
-       str     x15, [x4, #(9 * 8)]
-       str     x14, [x4, #(8 * 8)]
-       str     x13, [x4, #(7 * 8)]
-       str     x12, [x4, #(6 * 8)]
-       str     x11, [x4, #(5 * 8)]
-       str     x10, [x4, #(4 * 8)]
-       str     x9, [x4, #(3 * 8)]
-       str     x8, [x4, #(2 * 8)]
-       str     x7, [x4, #(1 * 8)]
-       str     x6, [x4, #(0 * 8)]
-.endm
-
-.macro restore_sysregs
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(MPIDR_EL1)
-
-       ldp     x4, x5, [x3]
-       ldp     x6, x7, [x3, #16]
-       ldp     x8, x9, [x3, #32]
-       ldp     x10, x11, [x3, #48]
-       ldp     x12, x13, [x3, #64]
-       ldp     x14, x15, [x3, #80]
-       ldp     x16, x17, [x3, #96]
-       ldp     x18, x19, [x3, #112]
-       ldp     x20, x21, [x3, #128]
-       ldp     x22, x23, [x3, #144]
-       ldp     x24, x25, [x3, #160]
-
-       msr     vmpidr_el2,     x4
-       msr     csselr_el1,     x5
-       msr     sctlr_el1,      x6
-       msr     actlr_el1,      x7
-       msr     cpacr_el1,      x8
-       msr     ttbr0_el1,      x9
-       msr     ttbr1_el1,      x10
-       msr     tcr_el1,        x11
-       msr     esr_el1,        x12
-       msr     afsr0_el1,      x13
-       msr     afsr1_el1,      x14
-       msr     far_el1,        x15
-       msr     mair_el1,       x16
-       msr     vbar_el1,       x17
-       msr     contextidr_el1, x18
-       msr     tpidr_el0,      x19
-       msr     tpidrro_el0,    x20
-       msr     tpidr_el1,      x21
-       msr     amair_el1,      x22
-       msr     cntkctl_el1,    x23
-       msr     par_el1,        x24
-       msr     mdscr_el1,      x25
-.endm
-
-.macro restore_debug type
-       // x4: pointer to register set
-       // x5: number of registers to skip
-       // x6..x22 trashed
-
-       adr     x22, 1f
-       add     x22, x22, x5, lsl #2
-       br      x22
-1:
-       ldr     x21, [x4, #(15 * 8)]
-       ldr     x20, [x4, #(14 * 8)]
-       ldr     x19, [x4, #(13 * 8)]
-       ldr     x18, [x4, #(12 * 8)]
-       ldr     x17, [x4, #(11 * 8)]
-       ldr     x16, [x4, #(10 * 8)]
-       ldr     x15, [x4, #(9 * 8)]
-       ldr     x14, [x4, #(8 * 8)]
-       ldr     x13, [x4, #(7 * 8)]
-       ldr     x12, [x4, #(6 * 8)]
-       ldr     x11, [x4, #(5 * 8)]
-       ldr     x10, [x4, #(4 * 8)]
-       ldr     x9, [x4, #(3 * 8)]
-       ldr     x8, [x4, #(2 * 8)]
-       ldr     x7, [x4, #(1 * 8)]
-       ldr     x6, [x4, #(0 * 8)]
-
-       adr     x22, 1f
-       add     x22, x22, x5, lsl #2
-       br      x22
-1:
-       msr     \type\()15_el1, x21
-       msr     \type\()14_el1, x20
-       msr     \type\()13_el1, x19
-       msr     \type\()12_el1, x18
-       msr     \type\()11_el1, x17
-       msr     \type\()10_el1, x16
-       msr     \type\()9_el1, x15
-       msr     \type\()8_el1, x14
-       msr     \type\()7_el1, x13
-       msr     \type\()6_el1, x12
-       msr     \type\()5_el1, x11
-       msr     \type\()4_el1, x10
-       msr     \type\()3_el1, x9
-       msr     \type\()2_el1, x8
-       msr     \type\()1_el1, x7
-       msr     \type\()0_el1, x6
-.endm
-
-.macro skip_32bit_state tmp, target
-       // Skip 32bit state if not needed
-       mrs     \tmp, hcr_el2
-       tbnz    \tmp, #HCR_RW_SHIFT, \target
-.endm
-
-.macro skip_tee_state tmp, target
-       // Skip ThumbEE state if not needed
-       mrs     \tmp, id_pfr0_el1
-       tbz     \tmp, #12, \target
-.endm
-
-.macro skip_debug_state tmp, target
-       ldr     \tmp, [x0, #VCPU_DEBUG_FLAGS]
-       tbz     \tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
-.endm
-
-/*
- * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
- */
-.macro skip_fpsimd_state tmp, target
-       mrs     \tmp, cptr_el2
-       tbnz    \tmp, #CPTR_EL2_TFP_SHIFT, \target
-.endm
-
-.macro compute_debug_state target
-       // Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
-       // is set, we do a full save/restore cycle and disable trapping.
-       add     x25, x0, #VCPU_CONTEXT
-
-       // Check the state of MDSCR_EL1
-       ldr     x25, [x25, #CPU_SYSREG_OFFSET(MDSCR_EL1)]
-       and     x26, x25, #DBG_MDSCR_KDE
-       and     x25, x25, #DBG_MDSCR_MDE
-       adds    xzr, x25, x26
-       b.eq    9998f           // Nothing to see there
-
-       // If any interesting bits was set, we must set the flag
-       mov     x26, #KVM_ARM64_DEBUG_DIRTY
-       str     x26, [x0, #VCPU_DEBUG_FLAGS]
-       b       9999f           // Don't skip restore
-
-9998:
-       // Otherwise load the flags from memory in case we recently
-       // trapped
-       skip_debug_state x25, \target
-9999:
-.endm
-
-.macro save_guest_32bit_state
-       skip_32bit_state x3, 1f
-
-       add     x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT)
-       mrs     x4, spsr_abt
-       mrs     x5, spsr_und
-       mrs     x6, spsr_irq
-       mrs     x7, spsr_fiq
-       stp     x4, x5, [x3]
-       stp     x6, x7, [x3, #16]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
-       mrs     x4, dacr32_el2
-       mrs     x5, ifsr32_el2
-       stp     x4, x5, [x3]
-
-       skip_fpsimd_state x8, 2f
-       mrs     x6, fpexc32_el2
-       str     x6, [x3, #16]
-2:
-       skip_debug_state x8, 1f
-       mrs     x7, dbgvcr32_el2
-       str     x7, [x3, #24]
-1:
-.endm
-
-.macro restore_guest_32bit_state
-       skip_32bit_state x3, 1f
-
-       add     x3, x2, #CPU_SPSR_OFFSET(KVM_SPSR_ABT)
-       ldp     x4, x5, [x3]
-       ldp     x6, x7, [x3, #16]
-       msr     spsr_abt, x4
-       msr     spsr_und, x5
-       msr     spsr_irq, x6
-       msr     spsr_fiq, x7
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
-       ldp     x4, x5, [x3]
-       msr     dacr32_el2, x4
-       msr     ifsr32_el2, x5
-
-       skip_debug_state x8, 1f
-       ldr     x7, [x3, #24]
-       msr     dbgvcr32_el2, x7
-1:
-.endm
-
-.macro activate_traps
-       ldr     x2, [x0, #VCPU_HCR_EL2]
-
-       /*
-        * We are about to set CPTR_EL2.TFP to trap all floating point
-        * register accesses to EL2, however, the ARM ARM clearly states that
-        * traps are only taken to EL2 if the operation would not otherwise
-        * trap to EL1.  Therefore, always make sure that for 32-bit guests,
-        * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
-        */
-       tbnz    x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
-       mov     x3, #(1 << 30)
-       msr     fpexc32_el2, x3
-       isb
-99:
-       msr     hcr_el2, x2
-       mov     x2, #CPTR_EL2_TTA
-       orr     x2, x2, #CPTR_EL2_TFP
-       msr     cptr_el2, x2
-
-       mov     x2, #(1 << 15)  // Trap CP15 Cr=15
-       msr     hstr_el2, x2
-
-       // Monitor Debug Config - see kvm_arm_setup_debug()
-       ldr     x2, [x0, #VCPU_MDCR_EL2]
-       msr     mdcr_el2, x2
-.endm
-
-.macro deactivate_traps
-       mov     x2, #HCR_RW
-       msr     hcr_el2, x2
-       msr     hstr_el2, xzr
-
-       mrs     x2, mdcr_el2
-       and     x2, x2, #MDCR_EL2_HPMN_MASK
-       msr     mdcr_el2, x2
-.endm
-
-.macro activate_vm
-       ldr     x1, [x0, #VCPU_KVM]
-       kern_hyp_va     x1
-       ldr     x2, [x1, #KVM_VTTBR]
-       msr     vttbr_el2, x2
-.endm
-
-.macro deactivate_vm
-       msr     vttbr_el2, xzr
-.endm
-
-/*
- * Call into the vgic backend for state saving
- */
-.macro save_vgic_state
-alternative_if_not ARM64_HAS_SYSREG_GIC_CPUIF
-       bl      __save_vgic_v2_state
-alternative_else
-       bl      __save_vgic_v3_state
-alternative_endif
-       mrs     x24, hcr_el2
-       mov     x25, #HCR_INT_OVERRIDE
-       neg     x25, x25
-       and     x24, x24, x25
-       msr     hcr_el2, x24
-.endm
-
-/*
- * Call into the vgic backend for state restoring
- */
-.macro restore_vgic_state
-       mrs     x24, hcr_el2
-       ldr     x25, [x0, #VCPU_IRQ_LINES]
-       orr     x24, x24, #HCR_INT_OVERRIDE
-       orr     x24, x24, x25
-       msr     hcr_el2, x24
-alternative_if_not ARM64_HAS_SYSREG_GIC_CPUIF
-       bl      __restore_vgic_v2_state
-alternative_else
-       bl      __restore_vgic_v3_state
-alternative_endif
-.endm
-
-.macro save_timer_state
-       // x0: vcpu pointer
-       ldr     x2, [x0, #VCPU_KVM]
-       kern_hyp_va x2
-       ldr     w3, [x2, #KVM_TIMER_ENABLED]
-       cbz     w3, 1f
-
-       mrs     x3, cntv_ctl_el0
-       and     x3, x3, #3
-       str     w3, [x0, #VCPU_TIMER_CNTV_CTL]
-
-       isb
-
-       mrs     x3, cntv_cval_el0
-       str     x3, [x0, #VCPU_TIMER_CNTV_CVAL]
-
-1:
-       // Disable the virtual timer
-       msr     cntv_ctl_el0, xzr
-
-       // Allow physical timer/counter access for the host
-       mrs     x2, cnthctl_el2
-       orr     x2, x2, #3
-       msr     cnthctl_el2, x2
-
-       // Clear cntvoff for the host
-       msr     cntvoff_el2, xzr
-.endm
-
-.macro restore_timer_state
-       // x0: vcpu pointer
-       // Disallow physical timer access for the guest
-       // Physical counter access is allowed
-       mrs     x2, cnthctl_el2
-       orr     x2, x2, #1
-       bic     x2, x2, #2
-       msr     cnthctl_el2, x2
-
-       ldr     x2, [x0, #VCPU_KVM]
-       kern_hyp_va x2
-       ldr     w3, [x2, #KVM_TIMER_ENABLED]
-       cbz     w3, 1f
-
-       ldr     x3, [x2, #KVM_TIMER_CNTVOFF]
-       msr     cntvoff_el2, x3
-       ldr     x2, [x0, #VCPU_TIMER_CNTV_CVAL]
-       msr     cntv_cval_el0, x2
-       isb
-
-       ldr     w2, [x0, #VCPU_TIMER_CNTV_CTL]
-       and     x2, x2, #3
-       msr     cntv_ctl_el0, x2
-1:
-.endm
-
-__save_sysregs:
-       save_sysregs
-       ret
-
-__restore_sysregs:
-       restore_sysregs
-       ret
-
-/* Save debug state */
-__save_debug:
-       // x2: ptr to CPU context
-       // x3: ptr to debug reg struct
-       // x4/x5/x6-22/x24-26: trashed
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       mov     x5, x24
-       add     x4, x3, #DEBUG_BCR
-       save_debug dbgbcr
-       add     x4, x3, #DEBUG_BVR
-       save_debug dbgbvr
-
-       mov     x5, x25
-       add     x4, x3, #DEBUG_WCR
-       save_debug dbgwcr
-       add     x4, x3, #DEBUG_WVR
-       save_debug dbgwvr
-
-       mrs     x21, mdccint_el1
-       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-       ret
-
-/* Restore debug state */
-__restore_debug:
-       // x2: ptr to CPU context
-       // x3: ptr to debug reg struct
-       // x4/x5/x6-22/x24-26: trashed
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       mov     x5, x24
-       add     x4, x3, #DEBUG_BCR
-       restore_debug dbgbcr
-       add     x4, x3, #DEBUG_BVR
-       restore_debug dbgbvr
-
-       mov     x5, x25
-       add     x4, x3, #DEBUG_WCR
-       restore_debug dbgwcr
-       add     x4, x3, #DEBUG_WVR
-       restore_debug dbgwvr
-
-       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-       msr     mdccint_el1, x21
-
-       ret
-
-__save_fpsimd:
-       skip_fpsimd_state x3, 1f
-       save_fpsimd
-1:     ret
-
-__restore_fpsimd:
-       skip_fpsimd_state x3, 1f
-       restore_fpsimd
-1:     ret
-
-switch_to_guest_fpsimd:
-       push    x4, lr
-
-       mrs     x2, cptr_el2
-       bic     x2, x2, #CPTR_EL2_TFP
-       msr     cptr_el2, x2
-       isb
-
-       mrs     x0, tpidr_el2
-
-       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
-       kern_hyp_va x2
-       bl __save_fpsimd
-
-       add     x2, x0, #VCPU_CONTEXT
-       bl __restore_fpsimd
-
-       skip_32bit_state x3, 1f
-       ldr     x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
-       msr     fpexc32_el2, x4
-1:
-       pop     x4, lr
-       pop     x2, x3
-       pop     x0, x1
-
-       eret
-
-/*
- * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
- *
- * This is the world switch. The first half of the function
- * deals with entering the guest, and anything from __kvm_vcpu_return
- * to the end of the function deals with reentering the host.
- * On the enter path, only x0 (vcpu pointer) must be preserved until
- * the last moment. On the exit path, x0 (vcpu pointer) and x1 (exception
- * code) must both be preserved until the epilogue.
- * In both cases, x2 points to the CPU context we're saving/restoring from/to.
- */
-ENTRY(__kvm_vcpu_run)
-       kern_hyp_va     x0
-       msr     tpidr_el2, x0   // Save the vcpu register
-
-       // Host context
-       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
-       kern_hyp_va x2
-
-       save_host_regs
-       bl __save_sysregs
-
-       compute_debug_state 1f
-       add     x3, x0, #VCPU_HOST_DEBUG_STATE
-       bl      __save_debug
-1:
-       activate_traps
-       activate_vm
-
-       restore_vgic_state
-       restore_timer_state
-
-       // Guest context
-       add     x2, x0, #VCPU_CONTEXT
-
-       // We must restore the 32-bit state before the sysregs, thanks
-       // to Cortex-A57 erratum #852523.
-       restore_guest_32bit_state
-       bl __restore_sysregs
-
-       skip_debug_state x3, 1f
-       ldr     x3, [x0, #VCPU_DEBUG_PTR]
-       kern_hyp_va x3
-       bl      __restore_debug
-1:
-       restore_guest_regs
-
-       // That's it, no more messing around.
-       eret
-
-__kvm_vcpu_return:
-       // Assume x0 is the vcpu pointer, x1 the return code
-       // Guest's x0-x3 are on the stack
-
-       // Guest context
-       add     x2, x0, #VCPU_CONTEXT
-
-       save_guest_regs
-       bl __save_fpsimd
-       bl __save_sysregs
-
-       skip_debug_state x3, 1f
-       ldr     x3, [x0, #VCPU_DEBUG_PTR]
-       kern_hyp_va x3
-       bl      __save_debug
-1:
-       save_guest_32bit_state
-
-       save_timer_state
-       save_vgic_state
-
-       deactivate_traps
-       deactivate_vm
-
-       // Host context
-       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
-       kern_hyp_va x2
-
-       bl __restore_sysregs
-       bl __restore_fpsimd
-       /* Clear FPSIMD and Trace trapping */
-       msr     cptr_el2, xzr
-
-       skip_debug_state x3, 1f
-       // Clear the dirty flag for the next run, as all the state has
-       // already been saved. Note that we nuke the whole 64bit word.
-       // If we ever add more flags, we'll have to be more careful...
-       str     xzr, [x0, #VCPU_DEBUG_FLAGS]
-       add     x3, x0, #VCPU_HOST_DEBUG_STATE
-       bl      __restore_debug
-1:
-       restore_host_regs
-
-       mov     x0, x1
-       ret
-END(__kvm_vcpu_run)
-
-// void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
-ENTRY(__kvm_tlb_flush_vmid_ipa)
-       dsb     ishst
-
-       kern_hyp_va     x0
-       ldr     x2, [x0, #KVM_VTTBR]
-       msr     vttbr_el2, x2
-       isb
-
-       /*
-        * We could do so much better if we had the VA as well.
-        * Instead, we invalidate Stage-2 for this IPA, and the
-        * whole of Stage-1. Weep...
-        */
-       lsr     x1, x1, #12
-       tlbi    ipas2e1is, x1
-       /*
-        * We have to ensure completion of the invalidation at Stage-2,
-        * since a table walk on another CPU could refill a TLB with a
-        * complete (S1 + S2) walk based on the old Stage-2 mapping if
-        * the Stage-1 invalidation happened first.
-        */
-       dsb     ish
-       tlbi    vmalle1is
-       dsb     ish
-       isb
-
-       msr     vttbr_el2, xzr
-       ret
-ENDPROC(__kvm_tlb_flush_vmid_ipa)
-
-/**
- * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
- * @struct kvm *kvm - pointer to kvm structure
- *
- * Invalidates all Stage 1 and 2 TLB entries for current VMID.
- */
-ENTRY(__kvm_tlb_flush_vmid)
-       dsb     ishst
-
-       kern_hyp_va     x0
-       ldr     x2, [x0, #KVM_VTTBR]
-       msr     vttbr_el2, x2
-       isb
-
-       tlbi    vmalls12e1is
-       dsb     ish
-       isb
-
-       msr     vttbr_el2, xzr
-       ret
-ENDPROC(__kvm_tlb_flush_vmid)
-
-ENTRY(__kvm_flush_vm_context)
-       dsb     ishst
-       tlbi    alle1is
-       ic      ialluis
-       dsb     ish
-       ret
-ENDPROC(__kvm_flush_vm_context)
-
-__kvm_hyp_panic:
-       // Stash PAR_EL1 before corrupting it in __restore_sysregs
-       mrs     x0, par_el1
-       push    x0, xzr
-
-       // Guess the context by looking at VTTBR:
-       // If zero, then we're already a host.
-       // Otherwise restore a minimal host context before panicing.
-       mrs     x0, vttbr_el2
-       cbz     x0, 1f
-
-       mrs     x0, tpidr_el2
-
-       deactivate_traps
-       deactivate_vm
-
-       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
-       kern_hyp_va x2
-
-       bl __restore_sysregs
-
-       /*
-        * Make sure we have a valid host stack, and don't leave junk in the
-        * frame pointer that will give us a misleading host stack unwinding.
-        */
-       ldr     x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
-       msr     sp_el1, x22
-       mov     x29, xzr
-
-1:     adr     x0, __hyp_panic_str
-       adr     x1, 2f
-       ldp     x2, x3, [x1]
-       sub     x0, x0, x2
-       add     x0, x0, x3
-       mrs     x1, spsr_el2
-       mrs     x2, elr_el2
-       mrs     x3, esr_el2
-       mrs     x4, far_el2
-       mrs     x5, hpfar_el2
-       pop     x6, xzr         // active context PAR_EL1
-       mrs     x7, tpidr_el2
-
-       mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
-                     PSR_MODE_EL1h)
-       msr     spsr_el2, lr
-       ldr     lr, =panic
-       msr     elr_el2, lr
-       eret
-
-       .align  3
-2:     .quad   HYP_PAGE_OFFSET
-       .quad   PAGE_OFFSET
-ENDPROC(__kvm_hyp_panic)
-
-__hyp_panic_str:
-       .ascii  "HYP panic:\nPS:%08x PC:%016x ESR:%08x\nFAR:%016x HPFAR:%016x PAR:%016x\nVCPU:%p\n\0"
-
-       .align  2
  
  /*
- * u64 kvm_call_hyp(void *hypfn, ...);
+ * u64 __kvm_call_hyp(void *hypfn, ...);
   *
   * This is not really a variadic function in the classic C-way and care must
   * be taken when calling this to ensure parameters are passed in registers
@@ -934,189 +33,23 @@ __hyp_panic_str:
   * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
   * function pointer can be passed).  The function being called must be mapped
   * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c).  Return values are
- * passed in r0 and r1.
+ * passed in x0.
   *
- * A function pointer with a value of 0 has a special meaning, and is
- * used to implement __hyp_get_vectors in the same way as in
+ * A function pointer with a value less than 0xfff has a special meaning,
+ * and is used to implement __hyp_get_vectors in the same way as in
   * arch/arm64/kernel/hyp_stub.S.
+ * HVC behaves as a 'bl' call and will clobber lr.
   */
-ENTRY(kvm_call_hyp)
+ENTRY(__kvm_call_hyp)
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+       str     lr, [sp, #-16]!
         hvc     #0
+       ldr     lr, [sp], #16
         ret
-ENDPROC(kvm_call_hyp)
-
-.macro invalid_vector  label, target
-       .align  2
-\label:
-       b \target
-ENDPROC(\label)
-.endm
-
-       /* None of these should ever happen */
-       invalid_vector  el2t_sync_invalid, __kvm_hyp_panic
-       invalid_vector  el2t_irq_invalid, __kvm_hyp_panic
-       invalid_vector  el2t_fiq_invalid, __kvm_hyp_panic
-       invalid_vector  el2t_error_invalid, __kvm_hyp_panic
-       invalid_vector  el2h_sync_invalid, __kvm_hyp_panic
-       invalid_vector  el2h_irq_invalid, __kvm_hyp_panic
-       invalid_vector  el2h_fiq_invalid, __kvm_hyp_panic
-       invalid_vector  el2h_error_invalid, __kvm_hyp_panic
-       invalid_vector  el1_sync_invalid, __kvm_hyp_panic
-       invalid_vector  el1_irq_invalid, __kvm_hyp_panic
-       invalid_vector  el1_fiq_invalid, __kvm_hyp_panic
-       invalid_vector  el1_error_invalid, __kvm_hyp_panic
-
-el1_sync:                                      // Guest trapped into EL2
-       push    x0, x1
-       push    x2, x3
-
-       mrs     x1, esr_el2
-       lsr     x2, x1, #ESR_ELx_EC_SHIFT
-
-       cmp     x2, #ESR_ELx_EC_HVC64
-       b.ne    el1_trap
-
-       mrs     x3, vttbr_el2                   // If vttbr is valid, the 64bit guest
-       cbnz    x3, el1_trap                    // called HVC
-
-       /* Here, we're pretty sure the host called HVC. */
-       pop     x2, x3
-       pop     x0, x1
-
-       /* Check for __hyp_get_vectors */
-       cbnz    x0, 1f
-       mrs     x0, vbar_el2
-       b       2f
-
-1:     push    lr, xzr
-
-       /*
-        * Compute the function address in EL2, and shuffle the parameters.
-        */
-       kern_hyp_va     x0
-       mov     lr, x0
-       mov     x0, x1
-       mov     x1, x2
-       mov     x2, x3
-       blr     lr
-
-       pop     lr, xzr
-2:     eret
-
-el1_trap:
-       /*
-        * x1: ESR
-        * x2: ESR_EC
-        */
-
-       /* Guest accessed VFP/SIMD registers, save host, restore Guest */
-       cmp     x2, #ESR_ELx_EC_FP_ASIMD
-       b.eq    switch_to_guest_fpsimd
-
-       cmp     x2, #ESR_ELx_EC_DABT_LOW
-       mov     x0, #ESR_ELx_EC_IABT_LOW
-       ccmp    x2, x0, #4, ne
-       b.ne    1f              // Not an abort we care about
-
-       /* This is an abort. Check for permission fault */
-alternative_if_not ARM64_WORKAROUND_834220
-       and     x2, x1, #ESR_ELx_FSC_TYPE
-       cmp     x2, #FSC_PERM
-       b.ne    1f              // Not a permission fault
  alternative_else
-       nop                     // Use the permission fault path to
-       nop                     // check for a valid S1 translation,
-       nop                     // regardless of the ESR value.
+       b       __vhe_hyp_call
+       nop
+       nop
+       nop
  alternative_endif
-
-       /*
-        * Check for Stage-1 page table walk, which is guaranteed
-        * to give a valid HPFAR_EL2.
-        */
-       tbnz    x1, #7, 1f      // S1PTW is set
-
-       /* Preserve PAR_EL1 */
-       mrs     x3, par_el1
-       push    x3, xzr
-
-       /*
-        * Permission fault, HPFAR_EL2 is invalid.
-        * Resolve the IPA the hard way using the guest VA.
-        * Stage-1 translation already validated the memory access rights.
-        * As such, we can use the EL1 translation regime, and don't have
-        * to distinguish between EL0 and EL1 access.
-        */
-       mrs     x2, far_el2
-       at      s1e1r, x2
-       isb
-
-       /* Read result */
-       mrs     x3, par_el1
-       pop     x0, xzr                 // Restore PAR_EL1 from the stack
-       msr     par_el1, x0
-       tbnz    x3, #0, 3f              // Bail out if we failed the translation
-       ubfx    x3, x3, #12, #36        // Extract IPA
-       lsl     x3, x3, #4              // and present it like HPFAR
-       b       2f
-
-1:     mrs     x3, hpfar_el2
-       mrs     x2, far_el2
-
-2:     mrs     x0, tpidr_el2
-       str     w1, [x0, #VCPU_ESR_EL2]
-       str     x2, [x0, #VCPU_FAR_EL2]
-       str     x3, [x0, #VCPU_HPFAR_EL2]
-
-       mov     x1, #ARM_EXCEPTION_TRAP
-       b       __kvm_vcpu_return
-
-       /*
-        * Translation failed. Just return to the guest and
-        * let it fault again. Another CPU is probably playing
-        * behind our back.
-        */
-3:     pop     x2, x3
-       pop     x0, x1
-
-       eret
-
-el1_irq:
-       push    x0, x1
-       push    x2, x3
-       mrs     x0, tpidr_el2
-       mov     x1, #ARM_EXCEPTION_IRQ
-       b       __kvm_vcpu_return
-
-       .ltorg
-
-       .align 11
-
-ENTRY(__kvm_hyp_vector)
-       ventry  el2t_sync_invalid               // Synchronous EL2t
-       ventry  el2t_irq_invalid                // IRQ EL2t
-       ventry  el2t_fiq_invalid                // FIQ EL2t
-       ventry  el2t_error_invalid              // Error EL2t
-
-       ventry  el2h_sync_invalid               // Synchronous EL2h
-       ventry  el2h_irq_invalid                // IRQ EL2h
-       ventry  el2h_fiq_invalid                // FIQ EL2h
-       ventry  el2h_error_invalid              // Error EL2h
-
-       ventry  el1_sync                        // Synchronous 64-bit EL1
-       ventry  el1_irq                         // IRQ 64-bit EL1
-       ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
-       ventry  el1_error_invalid               // Error 64-bit EL1
-
-       ventry  el1_sync                        // Synchronous 32-bit EL1
-       ventry  el1_irq                         // IRQ 32-bit EL1
-       ventry  el1_fiq_invalid                 // FIQ 32-bit EL1
-       ventry  el1_error_invalid               // Error 32-bit EL1
-ENDPROC(__kvm_hyp_vector)
-
-
-ENTRY(__kvm_get_mdcr_el2)
-       mrs     x0, mdcr_el2
-       ret
-ENDPROC(__kvm_get_mdcr_el2)
-
-       .popsection
+ENDPROC(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile

new file mode 100644 (file)

index 0000000..826032b
--- /dev/null
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Kernel-based Virtual Machine module, HYP part
+#
+
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += switch.o
+obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
+obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c

new file mode 100644 (file)

index 0000000..2f8bca8
--- /dev/null
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+#define read_debug(r,n)                read_sysreg(r##n##_el1)
+#define write_debug(v,r,n)     write_sysreg(v, r##n##_el1)
+
+#define save_debug(ptr,reg,nr)                                         \
+       switch (nr) { /* no break: entry at nr saves regs nr..0 */      \
+       case 15:        ptr[15] = read_debug(reg, 15);                  \
+       case 14:        ptr[14] = read_debug(reg, 14);                  \
+       case 13:        ptr[13] = read_debug(reg, 13);                  \
+       case 12:        ptr[12] = read_debug(reg, 12);                  \
+       case 11:        ptr[11] = read_debug(reg, 11);                  \
+       case 10:        ptr[10] = read_debug(reg, 10);                  \
+       case 9:         ptr[9] = read_debug(reg, 9);                    \
+       case 8:         ptr[8] = read_debug(reg, 8);                    \
+       case 7:         ptr[7] = read_debug(reg, 7);                    \
+       case 6:         ptr[6] = read_debug(reg, 6);                    \
+       case 5:         ptr[5] = read_debug(reg, 5);                    \
+       case 4:         ptr[4] = read_debug(reg, 4);                    \
+       case 3:         ptr[3] = read_debug(reg, 3);                    \
+       case 2:         ptr[2] = read_debug(reg, 2);                    \
+       case 1:         ptr[1] = read_debug(reg, 1);                    \
+       default:        ptr[0] = read_debug(reg, 0);                    \
+       }
+
+#define restore_debug(ptr,reg,nr)                                      \
+       switch (nr) { /* no break: entry at nr restores regs nr..0 */   \
+       case 15:        write_debug(ptr[15], reg, 15);                  \
+       case 14:        write_debug(ptr[14], reg, 14);                  \
+       case 13:        write_debug(ptr[13], reg, 13);                  \
+       case 12:        write_debug(ptr[12], reg, 12);                  \
+       case 11:        write_debug(ptr[11], reg, 11);                  \
+       case 10:        write_debug(ptr[10], reg, 10);                  \
+       case 9:         write_debug(ptr[9], reg, 9);                    \
+       case 8:         write_debug(ptr[8], reg, 8);                    \
+       case 7:         write_debug(ptr[7], reg, 7);                    \
+       case 6:         write_debug(ptr[6], reg, 6);                    \
+       case 5:         write_debug(ptr[5], reg, 5);                    \
+       case 4:         write_debug(ptr[4], reg, 4);                    \
+       case 3:         write_debug(ptr[3], reg, 3);                    \
+       case 2:         write_debug(ptr[2], reg, 2);                    \
+       case 1:         write_debug(ptr[1], reg, 1);                    \
+       default:        write_debug(ptr[0], reg, 0);                    \
+       }
+
+void __hyp_text __debug_save_state(struct kvm_vcpu *vcpu,
+                                  struct kvm_guest_debug_arch *dbg,
+                                  struct kvm_cpu_context *ctxt)
+{
+       u64 aa64dfr0;
+       int brps, wrps;
+
+       if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
+               return;         /* not flagged dirty: nothing to do */
+
+       aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
+       brps = (aa64dfr0 >> 12) & 0xf;  /* BRPs (count - 1), bits [15:12] */
+       wrps = (aa64dfr0 >> 20) & 0xf;  /* WRPs (count - 1), bits [23:20] */
+
+       save_debug(dbg->dbg_bcr, dbgbcr, brps); /* breakpoint control */
+       save_debug(dbg->dbg_bvr, dbgbvr, brps); /* breakpoint value */
+       save_debug(dbg->dbg_wcr, dbgwcr, wrps); /* watchpoint control */
+       save_debug(dbg->dbg_wvr, dbgwvr, wrps); /* watchpoint value */
+
+       ctxt->sys_regs[MDCCINT_EL1] = read_sysreg(mdccint_el1);
+}
+
+void __hyp_text __debug_restore_state(struct kvm_vcpu *vcpu,
+                                     struct kvm_guest_debug_arch *dbg,
+                                     struct kvm_cpu_context *ctxt)
+{
+       u64 aa64dfr0;
+       int brps, wrps;
+
+       if (!(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY))
+               return;         /* not flagged dirty: nothing to do */
+
+       aa64dfr0 = read_sysreg(id_aa64dfr0_el1);
+
+       brps = (aa64dfr0 >> 12) & 0xf;  /* BRPs (count - 1), bits [15:12] */
+       wrps = (aa64dfr0 >> 20) & 0xf;  /* WRPs (count - 1), bits [23:20] */
+
+       restore_debug(dbg->dbg_bcr, dbgbcr, brps);      /* bkpt control */
+       restore_debug(dbg->dbg_bvr, dbgbvr, brps);      /* bkpt value */
+       restore_debug(dbg->dbg_wcr, dbgwcr, wrps);      /* wpt control */
+       restore_debug(dbg->dbg_wvr, dbgwvr, wrps);      /* wpt value */
+
+       write_sysreg(ctxt->sys_regs[MDCCINT_EL1], mdccint_el1);
+}
+
+void __hyp_text __debug_cond_save_host_state(struct kvm_vcpu *vcpu)
+{
+       /* If KDE or MDE is set in the vcpu's MDSCR_EL1, force DEBUG_DIRTY so
+        * that the save below, which is skipped when clean, runs in full. */
+       if ((vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_KDE) ||
+           (vcpu->arch.ctxt.sys_regs[MDSCR_EL1] & DBG_MDSCR_MDE))
+               vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+
+       __debug_save_state(vcpu, &vcpu->arch.host_debug_state,
+                          kern_hyp_va(vcpu->arch.host_cpu_context));
+}
+
+void __hyp_text __debug_cond_restore_host_state(struct kvm_vcpu *vcpu)
+{
+       __debug_restore_state(vcpu, &vcpu->arch.host_debug_state,
+                             kern_hyp_va(vcpu->arch.host_cpu_context));
+       /* Restore done: drop the dirty flag (written only if it was set). */
+       if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
+               vcpu->arch.debug_flags &= ~KVM_ARM64_DEBUG_DIRTY;
+}
+
+static u32 __hyp_text __debug_read_mdcr_el2(void)
+{
+       return read_sysreg(mdcr_el2);   /* current MDCR_EL2 value */
+}
+/* Non-static name for the helper above, provided as a symbol alias. */
+__alias(__debug_read_mdcr_el2) u32 __kvm_get_mdcr_el2(void);
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S

new file mode 100644 (file)

index 0000000..fd0fbe9
--- /dev/null
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
+#include <asm/fpsimdmacros.h>
+#include <asm/kvm.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+#define CPU_GP_REG_OFFSET(x)   (CPU_GP_REGS + x)
+#define CPU_XREG_OFFSET(x)     CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
+
+       .text
+       .pushsection    .hyp.text, "ax"
+
+.macro save_callee_saved_regs ctxt
+       stp     x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] // x19-x29 and lr
+       stp     x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] // go to their slots
+       stp     x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] // in the context
+       stp     x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+       stp     x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+       stp     x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_callee_saved_regs ctxt
+       ldp     x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] // reload x19-x29
+       ldp     x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] // and lr from the
+       ldp     x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] // same slots
+       ldp     x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+       ldp     x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+       ldp     x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+/*
+ * u64 __guest_enter(struct kvm_vcpu *vcpu,
+ *                  struct kvm_cpu_context *host_ctxt);
+ */
+ENTRY(__guest_enter)
+       // x0: vcpu
+       // x1: host context on entry, reused below for the guest context
+       // x2-x18: overwritten with guest values below
+
+       // Store the host regs
+       save_callee_saved_regs x1
+
+       // Preserve vcpu & host_ctxt for use at exit time
+       stp     x0, x1, [sp, #-16]!
+
+       add     x1, x0, #VCPU_CONTEXT   // x1 = guest context inside the vcpu
+
+       // Prepare x0-x1 for later restore by pushing them onto the stack
+       ldp     x2, x3, [x1, #CPU_XREG_OFFSET(0)]
+       stp     x2, x3, [sp, #-16]!
+
+       // x2-x18
+       ldp     x2, x3,   [x1, #CPU_XREG_OFFSET(2)]
+       ldp     x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
+       ldp     x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
+       ldp     x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
+       ldp     x10, x11, [x1, #CPU_XREG_OFFSET(10)]
+       ldp     x12, x13, [x1, #CPU_XREG_OFFSET(12)]
+       ldp     x14, x15, [x1, #CPU_XREG_OFFSET(14)]
+       ldp     x16, x17, [x1, #CPU_XREG_OFFSET(16)]
+       ldr     x18,      [x1, #CPU_XREG_OFFSET(18)]
+
+       // x19-x29, lr
+       restore_callee_saved_regs x1
+
+       // Last bits of the 64bit state
+       ldp     x0, x1, [sp], #16       // guest x0/x1 pushed above
+
+       // Do not touch any register after this!
+       eret
+ENDPROC(__guest_enter)
+
+ENTRY(__guest_exit)
+       // x0: vcpu
+       // x1: return code
+       // x2-x3: free
+       // x4-x29,lr: vcpu regs
+       // vcpu x0-x3 on the stack
+
+       add     x2, x0, #VCPU_CONTEXT   // x2 = guest context inside the vcpu
+
+       stp     x4, x5,   [x2, #CPU_XREG_OFFSET(4)]
+       stp     x6, x7,   [x2, #CPU_XREG_OFFSET(6)]
+       stp     x8, x9,   [x2, #CPU_XREG_OFFSET(8)]
+       stp     x10, x11, [x2, #CPU_XREG_OFFSET(10)]
+       stp     x12, x13, [x2, #CPU_XREG_OFFSET(12)]
+       stp     x14, x15, [x2, #CPU_XREG_OFFSET(14)]
+       stp     x16, x17, [x2, #CPU_XREG_OFFSET(16)]
+       str     x18,      [x2, #CPU_XREG_OFFSET(18)]
+
+       ldp     x6, x7, [sp], #16       // x2, x3
+       ldp     x4, x5, [sp], #16       // x0, x1
+
+       stp     x4, x5, [x2, #CPU_XREG_OFFSET(0)]
+       stp     x6, x7, [x2, #CPU_XREG_OFFSET(2)]
+
+       save_callee_saved_regs x2
+
+       // Restore vcpu & host_ctxt from the stack
+       // (preserving return code in x1)
+       ldp     x0, x2, [sp], #16
+       // Now restore the host regs
+       restore_callee_saved_regs x2
+
+       mov     x0, x1                  // return code becomes the return value
+       ret
+ENDPROC(__guest_exit)
+
+ENTRY(__fpsimd_guest_restore)
+       stp     x4, lr, [sp, #-16]!     // x0-x3 were pushed by the trap entry
+
+       mrs     x2, cptr_el2
+       bic     x2, x2, #CPTR_EL2_TFP   // clear the FP/SIMD trap bit
+       msr     cptr_el2, x2
+       isb
+
+       mrs     x3, tpidr_el2           // x3 = vcpu pointer
+
+       ldr     x0, [x3, #VCPU_HOST_CONTEXT]
+       kern_hyp_va x0
+       add     x0, x0, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+       bl      __fpsimd_save_state     // save into the host context
+
+       add     x2, x3, #VCPU_CONTEXT
+       add     x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
+       bl      __fpsimd_restore_state  // load from the vcpu context
+
+       // Skip restoring fpexc32 for AArch64 guests
+       mrs     x1, hcr_el2
+       tbnz    x1, #HCR_RW_SHIFT, 1f
+       ldr     x4, [x3, #VCPU_FPEXC32_EL2]
+       msr     fpexc32_el2, x4
+1:
+       ldp     x4, lr, [sp], #16
+       ldp     x2, x3, [sp], #16       // undo the x0-x3 push done at
+       ldp     x0, x1, [sp], #16       // trap entry (save_x0_to_x3)
+
+       eret
+ENDPROC(__fpsimd_guest_restore)
diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S

new file mode 100644 (file)

index 0000000..da3f22c
--- /dev/null
+++ b/arch/arm64/kvm/hyp/fpsimd.S
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/fpsimdmacros.h>
+
+       .text
+       .pushsection    .hyp.text, "ax"
+
+ENTRY(__fpsimd_save_state)
+       fpsimd_save     x0, 1   // x0: fp_regs; 2nd arg presumably a scratch reg nr - confirm
+       ret
+ENDPROC(__fpsimd_save_state)
+
+ENTRY(__fpsimd_restore_state)
+       fpsimd_restore  x0, 1   // x0: fp_regs; 2nd arg presumably a scratch reg nr - confirm
+       ret
+ENDPROC(__fpsimd_restore_state)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S

new file mode 100644 (file)

index 0000000..44c79fd
--- /dev/null
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/alternative.h>
+#include <asm/assembler.h>
+#include <asm/asm-offsets.h>
+#include <asm/cpufeature.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+       .text
+       .pushsection    .hyp.text, "ax"
+
+.macro save_x0_to_x3
+       stp     x0, x1, [sp, #-16]!     // pushed first: deeper in the stack
+       stp     x2, x3, [sp, #-16]!     // pushed last: at the top
+.endm
+
+.macro restore_x0_to_x3
+       ldp     x2, x3, [sp], #16       // reverse order of save_x0_to_x3
+       ldp     x0, x1, [sp], #16
+.endm
+
+.macro do_el2_call
+       /*
+        * Shuffle the parameters before calling the function
+        * pointed to in x0. Assumes parameters in x[1,2,3].
+        */
+       mov     lr, x0                  // lr holds the target: it is clobbered
+       mov     x0, x1
+       mov     x1, x2
+       mov     x2, x3
+       blr     lr
+.endm
+
+ENTRY(__vhe_hyp_call)
+       str     lr, [sp, #-16]!         // do_el2_call overwrites lr
+       do_el2_call
+       ldr     lr, [sp], #16
+       /*
+        * We used to rely on having an exception return to get
+        * an implicit isb. In the E2H case, we don't have it anymore.
+        * Rather than changing all the leaf functions, just do it here
+        * before returning to the rest of the kernel.
+        */
+       isb
+       ret
+ENDPROC(__vhe_hyp_call)
+       
+el1_sync:                              // Sync exception: guest trap or host HVC
+       save_x0_to_x3
+
+       mrs     x1, esr_el2
+       lsr     x2, x1, #ESR_ELx_EC_SHIFT       // x2 = exception class
+
+       cmp     x2, #ESR_ELx_EC_HVC64
+       b.ne    el1_trap
+
+       mrs     x3, vttbr_el2           // If vttbr is valid, the 64bit guest
+       cbnz    x3, el1_trap            // called HVC
+
+       /* Here, we're pretty sure the host called HVC. */
+       restore_x0_to_x3
+
+       cmp     x0, #HVC_GET_VECTORS
+       b.ne    1f
+       mrs     x0, vbar_el2            // return the EL2 vector base in x0
+       b       2f
+
+1:
+       /*
+        * Perform the EL2 call
+        */
+       kern_hyp_va     x0              // function pointer -> HYP VA
+       do_el2_call
+
+2:     eret
+
+el1_trap:
+       /*
+        * x1: ESR
+        * x2: ESR_EC
+        */
+
+       /* Guest accessed VFP/SIMD registers, save host, restore Guest */
+       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       b.eq    __fpsimd_guest_restore
+
+       cmp     x2, #ESR_ELx_EC_DABT_LOW
+       mov     x0, #ESR_ELx_EC_IABT_LOW
+       ccmp    x2, x0, #4, ne  // Z set iff EC is DABT_LOW or IABT_LOW
+       b.ne    1f              // Not an abort we care about
+
+       /* This is an abort. Check for permission fault */
+alternative_if_not ARM64_WORKAROUND_834220
+       and     x2, x1, #ESR_ELx_FSC_TYPE
+       cmp     x2, #FSC_PERM
+       b.ne    1f              // Not a permission fault
+alternative_else
+       nop                     // Use the permission fault path to
+       nop                     // check for a valid S1 translation,
+       nop                     // regardless of the ESR value.
+alternative_endif
+
+       /*
+        * Check for Stage-1 page table walk, which is guaranteed
+        * to give a valid HPFAR_EL2.
+        */
+       tbnz    x1, #7, 1f      // S1PTW is set
+
+       /* Preserve PAR_EL1 */
+       mrs     x3, par_el1
+       stp     x3, xzr, [sp, #-16]!
+
+       /*
+        * Permission fault, HPFAR_EL2 is invalid.
+        * Resolve the IPA the hard way using the guest VA.
+        * Stage-1 translation already validated the memory access rights.
+        * As such, we can use the EL1 translation regime, and don't have
+        * to distinguish between EL0 and EL1 access.
+        */
+       mrs     x2, far_el2
+       at      s1e1r, x2
+       isb
+
+       /* Read result */
+       mrs     x3, par_el1
+       ldp     x0, xzr, [sp], #16      // Restore PAR_EL1 from the stack
+       msr     par_el1, x0
+       tbnz    x3, #0, 3f              // Bail out if we failed the translation
+       ubfx    x3, x3, #12, #36        // Extract IPA
+       lsl     x3, x3, #4              // and present it like HPFAR
+       b       2f
+
+1:     mrs     x3, hpfar_el2
+       mrs     x2, far_el2
+
+2:     mrs     x0, tpidr_el2           // x0 = vcpu
+       str     w1, [x0, #VCPU_ESR_EL2] // record ESR, FAR and HPFAR
+       str     x2, [x0, #VCPU_FAR_EL2] // in the vcpu
+       str     x3, [x0, #VCPU_HPFAR_EL2]
+
+       mov     x1, #ARM_EXCEPTION_TRAP
+       b       __guest_exit            // x0 = vcpu, x1 = return code
+
+       /*
+        * Translation failed. Just return to the guest and
+        * let it fault again. Another CPU is probably playing
+        * behind our back.
+        */
+3:     restore_x0_to_x3
+
+       eret
+
+el1_irq:
+       save_x0_to_x3                   // __guest_exit pops this frame
+       mrs     x0, tpidr_el2           // x0 = vcpu
+       mov     x1, #ARM_EXCEPTION_IRQ  // x1 = return code
+       b       __guest_exit
+
+ENTRY(__hyp_do_panic)
+       mov     lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+                     PSR_MODE_EL1h)
+       msr     spsr_el2, lr            // eret target state: EL1h, DAIF masked
+       ldr     lr, =panic
+       msr     elr_el2, lr             // eret target address: panic()
+       eret
+ENDPROC(__hyp_do_panic)
+
+.macro invalid_vector  label, target = __hyp_panic
+       .align  2
+\label:
+       b \target                       // __hyp_panic unless overridden
+ENDPROC(\label)
+.endm
+
+       /* None of these should ever happen */
+       invalid_vector  el2t_sync_invalid
+       invalid_vector  el2t_irq_invalid
+       invalid_vector  el2t_fiq_invalid
+       invalid_vector  el2t_error_invalid
+       invalid_vector  el2h_sync_invalid
+       invalid_vector  el2h_irq_invalid
+       invalid_vector  el2h_fiq_invalid
+       invalid_vector  el2h_error_invalid
+       invalid_vector  el1_sync_invalid
+       invalid_vector  el1_irq_invalid
+       invalid_vector  el1_fiq_invalid
+       invalid_vector  el1_error_invalid
+
+       .ltorg                          // emit the literal pool (ldr lr, =panic)
+
+       .align 11                       // 2^11 = 2KB alignment for the table
+
+ENTRY(__kvm_hyp_vector)
+       ventry  el2t_sync_invalid               // Synchronous EL2t
+       ventry  el2t_irq_invalid                // IRQ EL2t
+       ventry  el2t_fiq_invalid                // FIQ EL2t
+       ventry  el2t_error_invalid              // Error EL2t
+
+       ventry  el2h_sync_invalid               // Synchronous EL2h
+       ventry  el2h_irq_invalid                // IRQ EL2h
+       ventry  el2h_fiq_invalid                // FIQ EL2h
+       ventry  el2h_error_invalid              // Error EL2h
+
+       ventry  el1_sync                        // Synchronous 64-bit EL1
+       ventry  el1_irq                         // IRQ 64-bit EL1
+       ventry  el1_fiq_invalid                 // FIQ 64-bit EL1
+       ventry  el1_error_invalid               // Error 64-bit EL1
+
+       ventry  el1_sync                        // Synchronous 32-bit EL1
+       ventry  el1_irq                         // IRQ 32-bit EL1
+       ventry  el1_fiq_invalid                 // FIQ 32-bit EL1
+       ventry  el1_error_invalid               // Error 32-bit EL1
+ENDPROC(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/hyp.h b/arch/arm64/kvm/hyp/hyp.h

new file mode 100644 (file)

index 0000000..fb27517
--- /dev/null
+++ b/arch/arm64/kvm/hyp/hyp.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_KVM_HYP_H__
+#define __ARM64_KVM_HYP_H__
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
+#include <asm/sysreg.h>
+
+#define __hyp_text __section(.hyp.text) notrace /* code goes in .hyp.text */
+/* kernel VA -> HYP VA by masking; HYP VA -> kernel VA by rebasing */
+#define kern_hyp_va(v) (typeof(v))((unsigned long)(v) & HYP_PAGE_OFFSET_MASK)
+#define hyp_kern_va(v) (typeof(v))((unsigned long)(v) - HYP_PAGE_OFFSET \
+                                                     + PAGE_OFFSET)
+
+/**
+ * hyp_alternate_select - Generates patchable code sequences that are
+ * used to switch between two implementations of a function, depending
+ * on the availability of a feature.
+ *
+ * @fname: a symbol name that will be defined as a function returning a
+ * function pointer whose type will match @orig and @alt
+ * @orig: A pointer to the default function, as returned by @fname when
+ * @cond doesn't hold
+ * @alt: A pointer to the alternate function, as returned by @fname
+ * when @cond holds
+ * @cond: a CPU feature (as described in asm/cpufeature.h)
+ */
+#define hyp_alternate_select(fname, orig, alt, cond)                   \
+typeof(orig) * __hyp_text fname(void)                                  \
+{                                                                      \
+       typeof(alt) *val = orig; /* stays orig unless patched */        \
+       asm volatile(ALTERNATIVE("nop           \n",                    \
+                                "mov   %0, %1  \n",                    \
+                                cond)                                  \
+                    : "+r" (val) : "r" (alt));                         \
+       return val;                                                     \
+}
+
+void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
+void __timer_save_state(struct kvm_vcpu *vcpu);
+void __timer_restore_state(struct kvm_vcpu *vcpu);
+
+void __sysreg_save_state(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
+void __sysreg32_save_state(struct kvm_vcpu *vcpu);
+void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
+
+void __debug_save_state(struct kvm_vcpu *vcpu,
+                       struct kvm_guest_debug_arch *dbg,
+                       struct kvm_cpu_context *ctxt);
+void __debug_restore_state(struct kvm_vcpu *vcpu,
+                          struct kvm_guest_debug_arch *dbg,
+                          struct kvm_cpu_context *ctxt);
+void __debug_cond_save_host_state(struct kvm_vcpu *vcpu);
+void __debug_cond_restore_host_state(struct kvm_vcpu *vcpu);
+
+void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
+void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+static inline bool __fpsimd_enabled(void) /* true if CPTR_EL2.TFP is clear */
+{
+       return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
+}
+
+u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+void __noreturn __hyp_do_panic(unsigned long, ...);
+
+#endif /* __ARM64_KVM_HYP_H__ */
+
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c

new file mode 100644 (file)

index 0000000..ca8f5a5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hyp.h"
+
+static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
+{      /* Program HCR_EL2, HSTR_EL2, CPTR_EL2 and MDCR_EL2 for running this vcpu. */
+       u64 val;
+
+       /*
+        * We are about to set CPTR_EL2.TFP to trap all floating point
+        * register accesses to EL2, however, the ARM ARM clearly states that
+        * traps are only taken to EL2 if the operation would not otherwise
+        * trap to EL1.  Therefore, always make sure that for 32-bit guests,
+        * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+        */
+       val = vcpu->arch.hcr_el2;
+       if (!(val & HCR_RW)) {  /* HCR_RW clear: the 32-bit guest case described above */
+               write_sysreg(1 << 30, fpexc32_el2);     /* bit 30: the FPEXC.EN bit mentioned above */
+               isb();
+       }
+       write_sysreg(val, hcr_el2);
+       /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
+       write_sysreg(1 << 15, hstr_el2);
+       write_sysreg(CPTR_EL2_TTA | CPTR_EL2_TFP, cptr_el2);
+       write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+}
+
+static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
+{      /* Rewrite the four registers programmed by __activate_traps(); vcpu is not used here. */
+       write_sysreg(HCR_RW, hcr_el2);
+       write_sysreg(0, hstr_el2);
+       write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);     /* keep only the bits in MDCR_EL2_HPMN_MASK */
+       write_sysreg(0, cptr_el2);
+}
+
+static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
+{      /* Load VTTBR_EL2 with the vttbr value of the VM this vcpu belongs to. */
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);       /* vcpu->kvm is converted with kern_hyp_va() before use */
+       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
+static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
+{      /* Write 0 to VTTBR_EL2; vcpu is not used here. */
+       write_sysreg(0, vttbr_el2);
+}
+
+static hyp_alternate_select(__vgic_call_save_state,    /* NOTE(review): arguments look like (name, default, alternative, capability) -- confirm in hyp.h */
+                           __vgic_v2_save_state, __vgic_v3_save_state,
+                           ARM64_HAS_SYSREG_GIC_CPUIF);
+
+static hyp_alternate_select(__vgic_call_restore_state, /* same pattern; used below as __vgic_call_restore_state()(vcpu) */
+                           __vgic_v2_restore_state, __vgic_v3_restore_state,
+                           ARM64_HAS_SYSREG_GIC_CPUIF);
+
+static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
+{      /* Run the selected vgic save routine, then clear HCR_INT_OVERRIDE in HCR_EL2. */
+       __vgic_call_save_state()(vcpu); /* the selector returns the function that is then called with vcpu */
+       write_sysreg(read_sysreg(hcr_el2) & ~HCR_INT_OVERRIDE, hcr_el2);
+}
+
+static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
+{      /* OR HCR_INT_OVERRIDE and the vcpu's irq_lines into HCR_EL2, then run the selected vgic restore routine. */
+       u64 val;
+
+       val = read_sysreg(hcr_el2);
+       val |=  HCR_INT_OVERRIDE;
+       val |= vcpu->arch.irq_lines;
+       write_sysreg(val, hcr_el2);
+
+       __vgic_call_restore_state()(vcpu);
+}
+
+static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+{      /* Save host state, load guest state, enter the guest, then save guest state and restore the host. */
+       struct kvm_cpu_context *host_ctxt;
+       struct kvm_cpu_context *guest_ctxt;
+       bool fp_enabled;
+       u64 exit_code;
+
+       vcpu = kern_hyp_va(vcpu);
+       write_sysreg(vcpu, tpidr_el2);  /* __hyp_panic() reads the vcpu pointer back from tpidr_el2 */
+
+       host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+       guest_ctxt = &vcpu->arch.ctxt;
+
+       __sysreg_save_state(host_ctxt);
+       __debug_cond_save_host_state(vcpu);
+
+       __activate_traps(vcpu);
+       __activate_vm(vcpu);
+
+       __vgic_restore_state(vcpu);
+       __timer_restore_state(vcpu);
+
+       /*
+        * We must restore the 32-bit state before the sysregs, thanks
+        * to Cortex-A57 erratum #852523.
+        */
+       __sysreg32_restore_state(vcpu);
+       __sysreg_restore_state(guest_ctxt);
+       __debug_restore_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
+
+       /* Jump in the fire! */
+       exit_code = __guest_enter(vcpu, host_ctxt);
+       /* And we're baaack! */
+
+       fp_enabled = __fpsimd_enabled();        /* sampled before __deactivate_traps() writes 0 to cptr_el2 */
+
+       __sysreg_save_state(guest_ctxt);
+       __sysreg32_save_state(vcpu);
+       __timer_save_state(vcpu);
+       __vgic_save_state(vcpu);
+
+       __deactivate_traps(vcpu);
+       __deactivate_vm(vcpu);
+
+       __sysreg_restore_state(host_ctxt);
+
+       if (fp_enabled) {       /* FP registers are only swapped when the check above found CPTR_EL2.TFP clear */
+               __fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
+               __fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
+       }
+
+       __debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
+       __debug_cond_restore_host_state(vcpu);
+
+       return exit_code;       /* NOTE(review): the u64 exit_code is converted to the int return type here */
+}
+
+__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
+static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
+
+void __hyp_text __noreturn __hyp_panic(void)
+{      /* Gather the EL2 exception registers and pass them, with the format string, to __hyp_do_panic(). */
+       unsigned long str_va = (unsigned long)__hyp_panic_string;
+       u64 spsr = read_sysreg(spsr_el2);       /* spsr_el2, elr_el2 and par_el1 are read up front: */
+       u64 elr = read_sysreg(elr_el2);         /* __sysreg_restore_state() below writes all three */
+       u64 par = read_sysreg(par_el1);
+
+       if (read_sysreg(vttbr_el2)) {   /* NOTE(review): non-zero VTTBR_EL2 presumably means a guest was loaded -- confirm */
+               struct kvm_vcpu *vcpu;
+               struct kvm_cpu_context *host_ctxt;
+
+               vcpu = (struct kvm_vcpu *)read_sysreg(tpidr_el2);       /* pointer stored by __guest_run() */
+               host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+               __deactivate_traps(vcpu);
+               __deactivate_vm(vcpu);
+               __sysreg_restore_state(host_ctxt);
+       }
+
+       /* Call panic for real */
+       __hyp_do_panic(hyp_kern_va(str_va),     /* the format string address is converted with hyp_kern_va() */
+                      spsr,  elr,
+                      read_sysreg(esr_el2),   read_sysreg(far_el2),
+                      read_sysreg(hpfar_el2), par,
+                      (void *)read_sysreg(tpidr_el2));
+
+       unreachable();
+}
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c

new file mode 100644 (file)

index 0000000..4256309
--- /dev/null
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+/* Copy the system registers listed below into ctxt; ctxt is already in the HYP VA space */
+void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+{
+       ctxt->sys_regs[MPIDR_EL1]       = read_sysreg(vmpidr_el2);      /* the MPIDR_EL1 slot is filled from vmpidr_el2 */
+       ctxt->sys_regs[CSSELR_EL1]      = read_sysreg(csselr_el1);
+       ctxt->sys_regs[SCTLR_EL1]       = read_sysreg(sctlr_el1);
+       ctxt->sys_regs[ACTLR_EL1]       = read_sysreg(actlr_el1);
+       ctxt->sys_regs[CPACR_EL1]       = read_sysreg(cpacr_el1);
+       ctxt->sys_regs[TTBR0_EL1]       = read_sysreg(ttbr0_el1);
+       ctxt->sys_regs[TTBR1_EL1]       = read_sysreg(ttbr1_el1);
+       ctxt->sys_regs[TCR_EL1]         = read_sysreg(tcr_el1);
+       ctxt->sys_regs[ESR_EL1]         = read_sysreg(esr_el1);
+       ctxt->sys_regs[AFSR0_EL1]       = read_sysreg(afsr0_el1);
+       ctxt->sys_regs[AFSR1_EL1]       = read_sysreg(afsr1_el1);
+       ctxt->sys_regs[FAR_EL1]         = read_sysreg(far_el1);
+       ctxt->sys_regs[MAIR_EL1]        = read_sysreg(mair_el1);
+       ctxt->sys_regs[VBAR_EL1]        = read_sysreg(vbar_el1);
+       ctxt->sys_regs[CONTEXTIDR_EL1]  = read_sysreg(contextidr_el1);
+       ctxt->sys_regs[TPIDR_EL0]       = read_sysreg(tpidr_el0);
+       ctxt->sys_regs[TPIDRRO_EL0]     = read_sysreg(tpidrro_el0);
+       ctxt->sys_regs[TPIDR_EL1]       = read_sysreg(tpidr_el1);
+       ctxt->sys_regs[AMAIR_EL1]       = read_sysreg(amair_el1);
+       ctxt->sys_regs[CNTKCTL_EL1]     = read_sysreg(cntkctl_el1);
+       ctxt->sys_regs[PAR_EL1]         = read_sysreg(par_el1);
+       ctxt->sys_regs[MDSCR_EL1]       = read_sysreg(mdscr_el1);
+
+       ctxt->gp_regs.regs.sp           = read_sysreg(sp_el0);
+       ctxt->gp_regs.regs.pc           = read_sysreg(elr_el2);         /* the pc slot is filled from elr_el2 */
+       ctxt->gp_regs.regs.pstate       = read_sysreg(spsr_el2);        /* the pstate slot is filled from spsr_el2 */
+       ctxt->gp_regs.sp_el1            = read_sysreg(sp_el1);
+       ctxt->gp_regs.elr_el1           = read_sysreg(elr_el1);
+       ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg(spsr_el1);
+}
+
+void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+{      /* Write back the same registers, from the same ctxt slots, that __sysreg_save_state() fills. */
+       write_sysreg(ctxt->sys_regs[MPIDR_EL1],   vmpidr_el2);  /* the MPIDR_EL1 slot goes to vmpidr_el2 */
+       write_sysreg(ctxt->sys_regs[CSSELR_EL1],  csselr_el1);
+       write_sysreg(ctxt->sys_regs[SCTLR_EL1],   sctlr_el1);
+       write_sysreg(ctxt->sys_regs[ACTLR_EL1],   actlr_el1);
+       write_sysreg(ctxt->sys_regs[CPACR_EL1],   cpacr_el1);
+       write_sysreg(ctxt->sys_regs[TTBR0_EL1],   ttbr0_el1);
+       write_sysreg(ctxt->sys_regs[TTBR1_EL1],   ttbr1_el1);
+       write_sysreg(ctxt->sys_regs[TCR_EL1],     tcr_el1);
+       write_sysreg(ctxt->sys_regs[ESR_EL1],     esr_el1);
+       write_sysreg(ctxt->sys_regs[AFSR0_EL1],   afsr0_el1);
+       write_sysreg(ctxt->sys_regs[AFSR1_EL1],   afsr1_el1);
+       write_sysreg(ctxt->sys_regs[FAR_EL1],     far_el1);
+       write_sysreg(ctxt->sys_regs[MAIR_EL1],    mair_el1);
+       write_sysreg(ctxt->sys_regs[VBAR_EL1],    vbar_el1);
+       write_sysreg(ctxt->sys_regs[CONTEXTIDR_EL1], contextidr_el1);
+       write_sysreg(ctxt->sys_regs[TPIDR_EL0],   tpidr_el0);
+       write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
+       write_sysreg(ctxt->sys_regs[TPIDR_EL1],   tpidr_el1);
+       write_sysreg(ctxt->sys_regs[AMAIR_EL1],   amair_el1);
+       write_sysreg(ctxt->sys_regs[CNTKCTL_EL1], cntkctl_el1);
+       write_sysreg(ctxt->sys_regs[PAR_EL1],     par_el1);
+       write_sysreg(ctxt->sys_regs[MDSCR_EL1],   mdscr_el1);
+
+       write_sysreg(ctxt->gp_regs.regs.sp,     sp_el0);
+       write_sysreg(ctxt->gp_regs.regs.pc,     elr_el2);       /* the pc slot goes to elr_el2 */
+       write_sysreg(ctxt->gp_regs.regs.pstate, spsr_el2);      /* the pstate slot goes to spsr_el2 */
+       write_sysreg(ctxt->gp_regs.sp_el1,      sp_el1);
+       write_sysreg(ctxt->gp_regs.elr_el1,     elr_el1);
+       write_sysreg(ctxt->gp_regs.spsr[KVM_SPSR_EL1], spsr_el1);
+}
+
+void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
+{      /* Save the AArch32-only registers into the vcpu context; does nothing when HCR_EL2.RW is set. */
+       u64 *spsr, *sysreg;
+
+       if (read_sysreg(hcr_el2) & HCR_RW)      /* checked against the live HCR_EL2 value, not vcpu->arch.hcr_el2 */
+               return;
+
+       spsr = vcpu->arch.ctxt.gp_regs.spsr;
+       sysreg = vcpu->arch.ctxt.sys_regs;
+
+       spsr[KVM_SPSR_ABT] = read_sysreg(spsr_abt);
+       spsr[KVM_SPSR_UND] = read_sysreg(spsr_und);
+       spsr[KVM_SPSR_IRQ] = read_sysreg(spsr_irq);
+       spsr[KVM_SPSR_FIQ] = read_sysreg(spsr_fiq);
+
+       sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
+       sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
+
+       if (__fpsimd_enabled())         /* FPEXC32_EL2 is saved only while CPTR_EL2.TFP is clear */
+               sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
+
+       if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)     /* DBGVCR32_EL2 is saved only when the dirty flag is set */
+               sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
+}
+
+void __hyp_text __sysreg32_restore_state(struct kvm_vcpu *vcpu)
+{      /* Write the AArch32-only registers back from the vcpu context; does nothing when HCR_EL2.RW is set. */
+       u64 *spsr, *sysreg;
+
+       if (read_sysreg(hcr_el2) & HCR_RW)      /* checked against the live HCR_EL2 value, not vcpu->arch.hcr_el2 */
+               return;
+
+       spsr = vcpu->arch.ctxt.gp_regs.spsr;
+       sysreg = vcpu->arch.ctxt.sys_regs;
+
+       write_sysreg(spsr[KVM_SPSR_ABT], spsr_abt);
+       write_sysreg(spsr[KVM_SPSR_UND], spsr_und);
+       write_sysreg(spsr[KVM_SPSR_IRQ], spsr_irq);
+       write_sysreg(spsr[KVM_SPSR_FIQ], spsr_fiq);
+
+       write_sysreg(sysreg[DACR32_EL2], dacr32_el2);
+       write_sysreg(sysreg[IFSR32_EL2], ifsr32_el2);   /* note: FPEXC32_EL2 is not written by this function */
+
+       if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)     /* DBGVCR32_EL2 is restored only when the dirty flag is set */
+               write_sysreg(sysreg[DBGVCR32_EL2], dbgvcr32_el2);
+}
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c

new file mode 100644 (file)

index 0000000..1051e5d
--- /dev/null
+++ b/arch/arm64/kvm/hyp/timer-sr.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <clocksource/arm_arch_timer.h>
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+/* Save the virtual timer registers and set up timer access for the host; vcpu is already in the HYP VA space */
+void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       u64 val;
+
+       if (kvm->arch.timer.enabled) {  /* ctl/cval are only recorded when the VM's timer is enabled */
+               timer->cntv_ctl = read_sysreg(cntv_ctl_el0);
+               timer->cntv_cval = read_sysreg(cntv_cval_el0);
+       }
+
+       /* Disable the virtual timer */
+       write_sysreg(0, cntv_ctl_el0);  /* done whether or not the state was recorded above */
+
+       /* Allow physical timer/counter access for the host */
+       val = read_sysreg(cnthctl_el2);
+       val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+       write_sysreg(val, cnthctl_el2);
+
+       /* Clear cntvoff for the host */
+       write_sysreg(0, cntvoff_el2);
+}
+
+void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu)
+{      /* Adjust CNTHCTL_EL2 for the guest and, if the VM's timer is enabled, reload cntvoff, cval and ctl. */
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       u64 val;
+
+       /*
+        * Disallow physical timer access for the guest.
+        * Physical counter access is allowed.
+        */
+       val = read_sysreg(cnthctl_el2);
+       val &= ~CNTHCTL_EL1PCEN;
+       val |= CNTHCTL_EL1PCTEN;
+       write_sysreg(val, cnthctl_el2);
+
+       if (kvm->arch.timer.enabled) {
+               write_sysreg(kvm->arch.timer.cntvoff, cntvoff_el2);     /* the offset comes from the VM, cval/ctl from the vcpu */
+               write_sysreg(timer->cntv_cval, cntv_cval_el0);
+               isb();  /* NOTE(review): presumably so cval is in place before ctl can enable the timer -- confirm */
+               write_sysreg(timer->cntv_ctl, cntv_ctl_el0);
+       }
+}
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c

new file mode 100644 (file)

index 0000000..2a7e0d8
--- /dev/null
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hyp.h"
+
+static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{      /* Invalidate the TLB entries for one IPA under kvm's VTTBR, then write 0 back to VTTBR_EL2. */
+       dsb(ishst);
+
+       /* Switch to requested VMID */
+       kvm = kern_hyp_va(kvm);
+       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+       isb();
+
+       /*
+        * We could do so much better if we had the VA as well.
+        * Instead, we invalidate Stage-2 for this IPA, and the
+        * whole of Stage-1. Weep...
+        */
+       ipa >>= 12;     /* the tlbi operand below is the IPA shifted right by 12 bits */
+       asm volatile("tlbi ipas2e1is, %0" : : "r" (ipa));
+
+       /*
+        * We have to ensure completion of the invalidation at Stage-2,
+        * since a table walk on another CPU could refill a TLB with a
+        * complete (S1 + S2) walk based on the old Stage-2 mapping if
+        * the Stage-1 invalidation happened first.
+        */
+       dsb(ish);
+       asm volatile("tlbi vmalle1is" : : );
+       dsb(ish);
+       isb();
+
+       write_sysreg(0, vttbr_el2);     /* VTTBR_EL2 is left at 0 on return */
+}
+
+__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
+                                                           phys_addr_t ipa);
+
+static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+{      /* Issue "tlbi vmalls12e1is" under kvm's VTTBR, then write 0 back to VTTBR_EL2. */
+       dsb(ishst);
+
+       /* Switch to requested VMID */
+       kvm = kern_hyp_va(kvm);
+       write_sysreg(kvm->arch.vttbr, vttbr_el2);
+       isb();
+
+       asm volatile("tlbi vmalls12e1is" : : );
+       dsb(ish);
+       isb();
+
+       write_sysreg(0, vttbr_el2);     /* VTTBR_EL2 is left at 0 on return */
+}
+
+__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
+
+static void __hyp_text __tlb_flush_vm_context(void)
+{      /* Issue "tlbi alle1is" and "ic ialluis" between two barriers; VTTBR_EL2 is not touched. */
+       dsb(ishst);
+       asm volatile("tlbi alle1is      \n"
+                    "ic ialluis          ": : );
+       dsb(ish);
+}
+
+__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
diff --git a/arch/arm64/kvm/hyp/vgic-v2-sr.c b/arch/arm64/kvm/hyp/vgic-v2-sr.c

new file mode 100644 (file)

index 0000000..e717612
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v2-sr.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+/* Read the GICH_* state into cpu_if and write 0 to GICH_HCR; vcpu is already in the HYP VA space */
+void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       void __iomem *base = kern_hyp_va(vgic->vctrl_base);
+       u32 eisr0, eisr1, elrsr0, elrsr1;
+       int i, nr_lr;
+
+       if (!base)      /* NOTE(review): base went through kern_hyp_va(); confirm an unset vctrl_base still yields NULL */
+               return;
+
+       nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR);
+       cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR);
+       eisr0  = readl_relaxed(base + GICH_EISR0);
+       elrsr0 = readl_relaxed(base + GICH_ELRSR0);
+       if (unlikely(nr_lr > 32)) {     /* the second EISR/ELRSR word is only read for more than 32 list registers */
+               eisr1  = readl_relaxed(base + GICH_EISR1);
+               elrsr1 = readl_relaxed(base + GICH_ELRSR1);
+       } else {
+               eisr1 = elrsr1 = 0;
+       }
+#ifdef CONFIG_CPU_BIG_ENDIAN
+       cpu_if->vgic_eisr  = ((u64)eisr0 << 32) | eisr1;        /* big-endian build: word 0 goes in the upper 32 bits */
+       cpu_if->vgic_elrsr = ((u64)elrsr0 << 32) | elrsr1;
+#else
+       cpu_if->vgic_eisr  = ((u64)eisr1 << 32) | eisr0;        /* otherwise word 0 goes in the lower 32 bits */
+       cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0;
+#endif
+       cpu_if->vgic_apr    = readl_relaxed(base + GICH_APR);
+
+       writel_relaxed(0, base + GICH_HCR);
+
+       for (i = 0; i < nr_lr; i++)     /* list register i is read at GICH_LR0 + 4 * i */
+               cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
+}
+
+/* Write GICH_HCR, GICH_VMCR, GICH_APR and the list registers from cpu_if; vcpu is already in the HYP VA space */
+void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+       struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+       struct vgic_dist *vgic = &kvm->arch.vgic;
+       void __iomem *base = kern_hyp_va(vgic->vctrl_base);
+       int i, nr_lr;
+
+       if (!base)      /* NOTE(review): base went through kern_hyp_va(); confirm an unset vctrl_base still yields NULL */
+               return;
+
+       writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+       writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
+       writel_relaxed(cpu_if->vgic_apr, base + GICH_APR);
+
+       nr_lr = vcpu->arch.vgic_cpu.nr_lr;
+       for (i = 0; i < nr_lr; i++)     /* list register i is written at GICH_LR0 + 4 * i */
+               writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4));
+}
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c

new file mode 100644 (file)

index 0000000..9142e08
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+
+#include "hyp.h"
+
+#define vtr_to_max_lr_idx(v)           ((v) & 0xf)     /* bits [3:0] of the value; callers pass ICH_VTR_EL2 */
+#define vtr_to_nr_pri_bits(v)          (((u32)(v) >> 29) + 1)  /* bits [31:29] of the value, plus one */
+
+#define read_gicreg(r)                                                 \
+       ({                                                              \
+               u64 reg;                                                \
+               asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \
+               reg;                                                    \
+       })
+
+#define write_gicreg(v,r)                                              \
+       do {                                                            \
+               u64 __val = (v);                                        \
+               asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
+       } while (0)
+
+/* Save the ICH_* state into cpu_if, write 0 to ICH_HCR_EL2 and set ICC_SRE_EL1 to 1; vcpu is already in the HYP VA space */
+void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
+{
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+       u32 max_lr_idx, nr_pri_bits;
+
+       /*
+        * Make sure stores to the GIC via the memory mapped interface
+        * are now visible to the system register interface.
+        */
+       dsb(st);
+
+       cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
+       cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
+       cpu_if->vgic_eisr  = read_gicreg(ICH_EISR_EL2);
+       cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
+
+       write_gicreg(0, ICH_HCR_EL2);
+       val = read_gicreg(ICH_VTR_EL2);
+       max_lr_idx = vtr_to_max_lr_idx(val);
+       nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+       switch (max_lr_idx) {   /* no breaks: entry at max_lr_idx falls through to case 0, saving LR<max>..LR0 */
+       case 15:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(15)] = read_gicreg(ICH_LR15_EL2);
+       case 14:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(14)] = read_gicreg(ICH_LR14_EL2);
+       case 13:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(13)] = read_gicreg(ICH_LR13_EL2);
+       case 12:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(12)] = read_gicreg(ICH_LR12_EL2);
+       case 11:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(11)] = read_gicreg(ICH_LR11_EL2);
+       case 10:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(10)] = read_gicreg(ICH_LR10_EL2);
+       case 9:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(9)] = read_gicreg(ICH_LR9_EL2);
+       case 8:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(8)] = read_gicreg(ICH_LR8_EL2);
+       case 7:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(7)] = read_gicreg(ICH_LR7_EL2);
+       case 6:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(6)] = read_gicreg(ICH_LR6_EL2);
+       case 5:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(5)] = read_gicreg(ICH_LR5_EL2);
+       case 4:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(4)] = read_gicreg(ICH_LR4_EL2);
+       case 3:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(3)] = read_gicreg(ICH_LR3_EL2);
+       case 2:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(2)] = read_gicreg(ICH_LR2_EL2);
+       case 1:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(1)] = read_gicreg(ICH_LR1_EL2);
+       case 0:
+               cpu_if->vgic_lr[VGIC_V3_LR_INDEX(0)] = read_gicreg(ICH_LR0_EL2);
+       }
+
+       switch (nr_pri_bits) {  /* no breaks: 7 saves AP0R3..AP0R0, 6 saves AP0R1..AP0R0, anything else AP0R0 only */
+       case 7:
+               cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
+               cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
+       case 6:
+               cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
+       default:
+               cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
+       }
+
+       switch (nr_pri_bits) {  /* same fall-through pattern for the AP1R registers */
+       case 7:
+               cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
+               cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
+       case 6:
+               cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
+       default:
+               cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+       }
+
+       val = read_gicreg(ICC_SRE_EL2);
+       write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+       isb(); /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+       write_gicreg(1, ICC_SRE_EL1);
+}
+
+void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
+{      /* Write ICC_SRE_EL1, ICH_HCR_EL2, ICH_VMCR_EL2, the active-priority and list registers from cpu_if. */
+       struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+       u64 val;
+       u32 max_lr_idx, nr_pri_bits;
+
+       /*
+        * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+        * Group0 interrupt (as generated in GICv2 mode) to be
+        * delivered as a FIQ to the guest, with potentially fatal
+        * consequences. So we must make sure that ICC_SRE_EL1 has
+        * been actually programmed with the value we want before
+        * starting to mess with the rest of the GIC.
+        */
+       write_gicreg(cpu_if->vgic_sre, ICC_SRE_EL1);
+       isb();
+
+       write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+       write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+       val = read_gicreg(ICH_VTR_EL2);
+       max_lr_idx = vtr_to_max_lr_idx(val);
+       nr_pri_bits = vtr_to_nr_pri_bits(val);
+
+       switch (nr_pri_bits) {  /* no breaks: 7 writes AP1R3..AP1R0, 6 writes AP1R1..AP1R0, anything else AP1R0 only */
+       case 7:
+                write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
+                write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+       case 6:
+                write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+       default:
+                write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+       }                                          
+                                                  
+       switch (nr_pri_bits) {  /* same fall-through pattern for the AP0R registers */
+       case 7:
+                write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
+                write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
+       case 6:
+                write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
+       default:
+                write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+       }
+
+       switch (max_lr_idx) {   /* no breaks: entry at max_lr_idx falls through to case 0, writing LR<max>..LR0 */
+       case 15:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(15)], ICH_LR15_EL2);
+       case 14:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(14)], ICH_LR14_EL2);
+       case 13:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(13)], ICH_LR13_EL2);
+       case 12:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(12)], ICH_LR12_EL2);
+       case 11:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(11)], ICH_LR11_EL2);
+       case 10:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(10)], ICH_LR10_EL2);
+       case 9:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(9)], ICH_LR9_EL2);
+       case 8:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(8)], ICH_LR8_EL2);
+       case 7:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(7)], ICH_LR7_EL2);
+       case 6:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(6)], ICH_LR6_EL2);
+       case 5:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(5)], ICH_LR5_EL2);
+       case 4:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(4)], ICH_LR4_EL2);
+       case 3:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(3)], ICH_LR3_EL2);
+       case 2:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(2)], ICH_LR2_EL2);
+       case 1:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(1)], ICH_LR1_EL2);
+       case 0:
+               write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(0)], ICH_LR0_EL2);
+       }
+
+       /*
+        * Ensures that the above will have reached the
+        * (re)distributors. This ensures the guest will read the
+        * correct values from the memory-mapped interface.
+        */
+       isb();
+       dsb(sy);
+
+       /*
+        * Prevent the guest from touching the GIC system registers if
+        * SRE isn't enabled for GICv3 emulation.
+        */
+       if (!cpu_if->vgic_sre) {
+               write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+                            ICC_SRE_EL2);
+       }
+}
+
+static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
+{      /* Return the current ICH_VTR_EL2 value; exported below under the name __vgic_v3_get_ich_vtr_el2. */
+       return read_gicreg(ICH_VTR_EL2);
+}
+
+__alias(__vgic_v3_read_ich_vtr_el2) u64 __vgic_v3_get_ich_vtr_el2(void);
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c

index f34745cb3d236fe0a4731f8d02031f8ff764d69c..d6e155a212dc42fa9696879c710a395fc8be2779 100644 (file)
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -29,7 +29,9 @@
  #include <asm/cputype.h>
  #include <asm/ptrace.h>
  #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
  #include <asm/kvm_coproc.h>
+#include <asm/kvm_mmu.h>
  
  /*
   * ARMv8 Reset Values
@@ -123,3 +125,15 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
         /* Reset timer */
         return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
  }
+
+extern char __hyp_idmap_text_start[];
+
+phys_addr_t kvm_hyp_reset_entry(void)
+{      /* Return TRAMPOLINE_VA plus the offset of __kvm_hyp_reset from the page holding __hyp_idmap_text_start. */
+       unsigned long offset;
+
+       offset = (unsigned long)__kvm_hyp_reset
+                - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); /* PAGE_MASK rounds the start down to its page base */
+
+       return TRAMPOLINE_VA + offset;
+}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c

index d2650e84faf2f53f2afbdbd15e1f54d217e3fdb8..eec3598b4184077b83b5a1f24321891cb110f5bb 100644 (file)
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -29,6 +29,7 @@
  #include <asm/debug-monitors.h>
  #include <asm/esr.h>
  #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
  #include <asm/kvm_coproc.h>
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_host.h>
@@ -219,9 +220,9 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
   * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the
   * hyp.S code switches between host and guest values in future.
   */
-static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
-                             struct sys_reg_params *p,
-                             u64 *dbg_reg)
+static void reg_to_dbg(struct kvm_vcpu *vcpu,
+                      struct sys_reg_params *p,
+                      u64 *dbg_reg)
  {
         u64 val = p->regval;
  
@@ -234,18 +235,18 @@ static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
         vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
  }
  
-static inline void dbg_to_reg(struct kvm_vcpu *vcpu,
-                             struct sys_reg_params *p,
-                             u64 *dbg_reg)
+static void dbg_to_reg(struct kvm_vcpu *vcpu,
+                      struct sys_reg_params *p,
+                      u64 *dbg_reg)
  {
         p->regval = *dbg_reg;
         if (p->is_32bit)
                 p->regval &= 0xffffffffUL;
  }
  
-static inline bool trap_bvr(struct kvm_vcpu *vcpu,
-                           struct sys_reg_params *p,
-                           const struct sys_reg_desc *rd)
+static bool trap_bvr(struct kvm_vcpu *vcpu,
+                    struct sys_reg_params *p,
+                    const struct sys_reg_desc *rd)
  {
         u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
  
@@ -279,15 +280,15 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
         return 0;
  }
  
-static inline void reset_bvr(struct kvm_vcpu *vcpu,
-                            const struct sys_reg_desc *rd)
+static void reset_bvr(struct kvm_vcpu *vcpu,
+                     const struct sys_reg_desc *rd)
  {
         vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
  }
  
-static inline bool trap_bcr(struct kvm_vcpu *vcpu,
-                           struct sys_reg_params *p,
-                           const struct sys_reg_desc *rd)
+static bool trap_bcr(struct kvm_vcpu *vcpu,
+                    struct sys_reg_params *p,
+                    const struct sys_reg_desc *rd)
  {
         u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
  
@@ -322,15 +323,15 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
         return 0;
  }
  
-static inline void reset_bcr(struct kvm_vcpu *vcpu,
-                            const struct sys_reg_desc *rd)
+static void reset_bcr(struct kvm_vcpu *vcpu,
+                     const struct sys_reg_desc *rd)
  {
         vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
  }
  
-static inline bool trap_wvr(struct kvm_vcpu *vcpu,
-                           struct sys_reg_params *p,
-                           const struct sys_reg_desc *rd)
+static bool trap_wvr(struct kvm_vcpu *vcpu,
+                    struct sys_reg_params *p,
+                    const struct sys_reg_desc *rd)
  {
         u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
  
@@ -365,15 +366,15 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
         return 0;
  }
  
-static inline void reset_wvr(struct kvm_vcpu *vcpu,
-                            const struct sys_reg_desc *rd)
+static void reset_wvr(struct kvm_vcpu *vcpu,
+                     const struct sys_reg_desc *rd)
  {
         vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
  }
  
-static inline bool trap_wcr(struct kvm_vcpu *vcpu,
-                           struct sys_reg_params *p,
-                           const struct sys_reg_desc *rd)
+static bool trap_wcr(struct kvm_vcpu *vcpu,
+                    struct sys_reg_params *p,
+                    const struct sys_reg_desc *rd)
  {
         u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
  
@@ -407,8 +408,8 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
         return 0;
  }
  
-static inline void reset_wcr(struct kvm_vcpu *vcpu,
-                            const struct sys_reg_desc *rd)
+static void reset_wcr(struct kvm_vcpu *vcpu,
+                     const struct sys_reg_desc *rd)
  {
         vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
  }
@@ -722,9 +723,9 @@ static bool trap_debug32(struct kvm_vcpu *vcpu,
   * system is in.
   */
  
-static inline bool trap_xvr(struct kvm_vcpu *vcpu,
-                           struct sys_reg_params *p,
-                           const struct sys_reg_desc *rd)
+static bool trap_xvr(struct kvm_vcpu *vcpu,
+                    struct sys_reg_params *p,
+                    const struct sys_reg_desc *rd)
  {
         u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
  
diff --git a/arch/arm64/kvm/vgic-v2-switch.S b/arch/arm64/kvm/vgic-v2-switch.S

deleted file mode 100644 (file)

index 3f00071..0000000
--- a/arch/arm64/kvm/vgic-v2-switch.S
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/linkage.h>
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/assembler.h>
-#include <asm/memory.h>
-#include <asm/asm-offsets.h>
-#include <asm/kvm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-       .text
-       .pushsection    .hyp.text, "ax"
-
-/*
- * Save the VGIC CPU state into memory
- * x0: Register pointing to VCPU struct
- * Do not corrupt x1!!!
- */
-ENTRY(__save_vgic_v2_state)
-__save_vgic_v2_state:
-       /* Get VGIC VCTRL base into x2 */
-       ldr     x2, [x0, #VCPU_KVM]
-       kern_hyp_va     x2
-       ldr     x2, [x2, #KVM_VGIC_VCTRL]
-       kern_hyp_va     x2
-       cbz     x2, 2f          // disabled
-
-       /* Compute the address of struct vgic_cpu */
-       add     x3, x0, #VCPU_VGIC_CPU
-
-       /* Save all interesting registers */
-       ldr     w5, [x2, #GICH_VMCR]
-       ldr     w6, [x2, #GICH_MISR]
-       ldr     w7, [x2, #GICH_EISR0]
-       ldr     w8, [x2, #GICH_EISR1]
-       ldr     w9, [x2, #GICH_ELRSR0]
-       ldr     w10, [x2, #GICH_ELRSR1]
-       ldr     w11, [x2, #GICH_APR]
-CPU_BE(        rev     w5,  w5  )
-CPU_BE(        rev     w6,  w6  )
-CPU_BE(        rev     w7,  w7  )
-CPU_BE(        rev     w8,  w8  )
-CPU_BE(        rev     w9,  w9  )
-CPU_BE(        rev     w10, w10 )
-CPU_BE(        rev     w11, w11 )
-
-       str     w5, [x3, #VGIC_V2_CPU_VMCR]
-       str     w6, [x3, #VGIC_V2_CPU_MISR]
-CPU_LE(        str     w7, [x3, #VGIC_V2_CPU_EISR] )
-CPU_LE(        str     w8, [x3, #(VGIC_V2_CPU_EISR + 4)] )
-CPU_LE(        str     w9, [x3, #VGIC_V2_CPU_ELRSR] )
-CPU_LE(        str     w10, [x3, #(VGIC_V2_CPU_ELRSR + 4)] )
-CPU_BE(        str     w7, [x3, #(VGIC_V2_CPU_EISR + 4)] )
-CPU_BE(        str     w8, [x3, #VGIC_V2_CPU_EISR] )
-CPU_BE(        str     w9, [x3, #(VGIC_V2_CPU_ELRSR + 4)] )
-CPU_BE(        str     w10, [x3, #VGIC_V2_CPU_ELRSR] )
-       str     w11, [x3, #VGIC_V2_CPU_APR]
-
-       /* Clear GICH_HCR */
-       str     wzr, [x2, #GICH_HCR]
-
-       /* Save list registers */
-       add     x2, x2, #GICH_LR0
-       ldr     w4, [x3, #VGIC_CPU_NR_LR]
-       add     x3, x3, #VGIC_V2_CPU_LR
-1:     ldr     w5, [x2], #4
-CPU_BE(        rev     w5, w5 )
-       str     w5, [x3], #4
-       sub     w4, w4, #1
-       cbnz    w4, 1b
-2:
-       ret
-ENDPROC(__save_vgic_v2_state)
-
-/*
- * Restore the VGIC CPU state from memory
- * x0: Register pointing to VCPU struct
- */
-ENTRY(__restore_vgic_v2_state)
-__restore_vgic_v2_state:
-       /* Get VGIC VCTRL base into x2 */
-       ldr     x2, [x0, #VCPU_KVM]
-       kern_hyp_va     x2
-       ldr     x2, [x2, #KVM_VGIC_VCTRL]
-       kern_hyp_va     x2
-       cbz     x2, 2f          // disabled
-
-       /* Compute the address of struct vgic_cpu */
-       add     x3, x0, #VCPU_VGIC_CPU
-
-       /* We only restore a minimal set of registers */
-       ldr     w4, [x3, #VGIC_V2_CPU_HCR]
-       ldr     w5, [x3, #VGIC_V2_CPU_VMCR]
-       ldr     w6, [x3, #VGIC_V2_CPU_APR]
-CPU_BE(        rev     w4, w4 )
-CPU_BE(        rev     w5, w5 )
-CPU_BE(        rev     w6, w6 )
-
-       str     w4, [x2, #GICH_HCR]
-       str     w5, [x2, #GICH_VMCR]
-       str     w6, [x2, #GICH_APR]
-
-       /* Restore list registers */
-       add     x2, x2, #GICH_LR0
-       ldr     w4, [x3, #VGIC_CPU_NR_LR]
-       add     x3, x3, #VGIC_V2_CPU_LR
-1:     ldr     w5, [x3], #4
-CPU_BE(        rev     w5, w5 )
-       str     w5, [x2], #4
-       sub     w4, w4, #1
-       cbnz    w4, 1b
-2:
-       ret
-ENDPROC(__restore_vgic_v2_state)
-
-       .popsection
diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S

deleted file mode 100644 (file)

index 3c20730..0000000
--- a/arch/arm64/kvm/vgic-v3-switch.S
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/linkage.h>
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include <asm/assembler.h>
-#include <asm/memory.h>
-#include <asm/asm-offsets.h>
-#include <asm/kvm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_arm.h>
-
-       .text
-       .pushsection    .hyp.text, "ax"
-
-/*
- * We store LRs in reverse order to let the CPU deal with streaming
- * access. Use this macro to make it look saner...
- */
-#define LR_OFFSET(n)   (VGIC_V3_CPU_LR + (15 - n) * 8)
-
-/*
- * Save the VGIC CPU state into memory
- * x0: Register pointing to VCPU struct
- * Do not corrupt x1!!!
- */
-.macro save_vgic_v3_state
-       // Compute the address of struct vgic_cpu
-       add     x3, x0, #VCPU_VGIC_CPU
-
-       // Make sure stores to the GIC via the memory mapped interface
-       // are now visible to the system register interface
-       dsb     st
-
-       // Save all interesting registers
-       mrs_s   x5, ICH_VMCR_EL2
-       mrs_s   x6, ICH_MISR_EL2
-       mrs_s   x7, ICH_EISR_EL2
-       mrs_s   x8, ICH_ELSR_EL2
-
-       str     w5, [x3, #VGIC_V3_CPU_VMCR]
-       str     w6, [x3, #VGIC_V3_CPU_MISR]
-       str     w7, [x3, #VGIC_V3_CPU_EISR]
-       str     w8, [x3, #VGIC_V3_CPU_ELRSR]
-
-       msr_s   ICH_HCR_EL2, xzr
-
-       mrs_s   x21, ICH_VTR_EL2
-       mvn     w22, w21
-       ubfiz   w23, w22, 2, 4  // w23 = (15 - ListRegs) * 4
-
-       adr     x24, 1f
-       add     x24, x24, x23
-       br      x24
-
-1:
-       mrs_s   x20, ICH_LR15_EL2
-       mrs_s   x19, ICH_LR14_EL2
-       mrs_s   x18, ICH_LR13_EL2
-       mrs_s   x17, ICH_LR12_EL2
-       mrs_s   x16, ICH_LR11_EL2
-       mrs_s   x15, ICH_LR10_EL2
-       mrs_s   x14, ICH_LR9_EL2
-       mrs_s   x13, ICH_LR8_EL2
-       mrs_s   x12, ICH_LR7_EL2
-       mrs_s   x11, ICH_LR6_EL2
-       mrs_s   x10, ICH_LR5_EL2
-       mrs_s   x9, ICH_LR4_EL2
-       mrs_s   x8, ICH_LR3_EL2
-       mrs_s   x7, ICH_LR2_EL2
-       mrs_s   x6, ICH_LR1_EL2
-       mrs_s   x5, ICH_LR0_EL2
-
-       adr     x24, 1f
-       add     x24, x24, x23
-       br      x24
-
-1:
-       str     x20, [x3, #LR_OFFSET(15)]
-       str     x19, [x3, #LR_OFFSET(14)]
-       str     x18, [x3, #LR_OFFSET(13)]
-       str     x17, [x3, #LR_OFFSET(12)]
-       str     x16, [x3, #LR_OFFSET(11)]
-       str     x15, [x3, #LR_OFFSET(10)]
-       str     x14, [x3, #LR_OFFSET(9)]
-       str     x13, [x3, #LR_OFFSET(8)]
-       str     x12, [x3, #LR_OFFSET(7)]
-       str     x11, [x3, #LR_OFFSET(6)]
-       str     x10, [x3, #LR_OFFSET(5)]
-       str     x9, [x3, #LR_OFFSET(4)]
-       str     x8, [x3, #LR_OFFSET(3)]
-       str     x7, [x3, #LR_OFFSET(2)]
-       str     x6, [x3, #LR_OFFSET(1)]
-       str     x5, [x3, #LR_OFFSET(0)]
-
-       tbnz    w21, #29, 6f    // 6 bits
-       tbz     w21, #30, 5f    // 5 bits
-                               // 7 bits
-       mrs_s   x20, ICH_AP0R3_EL2
-       str     w20, [x3, #(VGIC_V3_CPU_AP0R + 3*4)]
-       mrs_s   x19, ICH_AP0R2_EL2
-       str     w19, [x3, #(VGIC_V3_CPU_AP0R + 2*4)]
-6:     mrs_s   x18, ICH_AP0R1_EL2
-       str     w18, [x3, #(VGIC_V3_CPU_AP0R + 1*4)]
-5:     mrs_s   x17, ICH_AP0R0_EL2
-       str     w17, [x3, #VGIC_V3_CPU_AP0R]
-
-       tbnz    w21, #29, 6f    // 6 bits
-       tbz     w21, #30, 5f    // 5 bits
-                               // 7 bits
-       mrs_s   x20, ICH_AP1R3_EL2
-       str     w20, [x3, #(VGIC_V3_CPU_AP1R + 3*4)]
-       mrs_s   x19, ICH_AP1R2_EL2
-       str     w19, [x3, #(VGIC_V3_CPU_AP1R + 2*4)]
-6:     mrs_s   x18, ICH_AP1R1_EL2
-       str     w18, [x3, #(VGIC_V3_CPU_AP1R + 1*4)]
-5:     mrs_s   x17, ICH_AP1R0_EL2
-       str     w17, [x3, #VGIC_V3_CPU_AP1R]
-
-       // Restore SRE_EL1 access and re-enable SRE at EL1.
-       mrs_s   x5, ICC_SRE_EL2
-       orr     x5, x5, #ICC_SRE_EL2_ENABLE
-       msr_s   ICC_SRE_EL2, x5
-       isb
-       mov     x5, #1
-       msr_s   ICC_SRE_EL1, x5
-.endm
-
-/*
- * Restore the VGIC CPU state from memory
- * x0: Register pointing to VCPU struct
- */
-.macro restore_vgic_v3_state
-       // Compute the address of struct vgic_cpu
-       add     x3, x0, #VCPU_VGIC_CPU
-
-       // Restore all interesting registers
-       ldr     w4, [x3, #VGIC_V3_CPU_HCR]
-       ldr     w5, [x3, #VGIC_V3_CPU_VMCR]
-       ldr     w25, [x3, #VGIC_V3_CPU_SRE]
-
-       msr_s   ICC_SRE_EL1, x25
-
-       // make sure SRE is valid before writing the other registers
-       isb
-
-       msr_s   ICH_HCR_EL2, x4
-       msr_s   ICH_VMCR_EL2, x5
-
-       mrs_s   x21, ICH_VTR_EL2
-
-       tbnz    w21, #29, 6f    // 6 bits
-       tbz     w21, #30, 5f    // 5 bits
-                               // 7 bits
-       ldr     w20, [x3, #(VGIC_V3_CPU_AP1R + 3*4)]
-       msr_s   ICH_AP1R3_EL2, x20
-       ldr     w19, [x3, #(VGIC_V3_CPU_AP1R + 2*4)]
-       msr_s   ICH_AP1R2_EL2, x19
-6:     ldr     w18, [x3, #(VGIC_V3_CPU_AP1R + 1*4)]
-       msr_s   ICH_AP1R1_EL2, x18
-5:     ldr     w17, [x3, #VGIC_V3_CPU_AP1R]
-       msr_s   ICH_AP1R0_EL2, x17
-
-       tbnz    w21, #29, 6f    // 6 bits
-       tbz     w21, #30, 5f    // 5 bits
-                               // 7 bits
-       ldr     w20, [x3, #(VGIC_V3_CPU_AP0R + 3*4)]
-       msr_s   ICH_AP0R3_EL2, x20
-       ldr     w19, [x3, #(VGIC_V3_CPU_AP0R + 2*4)]
-       msr_s   ICH_AP0R2_EL2, x19
-6:     ldr     w18, [x3, #(VGIC_V3_CPU_AP0R + 1*4)]
-       msr_s   ICH_AP0R1_EL2, x18
-5:     ldr     w17, [x3, #VGIC_V3_CPU_AP0R]
-       msr_s   ICH_AP0R0_EL2, x17
-
-       and     w22, w21, #0xf
-       mvn     w22, w21
-       ubfiz   w23, w22, 2, 4  // w23 = (15 - ListRegs) * 4
-
-       adr     x24, 1f
-       add     x24, x24, x23
-       br      x24
-
-1:
-       ldr     x20, [x3, #LR_OFFSET(15)]
-       ldr     x19, [x3, #LR_OFFSET(14)]
-       ldr     x18, [x3, #LR_OFFSET(13)]
-       ldr     x17, [x3, #LR_OFFSET(12)]
-       ldr     x16, [x3, #LR_OFFSET(11)]
-       ldr     x15, [x3, #LR_OFFSET(10)]
-       ldr     x14, [x3, #LR_OFFSET(9)]
-       ldr     x13, [x3, #LR_OFFSET(8)]
-       ldr     x12, [x3, #LR_OFFSET(7)]
-       ldr     x11, [x3, #LR_OFFSET(6)]
-       ldr     x10, [x3, #LR_OFFSET(5)]
-       ldr     x9, [x3, #LR_OFFSET(4)]
-       ldr     x8, [x3, #LR_OFFSET(3)]
-       ldr     x7, [x3, #LR_OFFSET(2)]
-       ldr     x6, [x3, #LR_OFFSET(1)]
-       ldr     x5, [x3, #LR_OFFSET(0)]
-
-       adr     x24, 1f
-       add     x24, x24, x23
-       br      x24
-
-1:
-       msr_s   ICH_LR15_EL2, x20
-       msr_s   ICH_LR14_EL2, x19
-       msr_s   ICH_LR13_EL2, x18
-       msr_s   ICH_LR12_EL2, x17
-       msr_s   ICH_LR11_EL2, x16
-       msr_s   ICH_LR10_EL2, x15
-       msr_s   ICH_LR9_EL2,  x14
-       msr_s   ICH_LR8_EL2,  x13
-       msr_s   ICH_LR7_EL2,  x12
-       msr_s   ICH_LR6_EL2,  x11
-       msr_s   ICH_LR5_EL2,  x10
-       msr_s   ICH_LR4_EL2,   x9
-       msr_s   ICH_LR3_EL2,   x8
-       msr_s   ICH_LR2_EL2,   x7
-       msr_s   ICH_LR1_EL2,   x6
-       msr_s   ICH_LR0_EL2,   x5
-
-       // Ensure that the above will have reached the
-       // (re)distributors. This ensure the guest will read
-       // the correct values from the memory-mapped interface.
-       isb
-       dsb     sy
-
-       // Prevent the guest from touching the GIC system registers
-       // if SRE isn't enabled for GICv3 emulation
-       cbnz    x25, 1f
-       mrs_s   x5, ICC_SRE_EL2
-       and     x5, x5, #~ICC_SRE_EL2_ENABLE
-       msr_s   ICC_SRE_EL2, x5
-1:
-.endm
-
-ENTRY(__save_vgic_v3_state)
-       save_vgic_v3_state
-       ret
-ENDPROC(__save_vgic_v3_state)
-
-ENTRY(__restore_vgic_v3_state)
-       restore_vgic_v3_state
-       ret
-ENDPROC(__restore_vgic_v3_state)
-
-ENTRY(__vgic_v3_get_ich_vtr_el2)
-       mrs_s   x0, ICH_VTR_EL2
-       ret
-ENDPROC(__vgic_v3_get_ich_vtr_el2)
-
-       .popsection
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile

index 1a811ecf71da8a8032a1e8cda8cf686f8fc71189..c86b7909ef312009028c46ba83b375b544d9ae84 100644 (file)
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -4,15 +4,16 @@ lib-y         := bitops.o clear_user.o delay.o copy_from_user.o       \
                    memcmp.o strcmp.o strncmp.o strlen.o strnlen.o       \
                    strchr.o strrchr.o
  
-# Tell the compiler to treat all general purpose registers as
-# callee-saved, which allows for efficient runtime patching of the bl
-# instruction in the caller with an atomic instruction when supported by
-# the CPU. Result and argument registers are handled correctly, based on
-# the function prototype.
+# Tell the compiler to treat all general purpose registers (with the
+# exception of the IP registers, which are already handled by the caller
+# in case of a PLT) as callee-saved, which allows for efficient runtime
+# patching of the bl instruction in the caller with an atomic instruction
+# when supported by the CPU. Result and argument registers are handled
+# correctly, based on the function prototype.
  lib-$(CONFIG_ARM64_LSE_ATOMICS) += atomic_ll_sc.o
  CFLAGS_atomic_ll_sc.o  := -fcall-used-x0 -ffixed-x1 -ffixed-x2         \
                    -ffixed-x3 -ffixed-x4 -ffixed-x5 -ffixed-x6          \
                    -ffixed-x7 -fcall-saved-x8 -fcall-saved-x9           \
                    -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12   \
                    -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15   \
-                  -fcall-saved-x16 -fcall-saved-x17 -fcall-saved-x18
+                  -fcall-saved-x18
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S

index a9723c71c52b20adf4e2efe69b2e43163cb2c878..5d1cad3ce6d601aa474ae9c9b8ef4c76a785912e 100644 (file)
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -33,28 +33,28 @@
   * Alignment fixed up by hardware.
   */
  ENTRY(__clear_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         mov     x2, x1                  // save the size for fixup return
         subs    x1, x1, #8
         b.mi    2f
  1:
-USER(9f, str   xzr, [x0], #8   )
+uao_user_alternative 9f, str, sttr, xzr, x0, 8
         subs    x1, x1, #8
         b.pl    1b
  2:     adds    x1, x1, #4
         b.mi    3f
-USER(9f, str   wzr, [x0], #4   )
+uao_user_alternative 9f, str, sttr, wzr, x0, 4
         sub     x1, x1, #4
  3:     adds    x1, x1, #2
         b.mi    4f
-USER(9f, strh  wzr, [x0], #2   )
+uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
         sub     x1, x1, #2
  4:     adds    x1, x1, #1
         b.mi    5f
-USER(9f, strb  wzr, [x0]       )
+uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
  5:     mov     x0, #0
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         ret
  ENDPROC(__clear_user)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S

index 4699cd74f87e4af7bf69da8ce48a88a7f4f69b74..0b90497d4424c59d0a9ce2dcf5642012f452d3c8 100644 (file)
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -34,7 +34,7 @@
   */
  
         .macro ldrb1 ptr, regB, val
-       USER(9998f, ldrb  \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
         .endm
  
         .macro strb1 ptr, regB, val
@@ -42,7 +42,7 @@
         .endm
  
         .macro ldrh1 ptr, regB, val
-       USER(9998f, ldrh  \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
         .endm
  
         .macro strh1 ptr, regB, val
@@ -50,7 +50,7 @@
         .endm
  
         .macro ldr1 ptr, regB, val
-       USER(9998f, ldr \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
         .endm
  
         .macro str1 ptr, regB, val
@@ -58,7 +58,7 @@
         .endm
  
         .macro ldp1 ptr, regB, regC, val
-       USER(9998f, ldp \ptr, \regB, [\regC], \val)
+       uao_ldp 9998f, \ptr, \regB, \regC, \val
         .endm
  
         .macro stp1 ptr, regB, regC, val
@@ -66,16 +66,16 @@
         .endm
  
  end    .req    x5
-ENTRY(__copy_from_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
+ENTRY(__arch_copy_from_user)
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         add     end, x0, x2
  #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         mov     x0, #0                          // Nothing to copy
         ret
-ENDPROC(__copy_from_user)
+ENDPROC(__arch_copy_from_user)
  
         .section .fixup,"ax"
         .align  2
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S

index 81c8fc93c100b7be7da17ebf96b1edeeb806671f..f7292dd08c840f27d39874fe7cc08aa89bdfb66d 100644 (file)
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -35,44 +35,44 @@
   *     x0 - bytes not copied
   */
         .macro ldrb1 ptr, regB, val
-       USER(9998f, ldrb  \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
         .endm
  
         .macro strb1 ptr, regB, val
-       USER(9998f, strb \ptr, [\regB], \val)
+       uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
         .endm
  
         .macro ldrh1 ptr, regB, val
-       USER(9998f, ldrh  \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
         .endm
  
         .macro strh1 ptr, regB, val
-       USER(9998f, strh \ptr, [\regB], \val)
+       uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
         .endm
  
         .macro ldr1 ptr, regB, val
-       USER(9998f, ldr \ptr, [\regB], \val)
+       uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
         .endm
  
         .macro str1 ptr, regB, val
-       USER(9998f, str \ptr, [\regB], \val)
+       uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
         .endm
  
         .macro ldp1 ptr, regB, regC, val
-       USER(9998f, ldp \ptr, \regB, [\regC], \val)
+       uao_ldp 9998f, \ptr, \regB, \regC, \val
         .endm
  
         .macro stp1 ptr, regB, regC, val
-       USER(9998f, stp \ptr, \regB, [\regC], \val)
+       uao_stp 9998f, \ptr, \regB, \regC, \val
         .endm
  
  end    .req    x5
  ENTRY(__copy_in_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         add     end, x0, x2
  #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         mov     x0, #0
         ret
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S

index 512b9a7b980e98bbed9a699107e936e4b1913dca..4c1e700840b6ced5a0b2f868bfb4f37dddc8abc0 100644 (file)
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -18,6 +18,8 @@
  #include <linux/const.h>
  #include <asm/assembler.h>
  #include <asm/page.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
  
  /*
   * Copy a page from src to dest (both are page aligned)
@@ -27,20 +29,65 @@
   *     x1 - src
   */
  ENTRY(copy_page)
-       /* Assume cache line size is 64 bytes. */
-       prfm    pldl1strm, [x1, #64]
-1:     ldp     x2, x3, [x1]
+alternative_if_not ARM64_HAS_NO_HW_PREFETCH
+       nop
+       nop
+alternative_else
+       # Prefetch two cache lines ahead.
+       prfm    pldl1strm, [x1, #128]
+       prfm    pldl1strm, [x1, #256]
+alternative_endif
+
+       ldp     x2, x3, [x1]
         ldp     x4, x5, [x1, #16]
         ldp     x6, x7, [x1, #32]
         ldp     x8, x9, [x1, #48]
-       add     x1, x1, #64
-       prfm    pldl1strm, [x1, #64]
+       ldp     x10, x11, [x1, #64]
+       ldp     x12, x13, [x1, #80]
+       ldp     x14, x15, [x1, #96]
+       ldp     x16, x17, [x1, #112]
+
+       mov     x18, #(PAGE_SIZE - 128)
+       add     x1, x1, #128
+1:
+       subs    x18, x18, #128
+
+alternative_if_not ARM64_HAS_NO_HW_PREFETCH
+       nop
+alternative_else
+       prfm    pldl1strm, [x1, #384]
+alternative_endif
+
         stnp    x2, x3, [x0]
+       ldp     x2, x3, [x1]
         stnp    x4, x5, [x0, #16]
+       ldp     x4, x5, [x1, #16]
         stnp    x6, x7, [x0, #32]
+       ldp     x6, x7, [x1, #32]
         stnp    x8, x9, [x0, #48]
-       add     x0, x0, #64
-       tst     x1, #(PAGE_SIZE - 1)
-       b.ne    1b
+       ldp     x8, x9, [x1, #48]
+       stnp    x10, x11, [x0, #64]
+       ldp     x10, x11, [x1, #64]
+       stnp    x12, x13, [x0, #80]
+       ldp     x12, x13, [x1, #80]
+       stnp    x14, x15, [x0, #96]
+       ldp     x14, x15, [x1, #96]
+       stnp    x16, x17, [x0, #112]
+       ldp     x16, x17, [x1, #112]
+
+       add     x0, x0, #128
+       add     x1, x1, #128
+
+       b.gt    1b
+
+       stnp    x2, x3, [x0]
+       stnp    x4, x5, [x0, #16]
+       stnp    x6, x7, [x0, #32]
+       stnp    x8, x9, [x0, #48]
+       stnp    x10, x11, [x0, #64]
+       stnp    x12, x13, [x0, #80]
+       stnp    x14, x15, [x0, #96]
+       stnp    x16, x17, [x0, #112]
+
         ret
  ENDPROC(copy_page)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S

index 7512bbbc07ac39dbe8c963745281f25c2d60efa4..7a7efe25503452bdfe8e4108b5f4aa0ad9495da5 100644 (file)
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -37,7 +37,7 @@
         .endm
  
         .macro strb1 ptr, regB, val
-       USER(9998f, strb \ptr, [\regB], \val)
+       uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
         .endm
  
         .macro ldrh1 ptr, regB, val
@@ -45,7 +45,7 @@
         .endm
  
         .macro strh1 ptr, regB, val
-       USER(9998f, strh \ptr, [\regB], \val)
+       uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
         .endm
  
         .macro ldr1 ptr, regB, val
@@ -53,7 +53,7 @@
         .endm
  
         .macro str1 ptr, regB, val
-       USER(9998f, str \ptr, [\regB], \val)
+       uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
         .endm
  
         .macro ldp1 ptr, regB, regC, val
@@ -61,20 +61,20 @@
         .endm
  
         .macro stp1 ptr, regB, regC, val
-       USER(9998f, stp \ptr, \regB, [\regC], \val)
+       uao_stp 9998f, \ptr, \regB, \regC, \val
         .endm
  
  end    .req    x5
-ENTRY(__copy_to_user)
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
+ENTRY(__arch_copy_to_user)
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         add     end, x0, x2
  #include "copy_template.S"
-ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
+ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
             CONFIG_ARM64_PAN)
         mov     x0, #0
         ret
-ENDPROC(__copy_to_user)
+ENDPROC(__arch_copy_to_user)
  
         .section .fixup,"ax"
         .align  2
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S

index cfa44a6adc0ad5ec29f78228196b7e834b65df40..50ff9ba3a2367283e0340bb082c48f24678a5daa 100644 (file)
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,8 +24,6 @@
  #include <asm/cpufeature.h>
  #include <asm/alternative.h>
  
-#include "proc-macros.S"
-
  /*
   *     flush_icache_range(start,end)
   *
@@ -81,25 +79,31 @@ ENDPROC(__flush_cache_user_range)
  /*
   *     __flush_dcache_area(kaddr, size)
   *
- *     Ensure that the data held in the page kaddr is written back to the
- *     page in question.
+ *     Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ *     are cleaned and invalidated to the PoC.
   *
   *     - kaddr   - kernel address
   *     - size    - size in question
   */
  ENTRY(__flush_dcache_area)
-       dcache_line_size x2, x3
-       add     x1, x0, x1
-       sub     x3, x2, #1
-       bic     x0, x0, x3
-1:     dc      civac, x0                       // clean & invalidate D line / unified line
-       add     x0, x0, x2
-       cmp     x0, x1
-       b.lo    1b
-       dsb     sy
+       dcache_by_line_op civac, sy, x0, x1, x2, x3
         ret
  ENDPIPROC(__flush_dcache_area)
  
+/*
+ *     __clean_dcache_area_pou(kaddr, size)
+ *
+ *     Ensure that any D-cache lines for the interval [kaddr, kaddr+size)
+ *     are cleaned to the PoU.
+ *
+ *     - kaddr   - kernel address
+ *     - size    - size in question
+ */
+ENTRY(__clean_dcache_area_pou)
+       dcache_by_line_op cvau, ish, x0, x1, x2, x3
+       ret
+ENDPROC(__clean_dcache_area_pou)
+
  /*
   *     __inval_cache_range(start, end)
   *     - start   - start address of region
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c

index e87f53ff5f583aeb47b3ec3187d229df380b2c50..7275628ba59f663489f6f9403d46ca8a5050c6f7 100644 (file)
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -187,7 +187,7 @@ switch_mm_fastpath:
  
  static int asids_init(void)
  {
-       int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4);
+       int fld = cpuid_feature_extract_field(read_cpuid(SYS_ID_AA64MMFR0_EL1), 4);
  
         switch (fld) {
         default:
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c

index 13bbc3be6f5ab31a24d6d0a03b8f368ea6923ed8..22e4cb4d6f538baa43f7071ad1729dec01216d23 100644 (file)
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -24,8 +24,9 @@
  
  void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
  {
+       struct page *page = virt_to_page(kto);
         copy_page(kto, kfrom);
-       __flush_dcache_area(kto, PAGE_SIZE);
+       flush_dcache_page(page);
  }
  EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
  
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c

index 354144e3321843ec1e34993c5a0b37068053e1c9..a6e757cbab7785ed411e919b95c6d13caaf21726 100644 (file)
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -40,7 +40,7 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
  static struct gen_pool *atomic_pool;
  
  #define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
-static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
  
  static int __init early_coherent_pool(char *p)
  {
@@ -896,7 +896,7 @@ static int __iommu_attach_notifier(struct notifier_block *nb,
         return 0;
  }
  
-static int register_iommu_dma_ops_notifier(struct bus_type *bus)
+static int __init register_iommu_dma_ops_notifier(struct bus_type *bus)
  {
         struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
         int ret;
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c

index 5a22a119a74c87b4b5b54e114701b3c6eed233e6..6be918478f855021fee88a50fefe1b6642252c81 100644 (file)
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -35,7 +35,9 @@ struct addr_marker {
  };
  
  enum address_markers_idx {
-       VMALLOC_START_NR = 0,
+       MODULES_START_NR = 0,
+       MODULES_END_NR,
+       VMALLOC_START_NR,
         VMALLOC_END_NR,
  #ifdef CONFIG_SPARSEMEM_VMEMMAP
         VMEMMAP_START_NR,
@@ -45,12 +47,12 @@ enum address_markers_idx {
         FIXADDR_END_NR,
         PCI_START_NR,
         PCI_END_NR,
-       MODULES_START_NR,
-       MODUELS_END_NR,
         KERNEL_SPACE_NR,
  };
  
  static struct addr_marker address_markers[] = {
+       { MODULES_VADDR,        "Modules start" },
+       { MODULES_END,          "Modules end" },
         { VMALLOC_START,        "vmalloc() Area" },
         { VMALLOC_END,          "vmalloc() End" },
  #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -61,9 +63,7 @@ static struct addr_marker address_markers[] = {
         { FIXADDR_TOP,          "Fixmap end" },
         { PCI_IO_START,         "PCI I/O start" },
         { PCI_IO_END,           "PCI I/O end" },
-       { MODULES_VADDR,        "Modules start" },
-       { MODULES_END,          "Modules end" },
-       { PAGE_OFFSET,          "Kernel Mapping" },
+       { PAGE_OFFSET,          "Linear Mapping" },
         { -1,                   NULL },
  };
  
@@ -90,6 +90,11 @@ struct prot_bits {
  
  static const struct prot_bits pte_bits[] = {
         {
+               .mask   = PTE_VALID,
+               .val    = PTE_VALID,
+               .set    = " ",
+               .clear  = "F",
+       }, {
                 .mask   = PTE_USER,
                 .val    = PTE_USER,
                 .set    = "USR",
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c

index 79444279ba8c674316e34cfe0861c42021fa92f3..81acd4706878f85d8821f0ff924bff05adc31c97 100644 (file)
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -11,7 +11,7 @@ int fixup_exception(struct pt_regs *regs)
  
         fixup = search_exception_tables(instruction_pointer(regs));
         if (fixup)
-               regs->pc = fixup->fixup;
+               regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
  
         return fixup != NULL;
  }
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c

index 247bae758e1e7ec37b34f26d92e63d5e96745e62..18e5a2c3d5546ff0bf7c1e809d2817ad53ac348a 100644 (file)
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -43,6 +43,28 @@
  
  static const char *fault_name(unsigned int esr);
  
+#ifdef CONFIG_KPROBES
+/*
+ * Offer a kernel-mode fault to the kprobes fault handler.
+ *
+ * Returns 1 when a kprobe is running and kprobe_fault_handler() reports a
+ * non-zero result for (@regs, @esr); returns 0 otherwise.  Faults taken
+ * from user mode are never offered and always yield 0.
+ */
+static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+{
+       int handled;
+
+       if (user_mode(regs))
+               return 0;
+
+       /* kprobe_running() needs smp_processor_id() */
+       preempt_disable();
+       handled = kprobe_running() && kprobe_fault_handler(regs, esr);
+       preempt_enable();
+
+       return handled;
+}
+#else
+/* Built without CONFIG_KPROBES: nothing to notify, always 0. */
+static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+{
+       return 0;
+}
+#endif
+
  /*
   * Dump out the page tables associated with 'addr' in mm 'mm'.
   */
@@ -244,6 +266,14 @@ out:
         return fault;
  }
  
+/*
+ * Test whether @esr describes a permission fault: its exception class field
+ * must equal ESR_ELx_EC_DABT_CUR and its fault status type must equal
+ * ESR_ELx_FSC_PERM.  Returns 1 when both hold, 0 otherwise.
+ */
+static inline int permission_fault(unsigned int esr)
+{
+       if (((esr & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) != ESR_ELx_EC_DABT_CUR)
+               return 0;
+
+       return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM;
+}
+
  static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                                    struct pt_regs *regs)
  {
@@ -253,6 +283,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
         unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
         unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
  
+       if (notify_page_fault(regs, esr))
+               return 0;
+
         tsk = current;
         mm  = tsk->mm;
  
@@ -277,12 +310,13 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                 mm_flags |= FAULT_FLAG_WRITE;
         }
  
-       /*
-        * PAN bit set implies the fault happened in kernel space, but not
-        * in the arch's user access functions.
-        */
-       if (IS_ENABLED(CONFIG_ARM64_PAN) && (regs->pstate & PSR_PAN_BIT))
-               goto no_context;
+       if (permission_fault(esr) && (addr < USER_DS)) {
+               if (get_fs() == KERNEL_DS)
+                       die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
+
+               if (!search_exception_tables(regs->pc))
+                       die("Accessing user space memory outside uaccess.h routines", regs, esr);
+       }
  
         /*
          * As per x86, we may deadlock here. However, since the kernel only
@@ -606,6 +640,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr,
  
         return 0;
  }
+NOKPROBE_SYMBOL(do_debug_exception);
  
  #ifdef CONFIG_ARM64_PAN
  int cpu_enable_pan(void *__unused)
@@ -621,3 +656,16 @@ int cpu_enable_pan(void *__unused)
         return 0;
  }
  #endif /* CONFIG_ARM64_PAN */
+
+#ifdef CONFIG_ARM64_UAO
+/*
+ * Kernel threads have fs=KERNEL_DS by default, and don't need to call
+ * set_fs(), devtmpfs in particular relies on this behaviour.
+ * We need to enable the feature at runtime (instead of adding it to
+ * PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
+ *
+ * The argument is unused.  Always returns 0, like cpu_enable_pan().
+ */
+int cpu_enable_uao(void *__unused)
+{
+       asm(SET_PSTATE_UAO(1));
+       /* non-void function: falling off the end would return garbage */
+       return 0;
+}
+#endif /* CONFIG_ARM64_UAO */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c

index c26b804015e80c46e1380d0a1af7f8f439c55405..46649d6e6c5a5608caa84015d3ce4f09d3d47eee 100644 (file)
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
                 __flush_icache_all();
  }
  
+/*
+ * Bring the instruction cache up to date for the @len bytes at kernel
+ * address @kaddr.
+ *
+ * When icache_is_aliasing() reports true, the range is cleaned with
+ * __clean_dcache_area_pou() and then __flush_icache_all() is called.
+ * Otherwise flush_icache_range() is called on [kaddr, kaddr + len).
+ */
+static void sync_icache_aliases(void *kaddr, unsigned long len)
+{
+       unsigned long start;
+
+       if (!icache_is_aliasing()) {
+               start = (unsigned long)kaddr;
+               flush_icache_range(start, start + len);
+               return;
+       }
+
+       __clean_dcache_area_pou(kaddr, len);
+       __flush_icache_all();
+}
+
  static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
                                 unsigned long uaddr, void *kaddr,
                                 unsigned long len)
  {
-       if (vma->vm_flags & VM_EXEC) {
-               unsigned long addr = (unsigned long)kaddr;
-               if (icache_is_aliasing()) {
-                       __flush_dcache_area(kaddr, len);
-                       __flush_icache_all();
-               } else {
-                       flush_icache_range(addr, addr + len);
-               }
-       }
+       if (vma->vm_flags & VM_EXEC)
+               sync_icache_aliases(kaddr, len);
  }
  
  /*
@@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr)
         if (!page_mapping(page))
                 return;
  
-       if (!test_and_set_bit(PG_dcache_clean, &page->flags)) {
-               __flush_dcache_area(page_address(page),
-                               PAGE_SIZE << compound_order(page));
+       if (!test_and_set_bit(PG_dcache_clean, &page->flags))
+               sync_icache_aliases(page_address(page),
+                                   PAGE_SIZE << compound_order(page));
+       else if (icache_is_aivivt())
                 __flush_icache_all();
-       } else if (icache_is_aivivt()) {
-               __flush_icache_all();
-       }
  }
  
  /*
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c

index 383b03ff38f850a0b0a000ee4e6450fd6649c7db..da30529bb1f65c9e3d5408b2e28ab31bc2283211 100644 (file)
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -41,15 +41,273 @@ int pud_huge(pud_t pud)
  #endif
  }
  
+/*
+ * Work out how many consecutive table entries make up the mapping that
+ * @ptep/@pte belongs to, and how much memory each of those entries maps.
+ *
+ * @mm/@addr: used to walk pgd -> pud -> pmd for the address
+ * @ptep:     pointer to the entry being examined
+ * @pte:      value of that entry
+ * @pgsize:   out: PAGE_SIZE, or PMD_SIZE when @ptep is the pmd entry itself
+ *
+ * Returns 1 when @pte does not have the contiguous bit set, or when any
+ * level of the walk is not present (VM_BUG_ON fires first in that case);
+ * CONT_PMDS when @ptep is the pmd entry for @addr; CONT_PTES otherwise.
+ */
+static int find_num_contig(struct mm_struct *mm, unsigned long addr,
+                          pte_t *ptep, pte_t pte, size_t *pgsize)
+{
+       pgd_t *pgd = pgd_offset(mm, addr);
+       pud_t *pud;
+       pmd_t *pmd;
+
+       /* default answer; only overridden in the contiguous-pmd case below */
+       *pgsize = PAGE_SIZE;
+       if (!pte_cont(pte))
+               return 1;
+       if (!pgd_present(*pgd)) {
+               VM_BUG_ON(!pgd_present(*pgd));
+               return 1;
+       }
+       pud = pud_offset(pgd, addr);
+       if (!pud_present(*pud)) {
+               VM_BUG_ON(!pud_present(*pud));
+               return 1;
+       }
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd)) {
+               VM_BUG_ON(!pmd_present(*pmd));
+               return 1;
+       }
+       /* the caller handed us the pmd slot itself: contiguous pmd range */
+       if ((pte_t *)pmd == ptep) {
+               *pgsize = PMD_SIZE;
+               return CONT_PMDS;
+       }
+       return CONT_PTES;
+}
+
+/*
+ * Install the huge mapping @pte at @ptep for @addr in @mm.
+ *
+ * When find_num_contig() reports a single entry this is a plain
+ * set_pte_at().  Otherwise every entry of the contiguous range is written,
+ * each with the attribute bits of @pte and a pfn advanced by one entry's
+ * worth of pages from the previous one.
+ */
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                           pte_t *ptep, pte_t pte)
+{
+       size_t pgsize;
+       unsigned long pfn, pfn_step;
+       pgprot_t hugeprot;
+       int nr, n;
+
+       nr = find_num_contig(mm, addr, ptep, pte, &pgsize);
+       if (nr == 1) {
+               set_pte_at(mm, addr, ptep, pte);
+               return;
+       }
+
+       pfn = pte_pfn(pte);
+       pfn_step = pgsize >> PAGE_SHIFT;
+       /* all bits of pte except the pfn */
+       hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+
+       for (n = 0; n < nr; n++, ptep++, pfn += pfn_step, addr += pgsize) {
+               pte_t entry = pfn_pte(pfn, hugeprot);
+
+               pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
+                        pte_val(entry));
+               set_pte_at(mm, addr, ptep, entry);
+       }
+}
+
+/*
+ * Find, allocating intermediate tables as needed, the page-table slot that
+ * will hold a huge mapping of size @sz at @addr in @mm.
+ *
+ * Handled sizes and the slot returned (always cast to pte_t *):
+ *   PUD_SIZE              - the pud entry
+ *   PAGE_SIZE * CONT_PTES - the pte entry from pte_alloc_map()
+ *   PMD_SIZE              - the pmd entry, via huge_pmd_share() when
+ *                           CONFIG_ARCH_WANT_HUGE_PMD_SHARE is enabled and
+ *                           the pud is empty
+ *   PMD_SIZE * CONT_PMDS  - the pmd entry
+ *
+ * Returns NULL when the pud or pmd cannot be allocated, or when @sz is none
+ * of the sizes above.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+                     unsigned long addr, unsigned long sz)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pte_t *pte = NULL;
+
+       pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz);
+       pgd = pgd_offset(mm, addr);
+       pud = pud_alloc(mm, pgd, addr);
+       if (!pud)
+               return NULL;
+
+       if (sz == PUD_SIZE) {
+               pte = (pte_t *)pud;
+       } else if (sz == (PAGE_SIZE * CONT_PTES)) {
+               pmd_t *pmd = pmd_alloc(mm, pud, addr);
+
+               WARN_ON(addr & (sz - 1));
+               /* never hand a failed pmd allocation to pte_alloc_map() */
+               if (!pmd)
+                       return NULL;
+               /*
+                * Note that if this code were ever ported to the
+                * 32-bit arm platform then it will cause trouble in
+                * the case where CONFIG_HIGHPTE is set, since there
+                * will be no pte_unmap() to correspond with this
+                * pte_alloc_map().
+                */
+               pte = pte_alloc_map(mm, NULL, pmd, addr);
+       } else if (sz == PMD_SIZE) {
+               if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
+                   pud_none(*pud))
+                       pte = huge_pmd_share(mm, addr, pud);
+               else
+                       pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       } else if (sz == (PMD_SIZE * CONT_PMDS)) {
+               pmd_t *pmd;
+
+               pmd = pmd_alloc(mm, pud, addr);
+               WARN_ON(addr & (sz - 1));
+               return (pte_t *)pmd;
+       }
+
+       /* pte is NULL on allocation failure or unhandled sz: don't read it */
+       pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr,
+              sz, pte, pte ? pte_val(*pte) : 0);
+       return pte;
+}
+
+/*
+ * Look up, without allocating anything, the page-table slot that holds the
+ * huge mapping covering @addr in @mm.
+ *
+ * Returns (cast to pte_t *):
+ *   - the pud entry when pud_huge() is true for it;
+ *   - for a pmd with the contiguous bit set, the pmd entry for
+ *     addr & CONT_PMD_MASK, i.e. the first entry of the contiguous range;
+ *   - the pmd entry when pmd_huge() is true for it;
+ *   - for a present pte with the contiguous bit set, the pte entry for
+ *     addr & CONT_PTE_MASK;
+ *   - NULL when the pgd, pud or pmd is not present, or when the walk ends
+ *     at a pte that is not a present contiguous entry.
+ */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd = NULL;
+       pte_t *pte = NULL;
+
+       pgd = pgd_offset(mm, addr);
+       pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
+       if (!pgd_present(*pgd))
+               return NULL;
+       pud = pud_offset(pgd, addr);
+       if (!pud_present(*pud))
+               return NULL;
+
+       if (pud_huge(*pud))
+               return (pte_t *)pud;
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd))
+               return NULL;
+
+       /* contiguous pmd range: rewind to the first pmd of the range */
+       if (pte_cont(pmd_pte(*pmd))) {
+               pmd = pmd_offset(
+                       pud, (addr & CONT_PMD_MASK));
+               return (pte_t *)pmd;
+       }
+       if (pmd_huge(*pmd))
+               return (pte_t *)pmd;
+       pte = pte_offset_kernel(pmd, addr);
+       /* contiguous pte range: rewind to the first pte of the range */
+       if (pte_present(*pte) && pte_cont(*pte)) {
+               pte = pte_offset_kernel(
+                       pmd, (addr & CONT_PTE_MASK));
+               return pte;
+       }
+       return NULL;
+}
+
+/*
+ * Adjust @entry for the huge page size used by @vma.
+ *
+ * The contiguous bit is set, at pte or pmd level, when the hstate page size
+ * is CONT_PTE_SIZE or CONT_PMD_SIZE respectively.  PUD_SIZE and PMD_SIZE
+ * leave @entry untouched.  Any other size is reported with pr_warn() and
+ * @entry is returned unchanged.  @page and @writable are not used.
+ */
+pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                        struct page *page, int writable)
+{
+       size_t pagesize = huge_page_size(hstate_vma(vma));
+
+       if (pagesize == CONT_PTE_SIZE)
+               return pte_mkcont(entry);
+
+       if (pagesize == CONT_PMD_SIZE)
+               return pmd_pte(pmd_mkcont(pte_pmd(entry)));
+
+       if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+               pr_warn("%s: unrecognized huge page size 0x%lx\n",
+                       __func__, pagesize);
+
+       return entry;
+}
+
+/*
+ * Clear the huge mapping at @ptep/@addr in @mm and return its old value.
+ *
+ * For a contiguous mapping every entry of the range is cleared.  The value
+ * returned is that of the first entry, marked dirty if any of the later
+ * entries was dirty.  A non-contiguous mapping is a plain
+ * ptep_get_and_clear().
+ */
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+                             unsigned long addr, pte_t *ptep)
+{
+       pte_t pte;
+
+       if (pte_cont(*ptep)) {
+               int ncontig, i;
+               size_t pgsize;
+               pte_t *cpte;
+               bool is_dirty = false;
+
+               cpte = huge_pte_offset(mm, addr);
+               ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
+               /* save the 1st pte to return */
+               pte = ptep_get_and_clear(mm, addr, cpte);
+               /* advance addr in step with cpte so each entry gets its own VA */
+               for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) {
+                       /*
+                        * If HW_AFDBM is enabled, then the HW could
+                        * turn on the dirty bit for any of the page
+                        * in the set, so check them all.
+                        */
+                       ++cpte;
+                       if (pte_dirty(ptep_get_and_clear(mm, addr, cpte)))
+                               is_dirty = true;
+               }
+               if (is_dirty)
+                       return pte_mkdirty(pte);
+               else
+                       return pte;
+       } else {
+               return ptep_get_and_clear(mm, addr, ptep);
+       }
+}
+
+/*
+ * Apply the access flags carried by @pte to the huge mapping at
+ * @ptep/@addr in @vma.
+ *
+ * For a contiguous mapping, ptep_set_access_flags() is called on every
+ * entry of the range, each with its own address and pfn.  The return value
+ * is the OR of the individual results, so a change to any entry is
+ * reported.  A non-contiguous mapping is a plain ptep_set_access_flags().
+ */
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+                              unsigned long addr, pte_t *ptep,
+                              pte_t pte, int dirty)
+{
+       pte_t *cpte;
+
+       if (pte_cont(pte)) {
+               int ncontig, i, changed = 0;
+               size_t pgsize = 0;
+               unsigned long pfn = pte_pfn(pte);
+               /* Select all bits except the pfn */
+               pgprot_t hugeprot =
+                       __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
+                                pte_val(pte));
+
+               cpte = huge_pte_offset(vma->vm_mm, addr);
+               pfn = pte_pfn(*cpte);
+               ncontig = find_num_contig(vma->vm_mm, addr, cpte,
+                                         *cpte, &pgsize);
+               for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) {
+                       /* accumulate: a plain '=' kept only the last result */
+                       changed |= ptep_set_access_flags(vma, addr, cpte,
+                                                       pfn_pte(pfn,
+                                                               hugeprot),
+                                                       dirty);
+                       pfn += pgsize >> PAGE_SHIFT;
+               }
+               return changed;
+       } else {
+               return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+       }
+}
+
+/*
+ * Write-protect the huge mapping at @ptep/@addr in @mm.
+ *
+ * For a contiguous mapping, ptep_set_wrprotect() is applied to every entry
+ * of the range, each with its own address.  Otherwise it is applied to
+ * @ptep alone.
+ */
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+                            unsigned long addr, pte_t *ptep)
+{
+       if (pte_cont(*ptep)) {
+               int ncontig, i;
+               pte_t *cpte;
+               size_t pgsize = 0;
+
+               cpte = huge_pte_offset(mm, addr);
+               ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
+               /* advance addr in step with cpte */
+               for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
+                       ptep_set_wrprotect(mm, addr, cpte);
+       } else {
+               ptep_set_wrprotect(mm, addr, ptep);
+       }
+}
+
+/*
+ * Clear the huge mapping at @ptep/@addr in @vma and flush it.
+ *
+ * For a contiguous mapping, ptep_clear_flush() is applied to every entry of
+ * the range, each with the address that entry maps, so the flush covers the
+ * whole range and not only its first page.  Otherwise it is applied to
+ * @ptep alone.
+ */
+void huge_ptep_clear_flush(struct vm_area_struct *vma,
+                          unsigned long addr, pte_t *ptep)
+{
+       if (pte_cont(*ptep)) {
+               int ncontig, i;
+               pte_t *cpte;
+               size_t pgsize = 0;
+
+               cpte = huge_pte_offset(vma->vm_mm, addr);
+               ncontig = find_num_contig(vma->vm_mm, addr, cpte,
+                                         *cpte, &pgsize);
+               /* advance addr in step with cpte */
+               for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
+                       ptep_clear_flush(vma, addr, cpte);
+       } else {
+               ptep_clear_flush(vma, addr, ptep);
+       }
+}
+
  static __init int setup_hugepagesz(char *opt)
  {
         unsigned long ps = memparse(opt, &opt);
+
         if (ps == PMD_SIZE) {
                 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
         } else if (ps == PUD_SIZE) {
                 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
         } else {
-               pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
+               pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
                 return 0;
         }
         return 1;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c

index 4cb98aa8c27b2e89fd32783d45f283e8a152d80e..75728047b60b33f29dea212e58b309db8424e119 100644 (file)
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -35,7 +35,10 @@
  #include <linux/efi.h>
  #include <linux/swiotlb.h>
  
+#include <asm/boot.h>
  #include <asm/fixmap.h>
+#include <asm/kasan.h>
+#include <asm/kernel-pgtable.h>
  #include <asm/memory.h>
  #include <asm/sections.h>
  #include <asm/setup.h>
@@ -45,7 +48,13 @@
  
  #include "mm.h"
  
-phys_addr_t memstart_addr __read_mostly = 0;
+/*
+ * We need to be able to catch inadvertent references to memstart_addr
+ * that occur (potentially in generic code) before arm64_memblock_init()
+ * executes, which assigns it its actual value. So use a default value
+ * that cannot be mistaken for a real physical address.
+ */
+s64 memstart_addr __read_mostly = -1;
  phys_addr_t arm64_dma_phys_limit __read_mostly;
  
  #ifdef CONFIG_BLK_DEV_INITRD
@@ -58,8 +67,8 @@ static int __init early_initrd(char *p)
         if (*endp == ',') {
                 size = memparse(endp + 1, NULL);
  
-               initrd_start = (unsigned long)__va(start);
-               initrd_end = (unsigned long)__va(start + size);
+               initrd_start = start;
+               initrd_end = start + size;
         }
         return 0;
  }
@@ -71,7 +80,7 @@ early_param("initrd", early_initrd);
   * currently assumes that for memory starting above 4G, 32-bit devices will
   * use a DMA offset.
   */
-static phys_addr_t max_zone_dma_phys(void)
+static phys_addr_t __init max_zone_dma_phys(void)
  {
         phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
         return min(offset + (1ULL << 32), memblock_end_of_DRAM());
@@ -126,11 +135,11 @@ EXPORT_SYMBOL(pfn_valid);
  #endif
  
  #ifndef CONFIG_SPARSEMEM
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
  {
  }
  #else
-static void arm64_memory_present(void)
+static void __init arm64_memory_present(void)
  {
         struct memblock_region *reg;
  
@@ -159,7 +168,57 @@ early_param("mem", early_mem);
  
  void __init arm64_memblock_init(void)
  {
-       memblock_enforce_memory_limit(memory_limit);
+       const s64 linear_region_size = -(s64)PAGE_OFFSET;
+
+       /*
+        * Ensure that the linear region takes up exactly half of the kernel
+        * virtual address space. This way, we can distinguish a linear address
+        * from a kernel/module/vmalloc address by testing a single bit.
+        */
+       BUILD_BUG_ON(linear_region_size != BIT(VA_BITS - 1));
+
+       /*
+        * Select a suitable value for the base of physical memory.
+        */
+       memstart_addr = round_down(memblock_start_of_DRAM(),
+                                  ARM64_MEMSTART_ALIGN);
+
+       /*
+        * Remove the memory that we will not be able to cover with the
+        * linear mapping. Take care not to clip the kernel which may be
+        * high in memory.
+        */
+       memblock_remove(max_t(u64, memstart_addr + linear_region_size, __pa(_end)),
+                       ULLONG_MAX);
+       if (memblock_end_of_DRAM() > linear_region_size)
+               memblock_remove(0, memblock_end_of_DRAM() - linear_region_size);
+
+       /*
+        * Apply the memory limit if it was set. Since the kernel may be loaded
+        * high up in memory, add back the kernel region that must be accessible
+        * via the linear mapping.
+        */
+       if (memory_limit != (phys_addr_t)ULLONG_MAX) {
+               memblock_enforce_memory_limit(memory_limit);
+               memblock_add(__pa(_text), (u64)(_end - _text));
+       }
+
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+               extern u16 memstart_offset_seed;
+               u64 range = linear_region_size -
+                           (memblock_end_of_DRAM() - memblock_start_of_DRAM());
+
+               /*
+                * If the size of the linear region exceeds, by a sufficient
+                * margin, the size of the region that the available physical
+                * memory spans, randomize the linear region as well.
+                */
+               if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
+                       range = range / ARM64_MEMSTART_ALIGN + 1;
+                       memstart_addr -= ARM64_MEMSTART_ALIGN *
+                                        ((range * memstart_offset_seed) >> 16);
+               }
+       }
  
         /*
          * Register the kernel text, kernel data, initrd, and initial
@@ -167,8 +226,13 @@ void __init arm64_memblock_init(void)
          */
         memblock_reserve(__pa(_text), _end - _text);
  #ifdef CONFIG_BLK_DEV_INITRD
-       if (initrd_start)
-               memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start);
+       if (initrd_start) {
+               memblock_reserve(initrd_start, initrd_end - initrd_start);
+
+               /* the generic initrd code expects virtual addresses */
+               initrd_start = __phys_to_virt(initrd_start);
+               initrd_end = __phys_to_virt(initrd_end);
+       }
  #endif
  
         early_init_fdt_scan_reserved_mem();
@@ -302,35 +366,38 @@ void __init mem_init(void)
  #ifdef CONFIG_KASAN
                   "    kasan   : 0x%16lx - 0x%16lx   (%6ld GB)\n"
  #endif
+                 "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
                   "    vmalloc : 0x%16lx - 0x%16lx   (%6ld GB)\n"
+                 "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+                 "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+                 "    .rodata : 0x%p" " - 0x%p" "   (%6ld KB)\n"
+                 "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n"
  #ifdef CONFIG_SPARSEMEM_VMEMMAP
                   "    vmemmap : 0x%16lx - 0x%16lx   (%6ld GB maximum)\n"
                   "              0x%16lx - 0x%16lx   (%6ld MB actual)\n"
  #endif
                   "    fixed   : 0x%16lx - 0x%16lx   (%6ld KB)\n"
                   "    PCI I/O : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-                 "    modules : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-                 "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n"
-                 "      .init : 0x%p" " - 0x%p" "   (%6ld KB)\n"
-                 "      .text : 0x%p" " - 0x%p" "   (%6ld KB)\n"
-                 "      .data : 0x%p" " - 0x%p" "   (%6ld KB)\n",
+                 "    memory  : 0x%16lx - 0x%16lx   (%6ld MB)\n",
  #ifdef CONFIG_KASAN
                   MLG(KASAN_SHADOW_START, KASAN_SHADOW_END),
  #endif
+                 MLM(MODULES_VADDR, MODULES_END),
                   MLG(VMALLOC_START, VMALLOC_END),
+                 MLK_ROUNDUP(__init_begin, __init_end),
+                 MLK_ROUNDUP(_text, __start_rodata),
+                 MLK_ROUNDUP(__start_rodata, _etext),
+                 MLK_ROUNDUP(_sdata, _edata),
  #ifdef CONFIG_SPARSEMEM_VMEMMAP
                   MLG(VMEMMAP_START,
                       VMEMMAP_START + VMEMMAP_SIZE),
-                 MLM((unsigned long)virt_to_page(PAGE_OFFSET),
+                 MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
                       (unsigned long)virt_to_page(high_memory)),
  #endif
                   MLK(FIXADDR_START, FIXADDR_TOP),
                   MLM(PCI_IO_START, PCI_IO_END),
-                 MLM(MODULES_VADDR, MODULES_END),
-                 MLM(PAGE_OFFSET, (unsigned long)high_memory),
-                 MLK_ROUNDUP(__init_begin, __init_end),
-                 MLK_ROUNDUP(_text, _etext),
-                 MLK_ROUNDUP(_sdata, _edata));
+                 MLM(__phys_to_virt(memblock_start_of_DRAM()),
+                     (unsigned long)high_memory));
  
  #undef MLK
  #undef MLM
@@ -358,9 +425,8 @@ void __init mem_init(void)
  
  void free_initmem(void)
  {
-       fixup_init();
         free_initmem_default(0);
-       free_alternatives_memory();
+       fixup_init();
  }
  
  #ifdef CONFIG_BLK_DEV_INITRD
@@ -381,3 +447,28 @@ static int __init keepinitrd_setup(char *__unused)
  
  __setup("keepinitrd", keepinitrd_setup);
  #endif
+
+/*
+ * Panic notifier callback: print the configured memory limit in MB, or
+ * "none" while memory_limit still holds ULLONG_MAX.  The notifier
+ * arguments are ignored and the callback always returns 0.
+ */
+static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)
+{
+       if (memory_limit == (phys_addr_t)ULLONG_MAX)
+               pr_emerg("Memory Limit: none\n");
+       else
+               pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
+
+       return 0;
+}
+
+/* Notifier block whose callback is dump_mem_limit(). */
+static struct notifier_block mem_limit_notifier = {
+       .notifier_call = dump_mem_limit,
+};
+
+/*
+ * Initcall: add mem_limit_notifier to panic_notifier_list.  Always returns 0.
+ */
+static int __init register_mem_limit_dumper(void)
+{
+       atomic_notifier_chain_register(&panic_notifier_list,
+                                      &mem_limit_notifier);
+       return 0;
+}
+__initcall(register_mem_limit_dumper);
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c

index cf038c7d9fa994c7d86e05920ffa8961aecc4ad8..757009daa9ede454abccbcab6b3a0421a355b49c 100644 (file)
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -16,9 +16,12 @@
  #include <linux/memblock.h>
  #include <linux/start_kernel.h>
  
+#include <asm/mmu_context.h>
+#include <asm/kernel-pgtable.h>
  #include <asm/page.h>
  #include <asm/pgalloc.h>
  #include <asm/pgtable.h>
+#include <asm/sections.h>
  #include <asm/tlbflush.h>
  
  static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
@@ -32,7 +35,7 @@ static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
         if (pmd_none(*pmd))
                 pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
  
-       pte = pte_offset_kernel(pmd, addr);
+       pte = pte_offset_kimg(pmd, addr);
         do {
                 next = addr + PAGE_SIZE;
                 set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page),
@@ -50,7 +53,7 @@ static void __init kasan_early_pmd_populate(pud_t *pud,
         if (pud_none(*pud))
                 pud_populate(&init_mm, pud, kasan_zero_pmd);
  
-       pmd = pmd_offset(pud, addr);
+       pmd = pmd_offset_kimg(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
                 kasan_early_pte_populate(pmd, addr, next);
@@ -67,7 +70,7 @@ static void __init kasan_early_pud_populate(pgd_t *pgd,
         if (pgd_none(*pgd))
                 pgd_populate(&init_mm, pgd, kasan_zero_pud);
  
-       pud = pud_offset(pgd, addr);
+       pud = pud_offset_kimg(pgd, addr);
         do {
                 next = pud_addr_end(addr, end);
                 kasan_early_pmd_populate(pud, addr, next);
@@ -96,6 +99,21 @@ asmlinkage void __init kasan_early_init(void)
         kasan_map_early_shadow();
  }
  
+/*
+ * Copy the current shadow region into a new pgdir.
+ *
+ * Every pgd entry from the one for KASAN_SHADOW_START up to, but not
+ * including, the one for KASAN_SHADOW_END is copied from the table that
+ * pgd_offset_k() indexes into the matching slot of @pgdir.
+ */
+void __init kasan_copy_shadow(pgd_t *pgdir)
+{
+       pgd_t *src = pgd_offset_k(KASAN_SHADOW_START);
+       pgd_t *src_end = pgd_offset_k(KASAN_SHADOW_END);
+       pgd_t *dst = pgd_offset_raw(pgdir, KASAN_SHADOW_START);
+
+       do {
+               set_pgd(dst, *src);
+               src++;
+               dst++;
+       } while (src != src_end);
+}
+
  static void __init clear_pgds(unsigned long start,
                         unsigned long end)
  {
@@ -108,18 +126,18 @@ static void __init clear_pgds(unsigned long start,
                 set_pgd(pgd_offset_k(start), __pgd(0));
  }
  
-static void __init cpu_set_ttbr1(unsigned long ttbr1)
-{
-       asm(
-       "       msr     ttbr1_el1, %0\n"
-       "       isb"
-       :
-       : "r" (ttbr1));
-}
-
  void __init kasan_init(void)
  {
+       u64 kimg_shadow_start, kimg_shadow_end;
+       u64 mod_shadow_start, mod_shadow_end;
         struct memblock_region *reg;
+       int i;
+
+       kimg_shadow_start = (u64)kasan_mem_to_shadow(_text);
+       kimg_shadow_end = (u64)kasan_mem_to_shadow(_end);
+
+       mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR);
+       mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);
  
         /*
          * We are going to perform proper setup of shadow memory.
@@ -129,13 +147,33 @@ void __init kasan_init(void)
          * setup will be finished.
          */
         memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
-       cpu_set_ttbr1(__pa(tmp_pg_dir));
-       flush_tlb_all();
+       dsb(ishst);
+       cpu_replace_ttbr1(tmp_pg_dir);
  
         clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
  
+       vmemmap_populate(kimg_shadow_start, kimg_shadow_end,
+                        pfn_to_nid(virt_to_pfn(_text)));
+
+       /*
+        * vmemmap_populate() has populated the shadow region that covers the
+        * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round
+        * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent
+        * kasan_populate_zero_shadow() from replacing the page table entries
+        * (PMD or PTE) at the edges of the shadow region for the kernel
+        * image.
+        */
+       kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE);
+       kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE);
+
         kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
-                       kasan_mem_to_shadow((void *)MODULES_VADDR));
+                                  (void *)mod_shadow_start);
+       kasan_populate_zero_shadow((void *)kimg_shadow_end,
+                                  kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+       if (kimg_shadow_start > mod_shadow_end)
+               kasan_populate_zero_shadow((void *)mod_shadow_end,
+                                          (void *)kimg_shadow_start);
  
         for_each_memblock(memory, reg) {
                 void *start = (void *)__phys_to_virt(reg->base);
@@ -155,9 +193,16 @@ void __init kasan_init(void)
                                 pfn_to_nid(virt_to_pfn(start)));
         }
  
+       /*
+        * KAsan may reuse the contents of kasan_zero_pte directly, so we
+        * should make sure that it maps the zero page read-only.
+        */
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               set_pte(&kasan_zero_pte[i],
+                       pfn_pte(virt_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
+
         memset(kasan_zero_page, 0, PAGE_SIZE);
-       cpu_set_ttbr1(__pa(swapper_pg_dir));
-       flush_tlb_all();
+       cpu_replace_ttbr1(swapper_pg_dir);
  
         /* At this point kasan is fully initialized. Enable error messages */
         init_task.kasan_depth = 0;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c

index 653735a8c58a86248e37593648de4a62f301893c..8fc302d84e1f524aa0496fd213524a9d30602ebc 100644 (file)
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -30,8 +30,10 @@
  #include <linux/slab.h>
  #include <linux/stop_machine.h>
  
+#include <asm/barrier.h>
  #include <asm/cputype.h>
  #include <asm/fixmap.h>
+#include <asm/kasan.h>
  #include <asm/kernel-pgtable.h>
  #include <asm/sections.h>
  #include <asm/setup.h>
@@ -44,13 +46,20 @@
  
  u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
  
+u64 kimage_voffset __read_mostly;
+EXPORT_SYMBOL(kimage_voffset);
+
  /*
   * Empty_zero_page is a special page that is used for zero-initialized data
   * and COW.
   */
-struct page *empty_zero_page;
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
  EXPORT_SYMBOL(empty_zero_page);
  
+static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
+static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
+static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
+
  pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                               unsigned long size, pgprot_t vma_prot)
  {
@@ -62,16 +71,30 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  }
  EXPORT_SYMBOL(phys_mem_access_prot);
  
-static void __init *early_alloc(unsigned long sz)
+static phys_addr_t __init early_pgtable_alloc(void)
  {
         phys_addr_t phys;
         void *ptr;
  
-       phys = memblock_alloc(sz, sz);
+       phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
         BUG_ON(!phys);
-       ptr = __va(phys);
-       memset(ptr, 0, sz);
-       return ptr;
+
+       /*
+        * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
+        * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
+        * any level of table.
+        */
+       ptr = pte_set_fixmap(phys);
+
+       memset(ptr, 0, PAGE_SIZE);
+
+       /*
+        * Implicit barriers also ensure the zeroed page is visible to the page
+        * table walker
+        */
+       pte_clear_fixmap();
+
+       return phys;
  }
  
  /*
@@ -95,24 +118,30 @@ static void split_pmd(pmd_t *pmd, pte_t *pte)
  static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
                                   unsigned long end, unsigned long pfn,
                                   pgprot_t prot,
-                                 void *(*alloc)(unsigned long size))
+                                 phys_addr_t (*pgtable_alloc)(void))
  {
         pte_t *pte;
  
         if (pmd_none(*pmd) || pmd_sect(*pmd)) {
-               pte = alloc(PTRS_PER_PTE * sizeof(pte_t));
+               phys_addr_t pte_phys;
+               BUG_ON(!pgtable_alloc);
+               pte_phys = pgtable_alloc();
+               pte = pte_set_fixmap(pte_phys);
                 if (pmd_sect(*pmd))
                         split_pmd(pmd, pte);
-               __pmd_populate(pmd, __pa(pte), PMD_TYPE_TABLE);
+               __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
                 flush_tlb_all();
+               pte_clear_fixmap();
         }
         BUG_ON(pmd_bad(*pmd));
  
-       pte = pte_offset_kernel(pmd, addr);
+       pte = pte_set_fixmap_offset(pmd, addr);
         do {
                 set_pte(pte, pfn_pte(pfn, prot));
                 pfn++;
         } while (pte++, addr += PAGE_SIZE, addr != end);
+
+       pte_clear_fixmap();
  }
  
  static void split_pud(pud_t *old_pud, pmd_t *pmd)
@@ -127,10 +156,29 @@ static void split_pud(pud_t *old_pud, pmd_t *pmd)
         } while (pmd++, i++, i < PTRS_PER_PMD);
  }
  
-static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
-                                 unsigned long addr, unsigned long end,
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+
+       /*
+        * If debug_page_alloc is enabled we must map the linear map
+        * using pages. However, other mappings created by
+        * create_mapping_noalloc must use sections in some cases. Allow
+        * sections to be used in those cases, where no pgtable_alloc
+        * function is provided.
+        */
+       return !pgtable_alloc || !debug_pagealloc_enabled();
+}
+#else
+static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
+{
+       return true;
+}
+#endif
+
+static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
                                   phys_addr_t phys, pgprot_t prot,
-                                 void *(*alloc)(unsigned long size))
+                                 phys_addr_t (*pgtable_alloc)(void))
  {
         pmd_t *pmd;
         unsigned long next;
@@ -139,7 +187,10 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
          * Check for initial section mappings in the pgd/pud and remove them.
          */
         if (pud_none(*pud) || pud_sect(*pud)) {
-               pmd = alloc(PTRS_PER_PMD * sizeof(pmd_t));
+               phys_addr_t pmd_phys;
+               BUG_ON(!pgtable_alloc);
+               pmd_phys = pgtable_alloc();
+               pmd = pmd_set_fixmap(pmd_phys);
                 if (pud_sect(*pud)) {
                         /*
                          * need to have the 1G of mappings continue to be
@@ -147,19 +198,20 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
                          */
                         split_pud(pud, pmd);
                 }
-               pud_populate(mm, pud, pmd);
+               __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
                 flush_tlb_all();
+               pmd_clear_fixmap();
         }
         BUG_ON(pud_bad(*pud));
  
-       pmd = pmd_offset(pud, addr);
+       pmd = pmd_set_fixmap_offset(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
                 /* try section mapping first */
-               if (((addr | next | phys) & ~SECTION_MASK) == 0) {
+               if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
+                     block_mappings_allowed(pgtable_alloc)) {
                         pmd_t old_pmd =*pmd;
-                       set_pmd(pmd, __pmd(phys |
-                                          pgprot_val(mk_sect_prot(prot))));
+                       pmd_set_huge(pmd, phys, prot);
                         /*
                          * Check for previous table entries created during
                          * boot (__create_page_tables) and flush them.
@@ -167,17 +219,19 @@ static void alloc_init_pmd(struct mm_struct *mm, pud_t *pud,
                         if (!pmd_none(old_pmd)) {
                                 flush_tlb_all();
                                 if (pmd_table(old_pmd)) {
-                                       phys_addr_t table = __pa(pte_offset_map(&old_pmd, 0));
+                                       phys_addr_t table = pmd_page_paddr(old_pmd);
                                         if (!WARN_ON_ONCE(slab_is_available()))
                                                 memblock_free(table, PAGE_SIZE);
                                 }
                         }
                 } else {
                         alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
-                                      prot, alloc);
+                                      prot, pgtable_alloc);
                 }
                 phys += next - addr;
         } while (pmd++, addr = next, addr != end);
+
+       pmd_clear_fixmap();
  }
  
  static inline bool use_1G_block(unsigned long addr, unsigned long next,
@@ -192,31 +246,32 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next,
         return true;
  }
  
-static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
-                                 unsigned long addr, unsigned long end,
+static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
                                   phys_addr_t phys, pgprot_t prot,
-                                 void *(*alloc)(unsigned long size))
+                                 phys_addr_t (*pgtable_alloc)(void))
  {
         pud_t *pud;
         unsigned long next;
  
         if (pgd_none(*pgd)) {
-               pud = alloc(PTRS_PER_PUD * sizeof(pud_t));
-               pgd_populate(mm, pgd, pud);
+               phys_addr_t pud_phys;
+               BUG_ON(!pgtable_alloc);
+               pud_phys = pgtable_alloc();
+               __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
         }
         BUG_ON(pgd_bad(*pgd));
  
-       pud = pud_offset(pgd, addr);
+       pud = pud_set_fixmap_offset(pgd, addr);
         do {
                 next = pud_addr_end(addr, end);
  
                 /*
                  * For 4K granule only, attempt to put down a 1GB block
                  */
-               if (use_1G_block(addr, next, phys)) {
+               if (use_1G_block(addr, next, phys) &&
+                   block_mappings_allowed(pgtable_alloc)) {
                         pud_t old_pud = *pud;
-                       set_pud(pud, __pud(phys |
-                                          pgprot_val(mk_sect_prot(prot))));
+                       pud_set_huge(pud, phys, prot);
  
                         /*
                          * If we have an old value for a pud, it will
@@ -228,51 +283,74 @@ static void alloc_init_pud(struct mm_struct *mm, pgd_t *pgd,
                         if (!pud_none(old_pud)) {
                                 flush_tlb_all();
                                 if (pud_table(old_pud)) {
-                                       phys_addr_t table = __pa(pmd_offset(&old_pud, 0));
+                                       phys_addr_t table = pud_page_paddr(old_pud);
                                         if (!WARN_ON_ONCE(slab_is_available()))
                                                 memblock_free(table, PAGE_SIZE);
                                 }
                         }
                 } else {
-                       alloc_init_pmd(mm, pud, addr, next, phys, prot, alloc);
+                       alloc_init_pmd(pud, addr, next, phys, prot,
+                                      pgtable_alloc);
                 }
                 phys += next - addr;
         } while (pud++, addr = next, addr != end);
+
+       pud_clear_fixmap();
  }
  
  /*
   * Create the page directory entries and any necessary page tables for the
   * mapping specified by 'md'.
   */
-static void  __create_mapping(struct mm_struct *mm, pgd_t *pgd,
-                                   phys_addr_t phys, unsigned long virt,
+static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt,
                                     phys_addr_t size, pgprot_t prot,
-                                   void *(*alloc)(unsigned long size))
+                                   phys_addr_t (*pgtable_alloc)(void))
  {
         unsigned long addr, length, end, next;
  
+       /*
+        * If the virtual and physical address don't have the same offset
+        * within a page, we cannot map the region as the caller expects.
+        */
+       if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
+               return;
+
+       phys &= PAGE_MASK;
         addr = virt & PAGE_MASK;
         length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
  
         end = addr + length;
         do {
                 next = pgd_addr_end(addr, end);
-               alloc_init_pud(mm, pgd, addr, next, phys, prot, alloc);
+               alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc);
                 phys += next - addr;
         } while (pgd++, addr = next, addr != end);
  }
  
-static void *late_alloc(unsigned long size)
+static phys_addr_t late_pgtable_alloc(void)
  {
-       void *ptr;
-
-       BUG_ON(size > PAGE_SIZE);
-       ptr = (void *)__get_free_page(PGALLOC_GFP);
+       void *ptr = (void *)__get_free_page(PGALLOC_GFP);
         BUG_ON(!ptr);
-       return ptr;
+
+       /* Ensure the zeroed page is visible to the page table walker */
+       dsb(ishst);
+       return __pa(ptr);
  }
  
-static void __init create_mapping(phys_addr_t phys, unsigned long virt,
+static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+                                unsigned long virt, phys_addr_t size,
+                                pgprot_t prot,
+                                phys_addr_t (*alloc)(void))
+{
+       init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc);
+}
+
+/*
+ * This function can only be used to modify existing table entries,
+ * without allocating new levels of table. Note that this permits the
+ * creation of new section or page entries.
+ */
+static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                                   phys_addr_t size, pgprot_t prot)
  {
         if (virt < VMALLOC_START) {
@@ -280,16 +358,16 @@ static void __init create_mapping(phys_addr_t phys, unsigned long virt,
                         &phys, virt);
                 return;
         }
-       __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK), phys, virt,
-                        size, prot, early_alloc);
+       __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
+                            NULL);
  }
  
  void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
                                unsigned long virt, phys_addr_t size,
                                pgprot_t prot)
  {
-       __create_mapping(mm, pgd_offset(mm, virt), phys, virt, size, prot,
-                               late_alloc);
+       __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
+                            late_pgtable_alloc);
  }
  
  static void create_mapping_late(phys_addr_t phys, unsigned long virt,
@@ -301,69 +379,57 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt,
                 return;
         }
  
-       return __create_mapping(&init_mm, pgd_offset_k(virt & PAGE_MASK),
-                               phys, virt, size, prot, late_alloc);
+       __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
+                            late_pgtable_alloc);
  }
  
-#ifdef CONFIG_DEBUG_RODATA
-static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
+static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
  {
+       unsigned long kernel_start = __pa(_text);
+       unsigned long kernel_end = __pa(_etext);
+
         /*
-        * Set up the executable regions using the existing section mappings
-        * for now. This will get more fine grained later once all memory
-        * is mapped
+        * Take care not to create a writable alias for the
+        * read-only text and rodata sections of the kernel image.
          */
-       unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE);
-       unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE);
-
-       if (end < kernel_x_start) {
-               create_mapping(start, __phys_to_virt(start),
-                       end - start, PAGE_KERNEL);
-       } else if (start >= kernel_x_end) {
-               create_mapping(start, __phys_to_virt(start),
-                       end - start, PAGE_KERNEL);
-       } else {
-               if (start < kernel_x_start)
-                       create_mapping(start, __phys_to_virt(start),
-                               kernel_x_start - start,
-                               PAGE_KERNEL);
-               create_mapping(kernel_x_start,
-                               __phys_to_virt(kernel_x_start),
-                               kernel_x_end - kernel_x_start,
-                               PAGE_KERNEL_EXEC);
-               if (kernel_x_end < end)
-                       create_mapping(kernel_x_end,
-                               __phys_to_virt(kernel_x_end),
-                               end - kernel_x_end,
-                               PAGE_KERNEL);
+
+       /* No overlap with the kernel text */
+       if (end < kernel_start || start >= kernel_end) {
+               __create_pgd_mapping(pgd, start, __phys_to_virt(start),
+                                    end - start, PAGE_KERNEL,
+                                    early_pgtable_alloc);
+               return;
         }
  
+       /*
+        * This block overlaps the kernel text mapping.
+        * Map the portion(s) which don't overlap.
+        */
+       if (start < kernel_start)
+               __create_pgd_mapping(pgd, start,
+                                    __phys_to_virt(start),
+                                    kernel_start - start, PAGE_KERNEL,
+                                    early_pgtable_alloc);
+       if (kernel_end < end)
+               __create_pgd_mapping(pgd, kernel_end,
+                                    __phys_to_virt(kernel_end),
+                                    end - kernel_end, PAGE_KERNEL,
+                                    early_pgtable_alloc);
+
+       /*
+        * Map the linear alias of the [_text, _etext) interval as
+        * read-only/non-executable. This makes the contents of the
+        * region accessible to subsystems such as hibernate, but
+        * protects it from inadvertent modification or execution.
+        */
+       __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start),
+                            kernel_end - kernel_start, PAGE_KERNEL_RO,
+                            early_pgtable_alloc);
  }
-#else
-static void __init __map_memblock(phys_addr_t start, phys_addr_t end)
-{
-       create_mapping(start, __phys_to_virt(start), end - start,
-                       PAGE_KERNEL_EXEC);
-}
-#endif
  
-static void __init map_mem(void)
+static void __init map_mem(pgd_t *pgd)
  {
         struct memblock_region *reg;
-       phys_addr_t limit;
-
-       /*
-        * Temporarily limit the memblock range. We need to do this as
-        * create_mapping requires puds, pmds and ptes to be allocated from
-        * memory addressable from the initial direct kernel mapping.
-        *
-        * The initial direct kernel mapping, located at swapper_pg_dir, gives
-        * us PUD_SIZE (with SECTION maps) or PMD_SIZE (without SECTION maps,
-        * memory starting from PHYS_OFFSET (which must be aligned to 2MB as
-        * per Documentation/arm64/booting.txt).
-        */
-       limit = PHYS_OFFSET + SWAPPER_INIT_MAP_SIZE;
-       memblock_set_current_limit(limit);
  
         /* map all the memory banks */
         for_each_memblock(memory, reg) {
@@ -373,69 +439,94 @@ static void __init map_mem(void)
                 if (start >= end)
                         break;
  
-               if (ARM64_SWAPPER_USES_SECTION_MAPS) {
-                       /*
-                        * For the first memory bank align the start address and
-                        * current memblock limit to prevent create_mapping() from
-                        * allocating pte page tables from unmapped memory. With
-                        * the section maps, if the first block doesn't end on section
-                        * size boundary, create_mapping() will try to allocate a pte
-                        * page, which may be returned from an unmapped area.
-                        * When section maps are not used, the pte page table for the
-                        * current limit is already present in swapper_pg_dir.
-                        */
-                       if (start < limit)
-                               start = ALIGN(start, SECTION_SIZE);
-                       if (end < limit) {
-                               limit = end & SECTION_MASK;
-                               memblock_set_current_limit(limit);
-                       }
-               }
-               __map_memblock(start, end);
+               __map_memblock(pgd, start, end);
         }
-
-       /* Limit no longer required. */
-       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
  }
  
-static void __init fixup_executable(void)
+void mark_rodata_ro(void)
  {
-#ifdef CONFIG_DEBUG_RODATA
-       /* now that we are actually fully mapped, make the start/end more fine grained */
-       if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) {
-               unsigned long aligned_start = round_down(__pa(_stext),
-                                                        SWAPPER_BLOCK_SIZE);
+       unsigned long section_size;
  
-               create_mapping(aligned_start, __phys_to_virt(aligned_start),
-                               __pa(_stext) - aligned_start,
-                               PAGE_KERNEL);
-       }
+       section_size = (unsigned long)__start_rodata - (unsigned long)_text;
+       create_mapping_late(__pa(_text), (unsigned long)_text,
+                           section_size, PAGE_KERNEL_ROX);
+       /*
+        * mark .rodata as read only. Use _etext rather than __end_rodata to
+        * cover NOTES and EXCEPTION_TABLE.
+        */
+       section_size = (unsigned long)_etext - (unsigned long)__start_rodata;
+       create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
+                           section_size, PAGE_KERNEL_RO);
+}
  
-       if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) {
-               unsigned long aligned_end = round_up(__pa(__init_end),
-                                                         SWAPPER_BLOCK_SIZE);
-               create_mapping(__pa(__init_end), (unsigned long)__init_end,
-                               aligned_end - __pa(__init_end),
-                               PAGE_KERNEL);
-       }
-#endif
+void fixup_init(void)
+{
+       /*
+        * Unmap the __init region but leave the VM area in place. This
+        * prevents the region from being reused for kernel modules, which
+        * is not supported by kallsyms.
+        */
+       unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
  }
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void)
+static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
+                                     pgprot_t prot, struct vm_struct *vma)
  {
-       create_mapping_late(__pa(_stext), (unsigned long)_stext,
-                               (unsigned long)_etext - (unsigned long)_stext,
-                               PAGE_KERNEL_ROX);
+       phys_addr_t pa_start = __pa(va_start);
+       unsigned long size = va_end - va_start;
+
+       BUG_ON(!PAGE_ALIGNED(pa_start));
+       BUG_ON(!PAGE_ALIGNED(size));
+
+       __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
+                            early_pgtable_alloc);
  
+       vma->addr       = va_start;
+       vma->phys_addr  = pa_start;
+       vma->size       = size;
+       vma->flags      = VM_MAP;
+       vma->caller     = __builtin_return_address(0);
+
+       vm_area_add_early(vma);
  }
-#endif
  
-void fixup_init(void)
+/*
+ * Create fine-grained mappings for the kernel.
+ */
+static void __init map_kernel(pgd_t *pgd)
  {
-       create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin,
-                       (unsigned long)__init_end - (unsigned long)__init_begin,
-                       PAGE_KERNEL);
+       static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
+
+       map_kernel_segment(pgd, _text, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text);
+       map_kernel_segment(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata);
+       map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
+                          &vmlinux_init);
+       map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
+
+       if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
+               /*
+                * The fixmap falls in a separate pgd to the kernel, and doesn't
+                * live in the carveout for the swapper_pg_dir. We can simply
+                * re-use the existing dir for the fixmap.
+                */
+               set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
+                       *pgd_offset_k(FIXADDR_START));
+       } else if (CONFIG_PGTABLE_LEVELS > 3) {
+               /*
+                * The fixmap shares its top level pgd entry with the kernel
+                * mapping. This can really only occur when we are running
+                * with 16k/4 levels, so we can simply reuse the pud level
+                * entry instead.
+                */
+               BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
+               set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
+                       __pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
+               pud_clear_fixmap();
+       } else {
+               BUG();
+       }
+
+       kasan_copy_shadow(pgd);
  }
  
  /*
@@ -444,28 +535,35 @@ void fixup_init(void)
   */
  void __init paging_init(void)
  {
-       void *zero_page;
-
-       map_mem();
-       fixup_executable();
+       phys_addr_t pgd_phys = early_pgtable_alloc();
+       pgd_t *pgd = pgd_set_fixmap(pgd_phys);
  
-       /* allocate the zero page. */
-       zero_page = early_alloc(PAGE_SIZE);
+       map_kernel(pgd);
+       map_mem(pgd);
  
-       bootmem_init();
-
-       empty_zero_page = virt_to_page(zero_page);
+       /*
+        * We want to reuse the original swapper_pg_dir so we don't have to
+        * communicate the new address to non-coherent secondaries in
+        * secondary_entry, and so cpu_switch_mm can generate the address with
+        * adrp+add rather than a load from some global variable.
+        *
+        * To do this we need to go via a temporary pgd.
+        */
+       cpu_replace_ttbr1(__va(pgd_phys));
+       memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
+       cpu_replace_ttbr1(swapper_pg_dir);
  
-       /* Ensure the zero page is visible to the page table walker */
-       dsb(ishst);
+       pgd_clear_fixmap();
+       memblock_free(pgd_phys, PAGE_SIZE);
  
         /*
-        * TTBR0 is only used for the identity mapping at this stage. Make it
-        * point to zero page to avoid speculatively fetching new entries.
+        * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
+        * allocated with it.
          */
-       cpu_set_reserved_ttbr0();
-       local_flush_tlb_all();
-       cpu_set_default_tcr_t0sz();
+       memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
+                     SWAPPER_DIR_SIZE - PAGE_SIZE);
+
+       bootmem_init();
  }
  
  /*
@@ -552,21 +650,13 @@ void vmemmap_free(unsigned long start, unsigned long end)
  }
  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
  
-static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
-#if CONFIG_PGTABLE_LEVELS > 2
-static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss;
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss;
-#endif
-
  static inline pud_t * fixmap_pud(unsigned long addr)
  {
         pgd_t *pgd = pgd_offset_k(addr);
  
         BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
  
-       return pud_offset(pgd, addr);
+       return pud_offset_kimg(pgd, addr);
  }
  
  static inline pmd_t * fixmap_pmd(unsigned long addr)
@@ -575,16 +665,12 @@ static inline pmd_t * fixmap_pmd(unsigned long addr)
  
         BUG_ON(pud_none(*pud) || pud_bad(*pud));
  
-       return pmd_offset(pud, addr);
+       return pmd_offset_kimg(pud, addr);
  }
  
  static inline pte_t * fixmap_pte(unsigned long addr)
  {
-       pmd_t *pmd = fixmap_pmd(addr);
-
-       BUG_ON(pmd_none(*pmd) || pmd_bad(*pmd));
-
-       return pte_offset_kernel(pmd, addr);
+       return &bm_pte[pte_index(addr)];
  }
  
  void __init early_fixmap_init(void)
@@ -595,15 +681,26 @@ void __init early_fixmap_init(void)
         unsigned long addr = FIXADDR_START;
  
         pgd = pgd_offset_k(addr);
-       pgd_populate(&init_mm, pgd, bm_pud);
-       pud = pud_offset(pgd, addr);
+       if (CONFIG_PGTABLE_LEVELS > 3 &&
+           !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) {
+               /*
+                * We only end up here if the kernel mapping and the fixmap
+                * share the top level pgd entry, which should only happen on
+                * 16k/4 levels configurations.
+                */
+               BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
+               pud = pud_offset_kimg(pgd, addr);
+       } else {
+               pgd_populate(&init_mm, pgd, bm_pud);
+               pud = fixmap_pud(addr);
+       }
         pud_populate(&init_mm, pud, bm_pmd);
-       pmd = pmd_offset(pud, addr);
+       pmd = fixmap_pmd(addr);
         pmd_populate_kernel(&init_mm, pmd, bm_pte);
  
         /*
          * The boot-ioremap range spans multiple pmds, for which
-        * we are not preparted:
+        * we are not prepared:
          */
         BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
                      != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
@@ -642,11 +739,10 @@ void __set_fixmap(enum fixed_addresses idx,
         }
  }
  
-void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
+void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
  {
         const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
-       pgprot_t prot = PAGE_KERNEL_RO;
-       int size, offset;
+       int offset;
         void *dt_virt;
  
         /*
@@ -663,7 +759,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
         /*
          * Make sure that the FDT region can be mapped without the need to
          * allocate additional translation table pages, so that it is safe
-        * to call create_mapping() this early.
+        * to call create_mapping_noalloc() this early.
          *
          * On 64k pages, the FDT will be mapped using PTEs, so we need to
          * be in the same PMD as the rest of the fixmap.
@@ -679,21 +775,73 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
         dt_virt = (void *)dt_virt_base + offset;
  
         /* map the first chunk so we can read the size from the header */
-       create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
-                      SWAPPER_BLOCK_SIZE, prot);
+       create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
+                       dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
  
         if (fdt_magic(dt_virt) != FDT_MAGIC)
                 return NULL;
  
-       size = fdt_totalsize(dt_virt);
-       if (size > MAX_FDT_SIZE)
+       *size = fdt_totalsize(dt_virt);
+       if (*size > MAX_FDT_SIZE)
                 return NULL;
  
-       if (offset + size > SWAPPER_BLOCK_SIZE)
-               create_mapping(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
-                              round_up(offset + size, SWAPPER_BLOCK_SIZE), prot);
+       if (offset + *size > SWAPPER_BLOCK_SIZE)
+               create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
+                              round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
  
-       memblock_reserve(dt_phys, size);
+       return dt_virt;
+}
+
+void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
+{
+       void *dt_virt;
+       int size;
+
+       dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
+       if (!dt_virt)
+               return NULL;
  
+       memblock_reserve(dt_phys, size);
         return dt_virt;
  }
+
+int __init arch_ioremap_pud_supported(void)
+{
+       /* only 4k granule supports level 1 block mappings */
+       return IS_ENABLED(CONFIG_ARM64_4K_PAGES);
+}
+
+int __init arch_ioremap_pmd_supported(void)
+{
+       return 1;
+}
+
+int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
+{
+       BUG_ON(phys & ~PUD_MASK);
+       set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+       return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
+{
+       BUG_ON(phys & ~PMD_MASK);
+       set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
+       return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+       if (!pud_sect(*pud))
+               return 0;
+       pud_clear(pud);
+       return 1;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+       if (!pmd_sect(*pmd))
+               return 0;
+       pmd_clear(pmd);
+       return 1;
+}
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c

index cf6240741134ecbeece606dade39dc90478a08b6..ca6d268e3313229b0941ce7d33439c7a4c861120 100644 (file)
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -14,6 +14,7 @@
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/sched.h>
+#include <linux/vmalloc.h>
  
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
@@ -36,14 +37,32 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
         return 0;
  }
  
+/*
+ * This function assumes that the range is mapped with PAGE_SIZE pages.
+ */
+static int __change_memory_common(unsigned long start, unsigned long size,
+                               pgprot_t set_mask, pgprot_t clear_mask)
+{
+       struct page_change_data data;
+       int ret;
+
+       data.set_mask = set_mask;
+       data.clear_mask = clear_mask;
+
+       ret = apply_to_page_range(&init_mm, start, size, change_page_range,
+                                       &data);
+
+       flush_tlb_kernel_range(start, start + size);
+       return ret;
+}
+
  static int change_memory_common(unsigned long addr, int numpages,
                                 pgprot_t set_mask, pgprot_t clear_mask)
  {
         unsigned long start = addr;
         unsigned long size = PAGE_SIZE*numpages;
         unsigned long end = start + size;
-       int ret;
-       struct page_change_data data;
+       struct vm_struct *area;
  
         if (!PAGE_ALIGNED(addr)) {
                 start &= PAGE_MASK;
@@ -51,23 +70,29 @@ static int change_memory_common(unsigned long addr, int numpages,
                 WARN_ON_ONCE(1);
         }
  
-       if (start < MODULES_VADDR || start >= MODULES_END)
-               return -EINVAL;
-
-       if (end < MODULES_VADDR || end >= MODULES_END)
+       /*
+        * Kernel VA mappings are always live, and splitting live section
+        * mappings into page mappings may cause TLB conflicts. This means
+        * we have to ensure that changing the permission bits of the range
+        * we are operating on does not result in such splitting.
+        *
+        * Let's restrict ourselves to mappings created by vmalloc (or vmap).
+        * Those are guaranteed to consist entirely of page mappings, and
+        * splitting is never needed.
+        *
+        * So check whether the [addr, addr + size) interval is entirely
+        * covered by precisely one VM area that has the VM_ALLOC flag set.
+        */
+       area = find_vm_area((void *)addr);
+       if (!area ||
+           end > (unsigned long)area->addr + area->size ||
+           !(area->flags & VM_ALLOC))
                 return -EINVAL;
  
         if (!numpages)
                 return 0;
  
-       data.set_mask = set_mask;
-       data.clear_mask = clear_mask;
-
-       ret = apply_to_page_range(&init_mm, start, size, change_page_range,
-                                       &data);
-
-       flush_tlb_kernel_range(start, end);
-       return ret;
+       return __change_memory_common(start, size, set_mask, clear_mask);
  }
  
  int set_memory_ro(unsigned long addr, int numpages)
@@ -99,3 +124,19 @@ int set_memory_x(unsigned long addr, int numpages)
                                         __pgprot(PTE_PXN));
  }
  EXPORT_SYMBOL_GPL(set_memory_x);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       unsigned long addr = (unsigned long) page_address(page);
+
+       if (enable)
+               __change_memory_common(addr, PAGE_SIZE * numpages,
+                                       __pgprot(PTE_VALID),
+                                       __pgprot(0));
+       else
+               __change_memory_common(addr, PAGE_SIZE * numpages,
+                                       __pgprot(0),
+                                       __pgprot(PTE_VALID));
+}
+#endif
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c

index cb3ba1b812e74dcd1acbc167756d60da331d105f..ae11d4e03d0e68d7f0fe621f1c9d313fcab09127 100644 (file)
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
                 kmem_cache_free(pgd_cache, pgd);
  }
  
-static int __init pgd_cache_init(void)
+void __init pgd_cache_init(void)
  {
+       if (PGD_SIZE == PAGE_SIZE)
+               return;
+
         /*
          * Naturally aligned pgds required by the architecture.
          */
-       if (PGD_SIZE != PAGE_SIZE)
-               pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE,
-                                             SLAB_PANIC, NULL);
-       return 0;
+       pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE,
+                                     SLAB_PANIC, NULL);
  }
-core_initcall(pgd_cache_init);
diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S

index d69dffffaa8993bc7260c1df035b92c3b3867b48..984edcda1850f1be420f92be7666b36c687cc476 100644 (file)
--- a/arch/arm64/mm/proc-macros.S
+++ b/arch/arm64/mm/proc-macros.S
@@ -74,3 +74,25 @@
         msr     pmuserenr_el0, xzr              // Disable PMU access from EL0
  9000:
         .endm
+
+/*
+ * Macro to perform a data cache maintenance for the interval
+ * [kaddr, kaddr + size)
+ *
+ *     op:             operation passed to dc instruction
+ *     domain:         domain used in dsb instruction
+ *     kaddr:          starting virtual address of the region
+ *     size:           size of the region
+ *     Corrupts:       kaddr, size, tmp1, tmp2
+ */
+       .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+       dcache_line_size \tmp1, \tmp2
+       add     \size, \kaddr, \size
+       sub     \tmp2, \tmp1, #1
+       bic     \kaddr, \kaddr, \tmp2
+9998:  dc      \op, \kaddr
+       add     \kaddr, \kaddr, \tmp1
+       cmp     \kaddr, \size
+       b.lo    9998b
+       dsb     \domain
+       .endm
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S

index 18201e9e8cc71c22d017a9573cf722bdab63de74..5bb61de2320172c806ee58959e3f721b2b243a99 100644 (file)
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -23,13 +23,11 @@
  #include <asm/assembler.h>
  #include <asm/asm-offsets.h>
  #include <asm/hwcap.h>
-#include <asm/pgtable-hwdef.h>
  #include <asm/pgtable.h>
+#include <asm/pgtable-hwdef.h>
  #include <asm/cpufeature.h>
  #include <asm/alternative.h>
  
-#include "proc-macros.S"
-
  #ifdef CONFIG_ARM64_64K_PAGES
  #define TCR_TG_FLAGS   TCR_TG0_64K | TCR_TG1_64K
  #elif defined(CONFIG_ARM64_16K_PAGES)
@@ -66,62 +64,50 @@ ENTRY(cpu_do_suspend)
         mrs     x2, tpidr_el0
         mrs     x3, tpidrro_el0
         mrs     x4, contextidr_el1
-       mrs     x5, mair_el1
-       mrs     x6, cpacr_el1
-       mrs     x7, ttbr1_el1
-       mrs     x8, tcr_el1
-       mrs     x9, vbar_el1
-       mrs     x10, mdscr_el1
-       mrs     x11, oslsr_el1
-       mrs     x12, sctlr_el1
+       mrs     x5, cpacr_el1
+       mrs     x6, tcr_el1
+       mrs     x7, vbar_el1
+       mrs     x8, mdscr_el1
+       mrs     x9, oslsr_el1
+       mrs     x10, sctlr_el1
         stp     x2, x3, [x0]
-       stp     x4, x5, [x0, #16]
-       stp     x6, x7, [x0, #32]
-       stp     x8, x9, [x0, #48]
-       stp     x10, x11, [x0, #64]
-       str     x12, [x0, #80]
+       stp     x4, xzr, [x0, #16]
+       stp     x5, x6, [x0, #32]
+       stp     x7, x8, [x0, #48]
+       stp     x9, x10, [x0, #64]
         ret
  ENDPROC(cpu_do_suspend)
  
  /**
   * cpu_do_resume - restore CPU register context
   *
- * x0: Physical address of context pointer
- * x1: ttbr0_el1 to be restored
- *
- * Returns:
- *     sctlr_el1 value in x0
+ * x0: Address of context pointer
   */
  ENTRY(cpu_do_resume)
-       /*
-        * Invalidate local tlb entries before turning on MMU
-        */
-       tlbi    vmalle1
         ldp     x2, x3, [x0]
         ldp     x4, x5, [x0, #16]
-       ldp     x6, x7, [x0, #32]
-       ldp     x8, x9, [x0, #48]
-       ldp     x10, x11, [x0, #64]
-       ldr     x12, [x0, #80]
+       ldp     x6, x8, [x0, #32]
+       ldp     x9, x10, [x0, #48]
+       ldp     x11, x12, [x0, #64]
         msr     tpidr_el0, x2
         msr     tpidrro_el0, x3
         msr     contextidr_el1, x4
-       msr     mair_el1, x5
         msr     cpacr_el1, x6
-       msr     ttbr0_el1, x1
-       msr     ttbr1_el1, x7
-       tcr_set_idmap_t0sz x8, x7
+
+       /* Don't change t0sz here, mask those bits when restoring */
+       mrs     x5, tcr_el1
+       bfi     x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+
         msr     tcr_el1, x8
         msr     vbar_el1, x9
         msr     mdscr_el1, x10
+       msr     sctlr_el1, x12
         /*
          * Restore oslsr_el1 by writing oslar_el1
          */
         ubfx    x11, x11, #1, #1
         msr     oslar_el1, x11
         reset_pmuserenr_el0 x0                  // Disable PMU access from EL0
-       mov     x0, x12
-       dsb     nsh             // Make sure local tlb invalidation completed
         isb
         ret
  ENDPROC(cpu_do_resume)
@@ -152,7 +138,33 @@ alternative_else
  alternative_endif
  ENDPROC(cpu_do_switch_mm)
  
-       .section ".text.init", #alloc, #execinstr
+       .pushsection ".idmap.text", "ax"
+/*
+ * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd)
+ *
+ * This is the low-level counterpart to cpu_replace_ttbr1, and should not be
+ * called by anything else. It can only be executed from a TTBR0 mapping.
+ */
+ENTRY(idmap_cpu_replace_ttbr1)
+       mrs     x2, daif
+       msr     daifset, #0xf
+
+       adrp    x1, empty_zero_page
+       msr     ttbr1_el1, x1
+       isb
+
+       tlbi    vmalle1
+       dsb     nsh
+       isb
+
+       msr     ttbr1_el1, x0
+       isb
+
+       msr     daif, x2
+
+       ret
+ENDPROC(idmap_cpu_replace_ttbr1)
+       .popsection
  
  /*
   *     __cpu_setup
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig

index eb0249e3798112615fd5774d6f30229aa6241e53..2c86a4ef67424c0495c5438854d1794198a74b3f 100644 (file)
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -53,6 +53,7 @@ config IA64
         select MODULES_USE_ELF_RELA
         select ARCH_USE_CMPXCHG_LOCKREF
         select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_ARCH_HARDENED_USERCOPY
         default y
         help
           The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h

index 40c2027a2bf443a2534ac932df11a0a18de292b7..71e8c6f45a1d2e233e0978d677d5b4f2106882e1 100644 (file)
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -241,12 +241,16 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use
  static inline unsigned long
  __copy_to_user (void __user *to, const void *from, unsigned long count)
  {
+       check_object_size(from, count, true);
+
         return __copy_user(to, (__force void __user *) from, count);
  }
  
  static inline unsigned long
  __copy_from_user (void *to, const void __user *from, unsigned long count)
  {
+       check_object_size(to, count, false);
+
         return __copy_user((__force void __user *) to, from, count);
  }
  
@@ -258,8 +262,10 @@ __copy_from_user (void *to, const void __user *from, unsigned long count)
         const void *__cu_from = (from);                                                 \
         long __cu_len = (n);                                                            \
                                                                                         \
-       if (__access_ok(__cu_to, __cu_len, get_fs()))                                   \
-               __cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len);   \
+       if (__access_ok(__cu_to, __cu_len, get_fs())) {                                 \
+               check_object_size(__cu_from, __cu_len, true);                   \
+               __cu_len = __copy_user(__cu_to, (__force void __user *)  __cu_from, __cu_len);  \
+       }                                                                               \
         __cu_len;                                                                       \
  })
  
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig

index 729f89163bc32113dba77e309c8ce767ed3d15e8..d2256fa97ea0c7875df5d21bcfff23bb6f25a0d9 100644 (file)
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -11,6 +11,7 @@ config PARISC
         select RTC_DRV_GENERIC
         select INIT_ALL_POSSIBLE
         select BUG
+       select BUILDTIME_EXTABLE_SORT
         select HAVE_PERF_EVENTS
         select GENERIC_ATOMIC64 if !64BIT
         select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h

index b3069fd83468c5972f98d19f8f17dfa9bfb5e0cf..60e6f07b7e326bc4eb306105dcae477d5aa01f0e 100644 (file)
--- a/arch/parisc/include/asm/assembly.h
+++ b/arch/parisc/include/asm/assembly.h
@@ -523,7 +523,7 @@
          */
  #define ASM_EXCEPTIONTABLE_ENTRY(fault_addr, except_addr)      \
         .section __ex_table,"aw"                        !       \
-       ASM_ULONG_INSN  fault_addr, except_addr         !       \
+       .word (fault_addr - .), (except_addr - .)       !       \
         .previous
  
  
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h

index 3d0e17bcc8e905ece06053ae15b3ff5443e6b033..df0f52bd18b457f9efa1a0b9fc470b0803f50632 100644 (file)
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -22,6 +22,9 @@
  
  #define __read_mostly __attribute__((__section__(".data..read_mostly")))
  
+/* Read-only memory is marked before mark_rodata_ro() is called. */
+#define __ro_after_init        __read_mostly
+
  void parisc_cache_init(void);  /* initializes cache-flushing */
  void disable_sr_hashing_asm(int); /* low level support for above */
  void disable_sr_hashing(void);   /* turns off space register hashing */
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h

index 845272ce9cc587222e835131bb5ed96be0be03d4..7bd69bd43a018577d099f373346383900b1f0121 100644 (file)
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma
         }
  }
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
-#endif
-
  #include <asm/kmap_types.h>
  
  #define ARCH_HAS_KMAP
diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h

index 4ad51465890bdcc35b75c055a21a9dd616f96a1f..30eed2d6d8a8d7d2225b28b2e33f75c2dd0fbf52 100644 (file)
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -61,14 +61,15 @@ static inline long access_ok(int type, const void __user * addr,
   * use a 32bit (unsigned int) address here.
   */
  
+#define ARCH_HAS_RELATIVE_EXTABLE
  struct exception_table_entry {
-       unsigned long insn;     /* address of insn that is allowed to fault. */
-       unsigned long fixup;    /* fixup routine */
+       int insn;       /* relative address of insn that is allowed to fault. */
+       int fixup;      /* relative address of fixup routine */
  };
  
  #define ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr )\
         ".section __ex_table,\"aw\"\n"                     \
-       ASM_WORD_INSN #fault_addr ", " #except_addr "\n\t" \
+       ".word (" #fault_addr " - .), (" #except_addr " - .)\n\t" \
         ".previous\n"
  
  /*
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c

index f9064449908aad5ae01fbeb1c92e97399b0b828b..16dbe81c97c9005df3cbb91045ccb6ed20878930 100644 (file)
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -140,12 +140,6 @@ int fixup_exception(struct pt_regs *regs)
  {
         const struct exception_table_entry *fix;
  
-       /* If we only stored 32bit addresses in the exception table we can drop
-        * out if we faulted on a 64bit address. */
-       if ((sizeof(regs->iaoq[0]) > sizeof(fix->insn))
-               && (regs->iaoq[0] >> 32))
-                       return 0;
-
         fix = search_exception_tables(regs->iaoq[0]);
         if (fix) {
                 struct exception_data *d;
@@ -155,7 +149,8 @@ int fixup_exception(struct pt_regs *regs)
                 d->fault_space = regs->isr;
                 d->fault_addr = regs->ior;
  
-               regs->iaoq[0] = ((fix->fixup) & ~3);
+               regs->iaoq[0] = (unsigned long)&fix->fixup + fix->fixup;
+               regs->iaoq[0] &= ~3;
                 /*
                  * NOTE: In some cases the faulting instruction
                  * may be in the delay slot of a branch. We
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

index db49e0d796b1bf50642365aa92d63c5a83a07320..ec7b8f1e4822c66a4ced8bad6cd52f869ad413b2 100644 (file)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -160,6 +160,7 @@ config PPC
         select EDAC_ATOMIC_SCRUB
         select ARCH_HAS_DMA_SET_COHERENT_MASK
         select HAVE_ARCH_SECCOMP_FILTER
+       select HAVE_ARCH_HARDENED_USERCOPY
  
  config GENERIC_CSUM
         def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h

index a5ffe0207c16f96f06f6eb630b5ba049495dd2d1..db71448b9bb92455cb6b93343b2604f5eb78ebdb 100644 (file)
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -323,8 +323,10 @@ extern unsigned long __copy_tofrom_user(void __user *to,
  static inline unsigned long copy_from_user(void *to,
                 const void __user *from, unsigned long n)
  {
-       if (likely(access_ok(VERIFY_READ, from, n)))
+       if (likely(access_ok(VERIFY_READ, from, n))) {
+               check_object_size(to, n, false);
                 return __copy_tofrom_user((__force void __user *)to, from, n);
+       }
         memset(to, 0, n);
         return n;
  }
@@ -332,8 +334,10 @@ static inline unsigned long copy_from_user(void *to,
  static inline unsigned long copy_to_user(void __user *to,
                 const void *from, unsigned long n)
  {
-       if (access_ok(VERIFY_WRITE, to, n))
+       if (access_ok(VERIFY_WRITE, to, n)) {
+               check_object_size(from, n, true);
                 return __copy_tofrom_user(to, (__force void __user *)from, n);
+       }
         return n;
  }
  
@@ -374,6 +378,9 @@ static inline unsigned long __copy_from_user_inatomic(void *to,
                 if (ret == 0)
                         return 0;
         }
+
+       check_object_size(to, n, false);
+
         return __copy_tofrom_user((__force void __user *)to, from, n);
  }
  
@@ -400,6 +407,9 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
                 if (ret == 0)
                         return 0;
         }
+
+       check_object_size(from, n, true);
+
         return __copy_tofrom_user(to, (__force const void __user *)from, n);
  }
  
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index 3a55f493c7da72b9ee7ee7ad7c8c76d9e01bed8d..60530fd93d6d11368731fedfe145afb6826c7e59 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -117,6 +117,7 @@ config S390
         select HAVE_ALIGNED_STRUCT_PAGE if SLUB
         select HAVE_ARCH_AUDITSYSCALL
         select HAVE_ARCH_EARLY_PFN_TO_NID
+       select HAVE_ARCH_HARDENED_USERCOPY
         select HAVE_ARCH_JUMP_LABEL
         select HAVE_ARCH_SECCOMP_FILTER
         select HAVE_ARCH_SOFT_DIRTY
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c

index ae4de559e3a04288c6be111de684b3355f35109b..6986c20166f028bd859fa38af033d076a93eb3f3 100644 (file)
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
  
  unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
  {
+       check_object_size(to, n, false);
         if (static_branch_likely(&have_mvcos))
                 return copy_from_user_mvcos(to, from, n);
         return copy_from_user_mvcp(to, from, n);
@@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
  
  unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
  {
+       check_object_size(from, n, true);
         if (static_branch_likely(&have_mvcos))
                 return copy_to_user_mvcos(to, from, n);
         return copy_to_user_mvcs(to, from, n);
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig

index 56442d2d7bbca4181e8897bfa5d9719dbded43d1..3736be630113b4e1ae8fd26ff45cff32d03d47a9 100644 (file)
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -43,6 +43,7 @@ config SPARC
         select ODD_RT_SIGACTION
         select OLD_SIGSUSPEND
         select ARCH_HAS_SG_CHAIN
+       select HAVE_ARCH_HARDENED_USERCOPY
  
  config SPARC32
         def_bool !64BIT
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h

index dfb542c7cc714ea5b5cf19d6d3f853cb1a9c08ed..e7d6bb4cd619b362aa8fd84c71ecc3d7a18b9eb3 100644 (file)
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -313,20 +313,23 @@ unsigned long __copy_user(void __user *to, const void __user *from, unsigned lon
  
  static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
  {
-       if (n && __access_ok((unsigned long) to, n))
+       if (n && __access_ok((unsigned long) to, n)) {
+               check_object_size(from, n, true);
                 return __copy_user(to, (__force void __user *) from, n);
-       else
+       } else
                 return n;
  }
  
  static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
  {
+       check_object_size(from, n, true);
         return __copy_user(to, (__force void __user *) from, n);
  }
  
  static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
  {
-       if (n && __access_ok((unsigned long) from, n))
+       if (n && __access_ok((unsigned long) from, n)) {
+               check_object_size(to, n, false);
                 return __copy_user((__force void __user *) to, from, n);
         else {
                 memset(to, 0, n);
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h

index f428512481f9e52a52a6889180500a4ef6e33c87..a6847fc05a6dc7fbb33d6ebcddb7bd7e116b37cc 100644 (file)
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -207,6 +207,8 @@ unsigned long __must_check ___copy_from_user(void *to,
  static inline unsigned long __must_check
  copy_from_user(void *to, const void __user *from, unsigned long size)
  {
+       check_object_size(to, size, false);
+
         return ___copy_from_user(to, from, size);
  }
  #define __copy_from_user copy_from_user
@@ -217,6 +219,8 @@ unsigned long __must_check ___copy_to_user(void __user *to,
  static inline unsigned long __must_check
  copy_to_user(void __user *to, const void *from, unsigned long size)
  {
+       check_object_size(from, size, true);
+
         return ___copy_to_user(to, from, size);
  }
  #define __copy_to_user copy_to_user
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index 436639a316248532dee15c0ab527346c02448788..924bbffc56f090081e451ffd37ae23552e597076 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,7 @@ config X86
         select HAVE_ALIGNED_STRUCT_PAGE         if SLUB
         select HAVE_AOUT                        if X86_32
         select HAVE_ARCH_AUDITSYSCALL
+       select HAVE_ARCH_HARDENED_USERCOPY
         select HAVE_ARCH_HUGE_VMAP              if X86_64 || X86_PAE
         select HAVE_ARCH_JUMP_LABEL
         select HAVE_ARCH_KASAN                  if X86_64 && SPARSEMEM_VMEMMAP
@@ -86,7 +87,7 @@ config X86
         select HAVE_ARCH_SOFT_DIRTY             if X86_64
         select HAVE_ARCH_TRACEHOOK
         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-       select HAVE_BPF_JIT                     if X86_64
+       select HAVE_ARCH_WITHIN_STACK_FRAMES
         select HAVE_CC_STACKPROTECTOR
         select HAVE_CMPXCHG_DOUBLE
         select HAVE_CMPXCHG_LOCAL
@@ -289,6 +290,9 @@ config ARCH_SUPPORTS_UPROBES
  config FIX_EARLYCON_MEM
         def_bool y
  
+config DEBUG_RODATA
+       def_bool y
+
  config PGTABLE_LEVELS
         int
         default 4 if X86_64
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index 137dfa96aa14e1c8c3a9c1d7b50b21d910cabae4..1f6c306a9a009bc442ed1c1677ac5b900edbcf66 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -91,28 +91,16 @@ config EFI_PGT_DUMP
           issues with the mapping of the EFI runtime regions into that
           table.
  
-config DEBUG_RODATA
-       bool "Write protect kernel read-only data structures"
-       default y
-       depends on DEBUG_KERNEL
-       ---help---
-         Mark the kernel read-only data as write-protected in the pagetables,
-         in order to catch accidental (and incorrect) writes to such const
-         data. This is recommended so that we can catch kernel bugs sooner.
-         If in doubt, say "Y".
-
  config DEBUG_RODATA_TEST
-       bool "Testcase for the DEBUG_RODATA feature"
-       depends on DEBUG_RODATA
+       bool "Testcase for the marking rodata read-only"
         default y
         ---help---
-         This option enables a testcase for the DEBUG_RODATA
-         feature as well as for the change_page_attr() infrastructure.
+         This option enables a testcase for setting rodata read-only
+         as well as for the change_page_attr() infrastructure.
           If in doubt, say "N"
  
  config DEBUG_WX
         bool "Warn on W+X mappings at boot"
-       depends on DEBUG_RODATA
         select X86_PTDUMP_CORE
         ---help---
           Generate a warning if any W+X mappings are found at boot.
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h

index 0224987556ce80bd606063b56ef124b0857a3f44..3f69326ed545719aedd9e11b1be0fdea104136bc 100644 (file)
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
         fprintf(outfile, "#include <asm/vdso.h>\n");
         fprintf(outfile, "\n");
         fprintf(outfile,
-               "static unsigned char raw_data[%lu] __page_aligned_data = {",
+               "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
                 mapping_size);
         for (j = 0; j < stripped_len; j++) {
                 if (j % 10 == 0)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h

index e63aa38e85fb23375ecefa40efe8cd9d76da6c43..61518cf79437679788bd41a1ee1c942aea29a8cb 100644 (file)
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -91,16 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size);
  
  #define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
  
-#ifdef CONFIG_DEBUG_RODATA
-void mark_rodata_ro(void);
  extern const int rodata_test_data;
  extern int kernel_set_to_readonly;
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
-#else
-static inline void set_kernel_text_rw(void) { }
-static inline void set_kernel_text_ro(void) { }
-#endif
  
  #ifdef CONFIG_DEBUG_RODATA_TEST
  int rodata_test(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h

index 0010c78c4998cf0702299ea2f8a9229e09bb6438..08b1f2f6ea50c186933d0d9e79c82da0b9da4f3f 100644 (file)
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -25,6 +25,8 @@
  #define EFI32_LOADER_SIGNATURE "EL32"
  #define EFI64_LOADER_SIGNATURE "EL64"
  
+#define MAX_CMDLINE_ADDRESS    UINT_MAX
+
  #ifdef CONFIG_X86_32
  
  
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h

index c1adf33fdd0d6f70f055b9a056bc7787bda7635e..bc62e7cbf1b1f883fc9acb0a145305384a3bf062 100644 (file)
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void)
  }
  #endif /* CONFIG_KVM_GUEST */
  
-#ifdef CONFIG_DEBUG_RODATA
  #define KVM_HYPERCALL \
          ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
-#else
-/* On AMD processors, vmcall will generate a trap that we will
- * then rewrite to the appropriate instruction.
- */
-#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
-#endif
  
  /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
   * instruction.  The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h

index 0a5242428659045cfb439d3045593cc1c63aad96..13b6cdd0af57049468e47ef884e6eccba49dca0e 100644 (file)
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -7,7 +7,7 @@
  extern char __brk_base[], __brk_limit[];
  extern struct exception_table_entry __stop___ex_table[];
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
  extern char __end_rodata_hpage_align[];
  #endif
  
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index c7b551028740f18a5360070a6ccb5dc9ad714c5e..0c977fc124a77b7a766cce85de4c9e130d699689 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -177,6 +177,50 @@ static inline unsigned long current_stack_pointer(void)
         return sp;
  }
  
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ *              1 if within a frame
+ *             -1 if placed across a frame boundary (or outside stack)
+ *              0 unable to determine (no frame pointers, etc)
+ */
+static inline int arch_within_stack_frames(const void * const stack,
+                                          const void * const stackend,
+                                          const void *obj, unsigned long len)
+{
+#if defined(CONFIG_FRAME_POINTER)
+       const void *frame = NULL;
+       const void *oldframe;
+
+       oldframe = __builtin_frame_address(1);
+       if (oldframe)
+               frame = __builtin_frame_address(2);
+       /*
+        * low ----------------------------------------------> high
+        * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+        *                     ^----------------^
+        *               allow copies only within here
+        */
+       while (stack <= frame && frame < stackend) {
+               /*
+                * If obj + len extends past the last frame, this
+                * check won't pass and the next frame will be 0,
+                * causing us to bail out and correctly report
+                * the copy as invalid.
+                */
+               if (obj + len <= frame)
+                       return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
+               oldframe = frame;
+               frame = *(const void * const *)frame;
+       }
+       return -1;
+#else
+       return 0;
+#endif
+}
+
  #else /* !__ASSEMBLY__ */
  
  #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h

index b8c75f3aade8330ed9ac63bedc8c056adf21043d..f0bb7c1f7d199413abc9280a19150912096d6aa9 100644 (file)
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -134,6 +134,9 @@ extern int __get_user_4(void);
  extern int __get_user_8(void);
  extern int __get_user_bad(void);
  
+#define __uaccess_begin() stac()
+#define __uaccess_end()   clac()
+
  /*
   * This is a type: either unsigned long, if the argument fits into
   * that type, or otherwise unsigned long long.
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
  
  #ifdef CONFIG_X86_32
  #define __put_user_asm_u64(x, addr, err, errret)                       \
-       asm volatile(ASM_STAC "\n"                                      \
+       asm volatile("\n"                                               \
                      "1:        movl %%eax,0(%2)\n"                     \
                      "2:        movl %%edx,4(%2)\n"                     \
-                    "3: " ASM_CLAC "\n"                                \
+                    "3:"                                               \
                      ".section .fixup,\"ax\"\n"                         \
                      "4:        movl %3,%0\n"                           \
                      "  jmp 3b\n"                                       \
@@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
                      : "A" (x), "r" (addr), "i" (errret), "0" (err))
  
  #define __put_user_asm_ex_u64(x, addr)                                 \
-       asm volatile(ASM_STAC "\n"                                      \
+       asm volatile("\n"                                               \
                      "1:        movl %%eax,0(%1)\n"                     \
                      "2:        movl %%edx,4(%1)\n"                     \
-                    "3: " ASM_CLAC "\n"                                \
+                    "3:"                                               \
                      _ASM_EXTABLE_EX(1b, 2b)                            \
                      _ASM_EXTABLE_EX(2b, 3b)                            \
                      : : "A" (x), "r" (addr))
@@ -304,6 +307,10 @@ do {                                                                       \
         }                                                               \
  } while (0)
  
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
  #define __put_user_size_ex(x, ptr, size)                               \
  do {                                                                   \
         __chk_user_ptr(ptr);                                            \
@@ -358,9 +365,9 @@ do {                                                                        \
  } while (0)
  
  #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret)      \
-       asm volatile(ASM_STAC "\n"                                      \
+       asm volatile("\n"                                               \
                      "1:        mov"itype" %2,%"rtype"1\n"              \
-                    "2: " ASM_CLAC "\n"                                \
+                    "2:\n"                                             \
                      ".section .fixup,\"ax\"\n"                         \
                      "3:        mov %3,%0\n"                            \
                      "  xor"itype" %"rtype"1,%"rtype"1\n"               \
@@ -370,6 +377,10 @@ do {                                                                       \
                      : "=r" (err), ltype(x)                             \
                      : "m" (__m(addr)), "i" (errret), "0" (err))
  
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
  #define __get_user_size_ex(x, ptr, size)                               \
  do {                                                                   \
         __chk_user_ptr(ptr);                                            \
@@ -400,7 +411,9 @@ do {                                                                        \
  #define __put_user_nocheck(x, ptr, size)                       \
  ({                                                             \
         int __pu_err;                                           \
+       __uaccess_begin();                                      \
         __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
+       __uaccess_end();                                        \
         __builtin_expect(__pu_err, 0);                          \
  })
  
@@ -408,7 +421,9 @@ do {                                                                        \
  ({                                                                     \
         int __gu_err;                                                   \
         unsigned long __gu_val;                                         \
+       __uaccess_begin();                                              \
         __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT);    \
+       __uaccess_end();                                                \
         (x) = (__force __typeof__(*(ptr)))__gu_val;                     \
         __builtin_expect(__gu_err, 0);                                  \
  })
@@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; };
   * aliasing issues.
   */
  #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret)      \
-       asm volatile(ASM_STAC "\n"                                      \
+       asm volatile("\n"                                               \
                      "1:        mov"itype" %"rtype"1,%2\n"              \
-                    "2: " ASM_CLAC "\n"                                \
+                    "2:\n"                                             \
                      ".section .fixup,\"ax\"\n"                         \
                      "3:        mov %3,%0\n"                            \
                      "  jmp 2b\n"                                       \
@@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; };
   */
  #define uaccess_try    do {                                            \
         current_thread_info()->uaccess_err = 0;                         \
-       stac();                                                         \
+       __uaccess_begin();                                              \
         barrier();
  
  #define uaccess_catch(err)                                             \
-       clac();                                                         \
+       __uaccess_end();                                                \
         (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);    \
  } while (0)
  
@@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void)
         __typeof__(ptr) __uval = (uval);                                \
         __typeof__(*(ptr)) __old = (old);                               \
         __typeof__(*(ptr)) __new = (new);                               \
+       __uaccess_begin();                                              \
         switch (size) {                                                 \
         case 1:                                                         \
         {                                                               \
-               asm volatile("\t" ASM_STAC "\n"                         \
+               asm volatile("\n"                                       \
                         "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n"          \
-                       "2:\t" ASM_CLAC "\n"                            \
+                       "2:\n"                                          \
                         "\t.section .fixup, \"ax\"\n"                   \
                         "3:\tmov     %3, %0\n"                          \
                         "\tjmp     2b\n"                                \
@@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void)
         }                                                               \
         case 2:                                                         \
         {                                                               \
-               asm volatile("\t" ASM_STAC "\n"                         \
+               asm volatile("\n"                                       \
                         "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n"          \
-                       "2:\t" ASM_CLAC "\n"                            \
+                       "2:\n"                                          \
                         "\t.section .fixup, \"ax\"\n"                   \
                         "3:\tmov     %3, %0\n"                          \
                         "\tjmp     2b\n"                                \
@@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void)
         }                                                               \
         case 4:                                                         \
         {                                                               \
-               asm volatile("\t" ASM_STAC "\n"                         \
+               asm volatile("\n"                                       \
                         "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"          \
-                       "2:\t" ASM_CLAC "\n"                            \
+                       "2:\n"                                          \
                         "\t.section .fixup, \"ax\"\n"                   \
                         "3:\tmov     %3, %0\n"                          \
                         "\tjmp     2b\n"                                \
@@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void)
                 if (!IS_ENABLED(CONFIG_X86_64))                         \
                         __cmpxchg_wrong_size();                         \
                                                                         \
-               asm volatile("\t" ASM_STAC "\n"                         \
+               asm volatile("\n"                                       \
                         "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n"          \
-                       "2:\t" ASM_CLAC "\n"                            \
+                       "2:\n"                                          \
                         "\t.section .fixup, \"ax\"\n"                   \
                         "3:\tmov     %3, %0\n"                          \
                         "\tjmp     2b\n"                                \
@@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void)
         default:                                                        \
                 __cmpxchg_wrong_size();                                 \
         }                                                               \
+       __uaccess_end();                                                \
         *__uval = __old;                                                \
         __ret;                                                          \
  })
@@ -714,9 +731,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
          * case, and do only runtime checking for non-constant sizes.
          */
  
-       if (likely(sz < 0 || sz >= n))
+       if (likely(sz < 0 || sz >= n)) {
+               check_object_size(to, n, false);
                 n = _copy_from_user(to, from, n);
-       else if(__builtin_constant_p(n))
+       } else if (__builtin_constant_p(n))
                 copy_from_user_overflow();
         else
                 __copy_from_user_overflow(sz, n);
@@ -732,9 +750,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
         might_fault();
  
         /* See the comment in copy_from_user() above. */
-       if (likely(sz < 0 || sz >= n))
+       if (likely(sz < 0 || sz >= n)) {
+               check_object_size(from, n, true);
                 n = _copy_to_user(to, from, n);
-       else if(__builtin_constant_p(n))
+       } else if (__builtin_constant_p(n))
                 copy_to_user_overflow();
         else
                 __copy_to_user_overflow(sz, n);
@@ -745,5 +764,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
  #undef __copy_from_user_overflow
  #undef __copy_to_user_overflow
  
+/*
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
+ *
+ * user_access_begin()/user_access_end() are plain aliases for
+ * __uaccess_begin()/__uaccess_end().
+ *
+ * unsafe_put_user(x, ptr, err_label) stores x to *ptr and
+ * unsafe_get_user(x, ptr, err_label) loads *ptr into x.  Both go
+ * through __put_user_size()/__get_user_size() with a width of
+ * sizeof(*(ptr)) and -EFAULT as the error value.  Neither macro
+ * yields an error code: a non-zero error makes it "goto err_label",
+ * so that label must be visible at the point of use.
+ *
+ * unsafe_get_user() assigns x before it tests the error, so x is
+ * written on the failing path as well.
+ */
+#define user_access_begin()    __uaccess_begin()
+#define user_access_end()      __uaccess_end()
+
+#define unsafe_put_user(x, ptr, err_label)                                     \
+do {                                                                           \
+       int __pu_err;                                                           \
+       __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT);         \
+       if (unlikely(__pu_err)) goto err_label;                                 \
+} while (0)
+
+#define unsafe_get_user(x, ptr, err_label)                                     \
+do {                                                                           \
+       int __gu_err;                                                           \
+       unsigned long __gu_val;                                                 \
+       __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT);    \
+       (x) = (__force __typeof__(*(ptr)))__gu_val;                             \
+       if (unlikely(__gu_err)) goto err_label;                                 \
+} while (0)
+
  #endif /* _ASM_X86_UACCESS_H */
  
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h

index f5dcb5204dcd5b27e8b8e9a1b87612a28cda10c6..7d3bdd1ed6977b5e1f69dc8ba3e3d6cfa8f861a3 100644 (file)
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,38 +33,11 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
   * the specified block with access_ok() before calling this function.
   * The caller should also make sure he pins the user space address
   * so that we don't result in page fault and sleep.
- *
- * Here we special-case 1, 2 and 4-byte copy_*_user invocations.  On a fault
- * we return the initial request size (1, 2 or 4), as copy_*_user should do.
- * If a store crosses a page boundary and gets a fault, the x86 will not write
- * anything, so this is accurate.
   */
-
  static __always_inline unsigned long __must_check
  __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
  {
-       if (__builtin_constant_p(n)) {
-               unsigned long ret;
-
-               switch (n) {
-               case 1:
-                       __put_user_size(*(u8 *)from, (u8 __user *)to,
-                                       1, ret, 1);
-                       return ret;
-               case 2:
-                       __put_user_size(*(u16 *)from, (u16 __user *)to,
-                                       2, ret, 2);
-                       return ret;
-               case 4:
-                       __put_user_size(*(u32 *)from, (u32 __user *)to,
-                                       4, ret, 4);
-                       return ret;
-               case 8:
-                       __put_user_size(*(u64 *)from, (u64 __user *)to,
-                                       8, ret, 8);
-                       return ret;
-               }
-       }
+       check_object_size(from, n, true);
         return __copy_to_user_ll(to, from, n);
  }
  
@@ -93,26 +66,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
  static __always_inline unsigned long
  __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
  {
-       /* Avoid zeroing the tail if the copy fails..
-        * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
-        * but as the zeroing behaviour is only significant when n is not
-        * constant, that shouldn't be a problem.
-        */
-       if (__builtin_constant_p(n)) {
-               unsigned long ret;
-
-               switch (n) {
-               case 1:
-                       __get_user_size(*(u8 *)to, from, 1, ret, 1);
-                       return ret;
-               case 2:
-                       __get_user_size(*(u16 *)to, from, 2, ret, 2);
-                       return ret;
-               case 4:
-                       __get_user_size(*(u32 *)to, from, 4, ret, 4);
-                       return ret;
-               }
-       }
         return __copy_from_user_ll_nozero(to, from, n);
  }
  
@@ -143,18 +96,25 @@ static __always_inline unsigned long
  __copy_from_user(void *to, const void __user *from, unsigned long n)
  {
         might_fault();
+       check_object_size(to, n, false);
         if (__builtin_constant_p(n)) {
                 unsigned long ret;
  
                 switch (n) {
                 case 1:
+                       __uaccess_begin();
                         __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __uaccess_end();
                         return ret;
                 case 2:
+                       __uaccess_begin();
                         __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __uaccess_end();
                         return ret;
                 case 4:
+                       __uaccess_begin();
                         __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __uaccess_end();
                         return ret;
                 }
         }
@@ -170,13 +130,19 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
  
                 switch (n) {
                 case 1:
+                       __uaccess_begin();
                         __get_user_size(*(u8 *)to, from, 1, ret, 1);
+                       __uaccess_end();
                         return ret;
                 case 2:
+                       __uaccess_begin();
                         __get_user_size(*(u16 *)to, from, 2, ret, 2);
+                       __uaccess_end();
                         return ret;
                 case 4:
+                       __uaccess_begin();
                         __get_user_size(*(u32 *)to, from, 4, ret, 4);
+                       __uaccess_end();
                         return ret;
                 }
         }
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h

index f2f9b39b274ab0c2f81ab6b388f78ba5c881ec5e..2957c8237c28d6e46283bca9dd58a6ec528f1913 100644 (file)
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -53,38 +53,53 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
  {
         int ret = 0;
  
+       check_object_size(dst, size, false);
         if (!__builtin_constant_p(size))
                 return copy_user_generic(dst, (__force void *)src, size);
         switch (size) {
-       case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+       case 1:
+               __uaccess_begin();
+               __get_user_asm(*(u8 *)dst, (u8 __user *)src,
                               ret, "b", "b", "=q", 1);
+               __uaccess_end();
                 return ret;
-       case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+       case 2:
+               __uaccess_begin();
+               __get_user_asm(*(u16 *)dst, (u16 __user *)src,
                               ret, "w", "w", "=r", 2);
+               __uaccess_end();
                 return ret;
-       case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+       case 4:
+               __uaccess_begin();
+               __get_user_asm(*(u32 *)dst, (u32 __user *)src,
                               ret, "l", "k", "=r", 4);
+               __uaccess_end();
                 return ret;
-       case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+       case 8:
+               __uaccess_begin();
+               __get_user_asm(*(u64 *)dst, (u64 __user *)src,
                               ret, "q", "", "=r", 8);
+               __uaccess_end();
                 return ret;
         case 10:
+               __uaccess_begin();
                 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
                                ret, "q", "", "=r", 10);
-               if (unlikely(ret))
-                       return ret;
-               __get_user_asm(*(u16 *)(8 + (char *)dst),
-                              (u16 __user *)(8 + (char __user *)src),
-                              ret, "w", "w", "=r", 2);
+               if (likely(!ret))
+                       __get_user_asm(*(u16 *)(8 + (char *)dst),
+                                      (u16 __user *)(8 + (char __user *)src),
+                                      ret, "w", "w", "=r", 2);
+               __uaccess_end();
                 return ret;
         case 16:
+               __uaccess_begin();
                 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
                                ret, "q", "", "=r", 16);
-               if (unlikely(ret))
-                       return ret;
-               __get_user_asm(*(u64 *)(8 + (char *)dst),
-                              (u64 __user *)(8 + (char __user *)src),
-                              ret, "q", "", "=r", 8);
+               if (likely(!ret))
+                       __get_user_asm(*(u64 *)(8 + (char *)dst),
+                                      (u64 __user *)(8 + (char __user *)src),
+                                      ret, "q", "", "=r", 8);
+               __uaccess_end();
                 return ret;
         default:
                 return copy_user_generic(dst, (__force void *)src, size);
@@ -103,38 +118,55 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
  {
         int ret = 0;
  
+       check_object_size(src, size, true);
         if (!__builtin_constant_p(size))
                 return copy_user_generic((__force void *)dst, src, size);
         switch (size) {
-       case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+       case 1:
+               __uaccess_begin();
+               __put_user_asm(*(u8 *)src, (u8 __user *)dst,
                               ret, "b", "b", "iq", 1);
+               __uaccess_end();
                 return ret;
-       case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+       case 2:
+               __uaccess_begin();
+               __put_user_asm(*(u16 *)src, (u16 __user *)dst,
                               ret, "w", "w", "ir", 2);
+               __uaccess_end();
                 return ret;
-       case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+       case 4:
+               __uaccess_begin();
+               __put_user_asm(*(u32 *)src, (u32 __user *)dst,
                               ret, "l", "k", "ir", 4);
+               __uaccess_end();
                 return ret;
-       case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+       case 8:
+               __uaccess_begin();
+               __put_user_asm(*(u64 *)src, (u64 __user *)dst,
                               ret, "q", "", "er", 8);
+               __uaccess_end();
                 return ret;
         case 10:
+               __uaccess_begin();
                 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
                                ret, "q", "", "er", 10);
-               if (unlikely(ret))
-                       return ret;
-               asm("":::"memory");
-               __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
-                              ret, "w", "w", "ir", 2);
+               if (likely(!ret)) {
+                       asm("":::"memory");
+                       __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+                                      ret, "w", "w", "ir", 2);
+               }
+               __uaccess_end();
                 return ret;
         case 16:
+               __uaccess_begin();
                 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
                                ret, "q", "", "er", 16);
-               if (unlikely(ret))
-                       return ret;
-               asm("":::"memory");
-               __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
-                              ret, "q", "", "er", 8);
+               if (likely(!ret)) {
+                       asm("":::"memory");
+                       __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+                                      ret, "q", "", "er", 8);
+               }
+               __uaccess_end();
                 return ret;
         default:
                 return copy_user_generic((__force void *)dst, src, size);
@@ -160,39 +192,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
         switch (size) {
         case 1: {
                 u8 tmp;
+               __uaccess_begin();
                 __get_user_asm(tmp, (u8 __user *)src,
                                ret, "b", "b", "=q", 1);
                 if (likely(!ret))
                         __put_user_asm(tmp, (u8 __user *)dst,
                                        ret, "b", "b", "iq", 1);
+               __uaccess_end();
                 return ret;
         }
         case 2: {
                 u16 tmp;
+               __uaccess_begin();
                 __get_user_asm(tmp, (u16 __user *)src,
                                ret, "w", "w", "=r", 2);
                 if (likely(!ret))
                         __put_user_asm(tmp, (u16 __user *)dst,
                                        ret, "w", "w", "ir", 2);
+               __uaccess_end();
                 return ret;
         }
  
         case 4: {
                 u32 tmp;
+               __uaccess_begin();
                 __get_user_asm(tmp, (u32 __user *)src,
                                ret, "l", "k", "=r", 4);
                 if (likely(!ret))
                         __put_user_asm(tmp, (u32 __user *)dst,
                                        ret, "l", "k", "ir", 4);
+               __uaccess_end();
                 return ret;
         }
         case 8: {
                 u64 tmp;
+               __uaccess_begin();
                 __get_user_asm(tmp, (u64 __user *)src,
                                ret, "q", "", "=r", 8);
                 if (likely(!ret))
                         __put_user_asm(tmp, (u64 __user *)dst,
                                        ret, "q", "", "er", 8);
+               __uaccess_end();
                 return ret;
         }
         default:
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c

index 311bcf338f07e75a48115ef4edf3df5f748b6940..eb6bd34582c692f69eb48fc50c44a9b502becd4e 100644 (file)
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  static unsigned long text_ip_addr(unsigned long ip)
  {
         /*
-        * On x86_64, kernel text mappings are mapped read-only with
-        * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
-        * of the kernel text mapping to modify the kernel text.
+        * On x86_64, kernel text mappings are mapped read-only, so we use
+        * the kernel identity mapping instead of the kernel text mapping
+        * to modify the kernel text.
          *
          * For 32bit kernels, these mappings are same and we can use
          * kernel identity mapping to modify code.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c

index 44256a62702b2c51077fc0b8b82a904ed122b9f6..ed15cd486d06347626c080a0081e3eed01ac9128 100644 (file)
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
  int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
  {
         int err;
-#ifdef CONFIG_DEBUG_RODATA
         char opc[BREAK_INSTR_SIZE];
-#endif /* CONFIG_DEBUG_RODATA */
  
         bpt->type = BP_BREAKPOINT;
         err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
@@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
                 return err;
         err = probe_kernel_write((char *)bpt->bpt_addr,
                                  arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
-#ifdef CONFIG_DEBUG_RODATA
         if (!err)
                 return err;
         /*
@@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
         if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
                 return -EINVAL;
         bpt->type = BP_POKE_BREAKPOINT;
-#endif /* CONFIG_DEBUG_RODATA */
+
         return err;
  }
  
  int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
  {
-#ifdef CONFIG_DEBUG_RODATA
         int err;
         char opc[BREAK_INSTR_SIZE];
  
@@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
         if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
                 goto knl_write;
         return err;
+
  knl_write:
-#endif /* CONFIG_DEBUG_RODATA */
         return probe_kernel_write((char *)bpt->bpt_addr,
                                   (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
  }
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c

index 3f92ce07e525fd41167a64f9d7c4df1bfbe673bf..27538f183c3b15d9d59e225f2612b12660ee979b 100644 (file)
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -142,7 +142,6 @@ static int test_NX(void)
          * by the error message
          */
  
-#ifdef CONFIG_DEBUG_RODATA
         /* Test 3: Check if the .rodata section is executable */
         if (rodata_test_data != 0xC3) {
                 printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
@@ -151,7 +150,6 @@ static int test_NX(void)
                 printk(KERN_ERR "test_nx: .rodata section is executable\n");
                 ret = -ENODEV;
         }
-#endif
  
  #if 0
         /* Test 4: Check if the .data section of a module is executable */
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c

index 5ecbfe5099dad68e140b85ffdaefc75818afc6d9..cb4a01b41e277887b6769e4a17d60cb4139f9864 100644 (file)
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -76,5 +76,5 @@ int rodata_test(void)
  }
  
  MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_DESCRIPTION("Testcase for marking rodata as read-only");
  MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index 74e4bf11f562e0354c227518421e2375ec16fafa..fe133b710befa9a7495178374076e465ec61668f 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,29 +41,28 @@ ENTRY(phys_startup_64)
  jiffies_64 = jiffies;
  #endif
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
  /*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
   *
   * However, kernel identity mappings will have different RWX permissions
   * to the pages mapping to text and to the pages padding (which are freed) the
   * text section. Hence kernel identity mappings will be broken to smaller
   * pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
   */
-#define X64_ALIGN_DEBUG_RODATA_BEGIN   . = ALIGN(HPAGE_SIZE);
+#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
  
-#define X64_ALIGN_DEBUG_RODATA_END                             \
+#define X64_ALIGN_RODATA_END                                   \
                 . = ALIGN(HPAGE_SIZE);                          \
                 __end_rodata_hpage_align = .;
  
  #else
  
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X64_ALIGN_RODATA_BEGIN
+#define X64_ALIGN_RODATA_END
  
  #endif
  
@@ -112,13 +111,11 @@ SECTIONS
  
         EXCEPTION_TABLE(16) :text = 0x9090
  
-#if defined(CONFIG_DEBUG_RODATA)
         /* .text should occupy whole number of pages */
         . = ALIGN(PAGE_SIZE);
-#endif
-       X64_ALIGN_DEBUG_RODATA_BEGIN
+       X64_ALIGN_RODATA_BEGIN
         RO_DATA(PAGE_SIZE)
-       X64_ALIGN_DEBUG_RODATA_END
+       X64_ALIGN_RODATA_END
  
         /* Data */
         .data : AT(ADDR(.data) - LOAD_OFFSET) {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index cb4ef3de61f9ae9c95249876965a71a5d7b58cf8..2ebfbaf611424be1937c4e2288776d50d1c8ff9e 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void)
         return flag;
  }
  
-#ifdef CONFIG_DEBUG_RODATA
  const int rodata_test_data = 0xC3;
  EXPORT_SYMBOL_GPL(rodata_test_data);
  
@@ -960,5 +959,3 @@ void mark_rodata_ro(void)
         if (__supported_pte_mask & _PAGE_NX)
                 debug_checkwx();
  }
-#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index ec081fe0ce2c10246fec7126dd213aa6b4d75dbb..e08d141844ee81897dd80a76b4ca240b6270b680 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1062,7 +1062,6 @@ void __init mem_init(void)
         mem_init_print_info(NULL);
  }
  
-#ifdef CONFIG_DEBUG_RODATA
  const int rodata_test_data = 0xC3;
  EXPORT_SYMBOL_GPL(rodata_test_data);
  
@@ -1154,8 +1153,6 @@ void mark_rodata_ro(void)
         debug_checkwx();
  }
  
-#endif
-
  int kern_addr_valid(unsigned long addr)
  {
         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index b599a780a5a915fb5d1a5b03b1ba63489ab13a19..4540e8880cd925cd3bd933426a1d4fb43d7e5176 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -278,7 +278,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                    __pa_symbol(__end_rodata) >> PAGE_SHIFT))
                 pgprot_val(forbidden) |= _PAGE_RW;
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64)
         /*
          * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
          * kernel text mappings for the large page aligned text, rodata sections
diff --git a/backported-features b/backported-features

new file mode 100644 (file)

index 0000000..b680ed4
--- /dev/null
+++ b/backported-features
@@ -0,0 +1,14 @@
+               LSK backported features
+
+1, The kaslr and kaslr-pax_usercopy branches are based on LSK directly.
+       v4.4/topic/mm-kaslr
+       v4.4/topic/mm-kaslr-pax_usercopy
+
+2, Coresight and openCSD are used for the Juno board 'perf' tool implementation.
+       origin/v4.4/topic/coresight
+       origin/v4.4/topic/perf-opencsd-4.4-github
+
+3, OPTEE is based on LSK mainline, but isn't included in mainline.
+
+Feature introduction:
+https://wiki.linaro.org/lsk/features
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index 9d359e05fad74d0a2f2a9a64ef3aeeb40bb3a7d6..8161090a19708684224402c59bfdb55dbbc07ef4 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
  {
         struct gendisk *disk;
         struct blkcg_gq *blkg;
+       struct module *owner;
         unsigned int major, minor;
         int key_len, part, ret;
         char *body;
@@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
         if (!disk)
                 return -ENODEV;
         if (part) {
+               owner = disk->fops->owner;
                 put_disk(disk);
+               module_put(owner);
                 return -ENODEV;
         }
  
@@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                 ret = PTR_ERR(blkg);
                 rcu_read_unlock();
                 spin_unlock_irq(disk->queue->queue_lock);
+               owner = disk->fops->owner;
                 put_disk(disk);
+               module_put(owner);
                 /*
                  * If queue was bypassing, we should retry.  Do so after a
                  * short msleep().  It isn't strictly necessary but queue
@@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
  void blkg_conf_finish(struct blkg_conf_ctx *ctx)
         __releases(ctx->disk->queue->queue_lock) __releases(rcu)
  {
+       struct module *owner;
+
         spin_unlock_irq(ctx->disk->queue->queue_lock);
         rcu_read_unlock();
+       owner = ctx->disk->fops->owner;
         put_disk(ctx->disk);
+       module_put(owner);
  }
  EXPORT_SYMBOL_GPL(blkg_conf_finish);
  
diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile

index 33c1e18c41a4d467259fb438911418372b7e82a4..19837ef04d8ef21a355e22171117398b2f966f80 100644 (file)
--- a/drivers/base/power/opp/Makefile
+++ b/drivers/base/power/opp/Makefile
@@ -1,2 +1,3 @@
  ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
  obj-y                          += core.o cpu.o
+obj-$(CONFIG_DEBUG_FS)         += debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c

index f8580900c2739d9c4389c6aa961a8dd2cdb5306c..433b60092972d56abba55897158d6c22156cf631 100644 (file)
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -13,50 +13,52 @@
  
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
+#include <linux/clk.h>
  #include <linux/errno.h>
  #include <linux/err.h>
  #include <linux/slab.h>
  #include <linux/device.h>
  #include <linux/of.h>
  #include <linux/export.h>
+#include <linux/regulator/consumer.h>
  
  #include "opp.h"
  
  /*
- * The root of the list of all devices. All device_opp structures branch off
- * from here, with each device_opp containing the list of opp it supports in
+ * The root of the list of all opp-tables. All opp_table structures branch off
+ * from here, with each opp_table containing the list of opps it supports in
   * various states of availability.
   */
-static LIST_HEAD(dev_opp_list);
+static LIST_HEAD(opp_tables);
  /* Lock to allow exclusive modification to the device and opp lists */
-DEFINE_MUTEX(dev_opp_list_lock);
+DEFINE_MUTEX(opp_table_lock);
  
  #define opp_rcu_lockdep_assert()                                       \
  do {                                                                   \
         RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
-                               !lockdep_is_held(&dev_opp_list_lock),   \
-                          "Missing rcu_read_lock() or "                \
-                          "dev_opp_list_lock protection");             \
+                        !lockdep_is_held(&opp_table_lock),             \
+                        "Missing rcu_read_lock() or "                  \
+                        "opp_table_lock protection");                  \
  } while (0)
  
-static struct device_list_opp *_find_list_dev(const struct device *dev,
-                                             struct device_opp *dev_opp)
+static struct opp_device *_find_opp_dev(const struct device *dev,
+                                       struct opp_table *opp_table)
  {
-       struct device_list_opp *list_dev;
+       struct opp_device *opp_dev;
  
-       list_for_each_entry(list_dev, &dev_opp->dev_list, node)
-               if (list_dev->dev == dev)
-                       return list_dev;
+       list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+               if (opp_dev->dev == dev)
+                       return opp_dev;
  
         return NULL;
  }
  
-static struct device_opp *_managed_opp(const struct device_node *np)
+static struct opp_table *_managed_opp(const struct device_node *np)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
  
-       list_for_each_entry_rcu(dev_opp, &dev_opp_list, node) {
-               if (dev_opp->np == np) {
+       list_for_each_entry_rcu(opp_table, &opp_tables, node) {
+               if (opp_table->np == np) {
                         /*
                          * Multiple devices can point to the same OPP table and
                          * so will have same node-pointer, np.
@@ -64,7 +66,7 @@ static struct device_opp *_managed_opp(const struct device_node *np)
                          * But the OPPs will be considered as shared only if the
                          * OPP table contains a "opp-shared" property.
                          */
-                       return dev_opp->shared_opp ? dev_opp : NULL;
+                       return opp_table->shared_opp ? opp_table : NULL;
                 }
         }
  
@@ -72,24 +74,24 @@ static struct device_opp *_managed_opp(const struct device_node *np)
  }
  
  /**
- * _find_device_opp() - find device_opp struct using device pointer
- * @dev:       device pointer used to lookup device OPPs
+ * _find_opp_table() - find opp_table struct using device pointer
+ * @dev:       device pointer used to lookup OPP table
   *
- * Search list of device OPPs for one containing matching device. Does a RCU
- * reader operation to grab the pointer needed.
+ * Search the list of OPP tables for one containing a matching device. Does an
+ * RCU reader operation to grab the pointer needed.
   *
- * Return: pointer to 'struct device_opp' if found, otherwise -ENODEV or
+ * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
   * -EINVAL based on type of error.
   *
   * Locking: For readers, this function must be called under rcu_read_lock().
- * device_opp is a RCU protected pointer, which means that device_opp is valid
+ * opp_table is an RCU protected pointer, which means that opp_table is valid
   * as long as we are under RCU lock.
   *
- * For Writers, this function must be called with dev_opp_list_lock held.
+ * For Writers, this function must be called with opp_table_lock held.
   */
-struct device_opp *_find_device_opp(struct device *dev)
+struct opp_table *_find_opp_table(struct device *dev)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
  
         opp_rcu_lockdep_assert();
  
@@ -98,9 +100,9 @@ struct device_opp *_find_device_opp(struct device *dev)
                 return ERR_PTR(-EINVAL);
         }
  
-       list_for_each_entry_rcu(dev_opp, &dev_opp_list, node)
-               if (_find_list_dev(dev, dev_opp))
-                       return dev_opp;
+       list_for_each_entry_rcu(opp_table, &opp_tables, node)
+               if (_find_opp_dev(dev, opp_table))
+                       return opp_table;
  
         return ERR_PTR(-ENODEV);
  }
@@ -213,22 +215,98 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
   */
  unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         unsigned long clock_latency_ns;
  
         rcu_read_lock();
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp))
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table))
                 clock_latency_ns = 0;
         else
-               clock_latency_ns = dev_opp->clock_latency_ns_max;
+               clock_latency_ns = opp_table->clock_latency_ns_max;
  
         rcu_read_unlock();
         return clock_latency_ns;
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
  
+/**
+ * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max voltage latency in nanoseconds.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+       struct opp_table *opp_table;
+       struct dev_pm_opp *opp;
+       struct regulator *reg;
+       unsigned long latency_ns = 0;
+       unsigned long min_uV = ~0, max_uV = 0;
+       int ret;
+
+       rcu_read_lock();
+
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               rcu_read_unlock();
+               return 0;
+       }
+
+       reg = opp_table->regulator;
+       if (IS_ERR(reg)) {
+               /* Regulator may not be required for device */
+               if (reg)
+                       dev_err(dev, "%s: Invalid regulator (%ld)\n", __func__,
+                               PTR_ERR(reg));
+               rcu_read_unlock();
+               return 0;
+       }
+
+       list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+               if (!opp->available)
+                       continue;
+
+               if (opp->u_volt_min < min_uV)
+                       min_uV = opp->u_volt_min;
+               if (opp->u_volt_max > max_uV)
+                       max_uV = opp->u_volt_max;
+       }
+
+       rcu_read_unlock();
+
+       /*
+        * The caller needs to ensure that opp_table (and hence the regulator)
+        * isn't freed, while we are executing this routine.
+        */
+       ret = regulator_set_voltage_time(reg, min_uV, max_uV);
+       if (ret > 0)
+               latency_ns = ret * 1000;
+
+       return latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
+
+/**
+ * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
+ *                                          nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max transition latency, in nanoseconds, to
+ * switch from one OPP to another.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+       return dev_pm_opp_get_max_volt_latency(dev) +
+               dev_pm_opp_get_max_clock_latency(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
+
  /**
   * dev_pm_opp_get_suspend_opp() - Get suspend opp
   * @dev:       device for which we do this operation
@@ -244,21 +322,21 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
   */
  struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
  
         opp_rcu_lockdep_assert();
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp) || !dev_opp->suspend_opp ||
-           !dev_opp->suspend_opp->available)
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table) || !opp_table->suspend_opp ||
+           !opp_table->suspend_opp->available)
                 return NULL;
  
-       return dev_opp->suspend_opp;
+       return opp_table->suspend_opp;
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
  
  /**
- * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
+ * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
   * @dev:       device for which we do this operation
   *
   * Return: This function returns the number of available opps if there are any,
@@ -268,21 +346,21 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
   */
  int dev_pm_opp_get_opp_count(struct device *dev)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *temp_opp;
         int count = 0;
  
         rcu_read_lock();
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp)) {
-               count = PTR_ERR(dev_opp);
-               dev_err(dev, "%s: device OPP not found (%d)\n",
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               count = PTR_ERR(opp_table);
+               dev_err(dev, "%s: OPP table not found (%d)\n",
                         __func__, count);
                 goto out_unlock;
         }
  
-       list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
+       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
                 if (temp_opp->available)
                         count++;
         }
@@ -299,7 +377,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
   * @freq:              frequency to search for
   * @available:         true/false - match for available opp
   *
- * Return: Searches for exact match in the opp list and returns pointer to the
+ * Return: Searches for exact match in the opp table and returns pointer to the
   * matching opp if found, else returns ERR_PTR in case of error and should
   * be handled using IS_ERR. Error return values can be:
   * EINVAL:     for bad pointer
@@ -323,19 +401,20 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
                                               unsigned long freq,
                                               bool available)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
  
         opp_rcu_lockdep_assert();
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp)) {
-               int r = PTR_ERR(dev_opp);
-               dev_err(dev, "%s: device OPP not found (%d)\n", __func__, r);
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               int r = PTR_ERR(opp_table);
+
+               dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
                 return ERR_PTR(r);
         }
  
-       list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
+       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
                 if (temp_opp->available == available &&
                                 temp_opp->rate == freq) {
                         opp = temp_opp;
@@ -371,7 +450,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
  struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
                                              unsigned long *freq)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
  
         opp_rcu_lockdep_assert();
@@ -381,11 +460,11 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
                 return ERR_PTR(-EINVAL);
         }
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp))
-               return ERR_CAST(dev_opp);
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table))
+               return ERR_CAST(opp_table);
  
-       list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
+       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
                 if (temp_opp->available && temp_opp->rate >= *freq) {
                         opp = temp_opp;
                         *freq = opp->rate;
@@ -421,7 +500,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
  struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
                                               unsigned long *freq)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
  
         opp_rcu_lockdep_assert();
@@ -431,11 +510,11 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
                 return ERR_PTR(-EINVAL);
         }
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp))
-               return ERR_CAST(dev_opp);
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table))
+               return ERR_CAST(opp_table);
  
-       list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
+       list_for_each_entry_rcu(temp_opp, &opp_table->opp_list, node) {
                 if (temp_opp->available) {
                         /* go to the next node, before choosing prev */
                         if (temp_opp->rate > *freq)
@@ -451,116 +530,343 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
  
-/* List-dev Helpers */
-static void _kfree_list_dev_rcu(struct rcu_head *head)
+/*
+ * The caller needs to ensure that opp_table (and hence the clk) isn't freed,
+ * while clk returned here is used.
+ */
+static struct clk *_get_opp_clk(struct device *dev)
  {
-       struct device_list_opp *list_dev;
+       struct opp_table *opp_table;
+       struct clk *clk;
+
+       rcu_read_lock();
  
-       list_dev = container_of(head, struct device_list_opp, rcu_head);
-       kfree_rcu(list_dev, rcu_head);
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+               clk = ERR_CAST(opp_table);
+               goto unlock;
+       }
+
+       clk = opp_table->clk;
+       if (IS_ERR(clk))
+               dev_err(dev, "%s: No clock available for the device\n",
+                       __func__);
+
+unlock:
+       rcu_read_unlock();
+       return clk;
  }
  
-static void _remove_list_dev(struct device_list_opp *list_dev,
-                            struct device_opp *dev_opp)
+static int _set_opp_voltage(struct device *dev, struct regulator *reg,
+                           unsigned long u_volt, unsigned long u_volt_min,
+                           unsigned long u_volt_max)
  {
-       list_del(&list_dev->node);
-       call_srcu(&dev_opp->srcu_head.srcu, &list_dev->rcu_head,
-                 _kfree_list_dev_rcu);
+       int ret;
+
+       /* Regulator not available for device */
+       if (IS_ERR(reg)) {
+               dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
+                       PTR_ERR(reg));
+               return 0;
+       }
+
+       dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min,
+               u_volt, u_volt_max);
+
+       ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt,
+                                           u_volt_max);
+       if (ret)
+               dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
+                       __func__, u_volt_min, u_volt, u_volt_max, ret);
+
+       return ret;
  }
  
-struct device_list_opp *_add_list_dev(const struct device *dev,
-                                     struct device_opp *dev_opp)
+/**
+ * dev_pm_opp_set_rate() - Configure new OPP based on frequency
+ * @dev:        device for which we do this operation
+ * @target_freq: frequency to achieve
+ *
+ * This configures the power-supplies and clock source to the levels specified
+ * by the OPP corresponding to the target_freq.
+ *
+ * Locking: This function takes rcu_read_lock().
+ */
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
  {
-       struct device_list_opp *list_dev;
+       struct opp_table *opp_table;
+       struct dev_pm_opp *old_opp, *opp;
+       struct regulator *reg;
+       struct clk *clk;
+       unsigned long freq, old_freq;
+       unsigned long u_volt, u_volt_min, u_volt_max;
+       unsigned long ou_volt, ou_volt_min, ou_volt_max;
+       int ret;
+
+       if (unlikely(!target_freq)) {
+               dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
+                       target_freq);
+               return -EINVAL;
+       }
+
+       clk = _get_opp_clk(dev);
+       if (IS_ERR(clk))
+               return PTR_ERR(clk);
+
+       freq = clk_round_rate(clk, target_freq);
+       if ((long)freq <= 0)
+               freq = target_freq;
+
+       old_freq = clk_get_rate(clk);
+
+       /* Return early if nothing to do */
+       if (old_freq == freq) {
+               dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
+                       __func__, freq);
+               return 0;
+       }
+
+       rcu_read_lock();
+
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+               rcu_read_unlock();
+               return PTR_ERR(opp_table);
+       }
+
+       old_opp = dev_pm_opp_find_freq_ceil(dev, &old_freq);
+       if (!IS_ERR(old_opp)) {
+               ou_volt = old_opp->u_volt;
+               ou_volt_min = old_opp->u_volt_min;
+               ou_volt_max = old_opp->u_volt_max;
+       } else {
+               dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
+                       __func__, old_freq, PTR_ERR(old_opp));
+       }
+
+       opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+       if (IS_ERR(opp)) {
+               ret = PTR_ERR(opp);
+               dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
+                       __func__, freq, ret);
+               rcu_read_unlock();
+               return ret;
+       }
  
-       list_dev = kzalloc(sizeof(*list_dev), GFP_KERNEL);
-       if (!list_dev)
+       u_volt = opp->u_volt;
+       u_volt_min = opp->u_volt_min;
+       u_volt_max = opp->u_volt_max;
+
+       reg = opp_table->regulator;
+
+       rcu_read_unlock();
+
+       /* Scaling up? Scale voltage before frequency */
+       if (freq > old_freq) {
+               ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+                                      u_volt_max);
+               if (ret)
+                       goto restore_voltage;
+       }
+
+       /* Change frequency */
+
+       dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
+               __func__, old_freq, freq);
+
+       ret = clk_set_rate(clk, freq);
+       if (ret) {
+               dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+                       ret);
+               goto restore_voltage;
+       }
+
+       /* Scaling down? Scale voltage after frequency */
+       if (freq < old_freq) {
+               ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
+                                      u_volt_max);
+               if (ret)
+                       goto restore_freq;
+       }
+
+       return 0;
+
+restore_freq:
+       if (clk_set_rate(clk, old_freq))
+               dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+                       __func__, old_freq);
+restore_voltage:
+       /* This shouldn't harm even if the voltages weren't updated earlier */
+       if (!IS_ERR(old_opp))
+               _set_opp_voltage(dev, reg, ou_volt, ou_volt_min, ou_volt_max);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
+
+/* OPP-dev Helpers */
+static void _kfree_opp_dev_rcu(struct rcu_head *head)
+{
+       struct opp_device *opp_dev;
+
+       opp_dev = container_of(head, struct opp_device, rcu_head);
+       kfree_rcu(opp_dev, rcu_head);
+}
+
+static void _remove_opp_dev(struct opp_device *opp_dev,
+                           struct opp_table *opp_table)
+{
+       opp_debug_unregister(opp_dev, opp_table);
+       list_del(&opp_dev->node);
+       call_srcu(&opp_table->srcu_head.srcu, &opp_dev->rcu_head,
+                 _kfree_opp_dev_rcu);
+}
+
+struct opp_device *_add_opp_dev(const struct device *dev,
+                               struct opp_table *opp_table)
+{
+       struct opp_device *opp_dev;
+       int ret;
+
+       opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
+       if (!opp_dev)
                 return NULL;
  
-       /* Initialize list-dev */
-       list_dev->dev = dev;
-       list_add_rcu(&list_dev->node, &dev_opp->dev_list);
+       /* Initialize opp-dev */
+       opp_dev->dev = dev;
+       list_add_rcu(&opp_dev->node, &opp_table->dev_list);
+
+       /* Create debugfs entries for the opp_table */
+       ret = opp_debug_register(opp_dev, opp_table);
+       if (ret)
+               dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+                       __func__, ret);
  
-       return list_dev;
+       return opp_dev;
  }
  
  /**
- * _add_device_opp() - Find device OPP table or allocate a new one
+ * _add_opp_table() - Find OPP table or allocate a new one
   * @dev:       device for which we do this operation
   *
   * It tries to find an existing table first, if it couldn't find one, it
   * allocates a new OPP table and returns that.
   *
- * Return: valid device_opp pointer if success, else NULL.
+ * Return: valid opp_table pointer if success, else NULL.
   */
-static struct device_opp *_add_device_opp(struct device *dev)
+static struct opp_table *_add_opp_table(struct device *dev)
  {
-       struct device_opp *dev_opp;
-       struct device_list_opp *list_dev;
+       struct opp_table *opp_table;
+       struct opp_device *opp_dev;
+       struct device_node *np;
+       int ret;
  
-       /* Check for existing list for 'dev' first */
-       dev_opp = _find_device_opp(dev);
-       if (!IS_ERR(dev_opp))
-               return dev_opp;
+       /* Check for existing table for 'dev' first */
+       opp_table = _find_opp_table(dev);
+       if (!IS_ERR(opp_table))
+               return opp_table;
  
         /*
-        * Allocate a new device OPP table. In the infrequent case where a new
+        * Allocate a new OPP table. In the infrequent case where a new
          * device is needed to be added, we pay this penalty.
          */
-       dev_opp = kzalloc(sizeof(*dev_opp), GFP_KERNEL);
-       if (!dev_opp)
+       opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
+       if (!opp_table)
                 return NULL;
  
-       INIT_LIST_HEAD(&dev_opp->dev_list);
+       INIT_LIST_HEAD(&opp_table->dev_list);
  
-       list_dev = _add_list_dev(dev, dev_opp);
-       if (!list_dev) {
-               kfree(dev_opp);
+       opp_dev = _add_opp_dev(dev, opp_table);
+       if (!opp_dev) {
+               kfree(opp_table);
                 return NULL;
         }
  
-       srcu_init_notifier_head(&dev_opp->srcu_head);
-       INIT_LIST_HEAD(&dev_opp->opp_list);
+       /*
+        * Only required for backward compatibility with v1 bindings, but isn't
+        * harmful for other cases. And so we do it unconditionally.
+        */
+       np = of_node_get(dev->of_node);
+       if (np) {
+               u32 val;
+
+               if (!of_property_read_u32(np, "clock-latency", &val))
+                       opp_table->clock_latency_ns_max = val;
+               of_property_read_u32(np, "voltage-tolerance",
+                                    &opp_table->voltage_tolerance_v1);
+               of_node_put(np);
+       }
+
+       /* Set regulator to a non-NULL error value */
+       opp_table->regulator = ERR_PTR(-ENXIO);
+
+       /* Find clk for the device */
+       opp_table->clk = clk_get(dev, NULL);
+       if (IS_ERR(opp_table->clk)) {
+               ret = PTR_ERR(opp_table->clk);
+               if (ret != -EPROBE_DEFER)
+                       dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
+                               ret);
+       }
  
-       /* Secure the device list modification */
-       list_add_rcu(&dev_opp->node, &dev_opp_list);
-       return dev_opp;
+       srcu_init_notifier_head(&opp_table->srcu_head);
+       INIT_LIST_HEAD(&opp_table->opp_list);
+
+       /* Secure the device table modification */
+       list_add_rcu(&opp_table->node, &opp_tables);
+       return opp_table;
  }
  
  /**
- * _kfree_device_rcu() - Free device_opp RCU handler
+ * _kfree_device_rcu() - Free opp_table RCU handler
   * @head:      RCU head
   */
  static void _kfree_device_rcu(struct rcu_head *head)
  {
-       struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head);
+       struct opp_table *opp_table = container_of(head, struct opp_table,
+                                                  rcu_head);
  
-       kfree_rcu(device_opp, rcu_head);
+       kfree_rcu(opp_table, rcu_head);
  }
  
  /**
- * _remove_device_opp() - Removes a device OPP table
- * @dev_opp: device OPP table to be removed.
+ * _remove_opp_table() - Removes an OPP table
+ * @opp_table: OPP table to be removed.
   *
- * Removes/frees device OPP table it it doesn't contain any OPPs.
+ * Removes/frees OPP table if it doesn't contain any OPPs.
   */
-static void _remove_device_opp(struct device_opp *dev_opp)
+static void _remove_opp_table(struct opp_table *opp_table)
  {
-       struct device_list_opp *list_dev;
+       struct opp_device *opp_dev;
+
+       if (!list_empty(&opp_table->opp_list))
+               return;
+
+       if (opp_table->supported_hw)
+               return;
  
-       if (!list_empty(&dev_opp->opp_list))
+       if (opp_table->prop_name)
                 return;
  
-       list_dev = list_first_entry(&dev_opp->dev_list, struct device_list_opp,
-                                   node);
+       if (!IS_ERR(opp_table->regulator))
+               return;
+
+       /* Release clk */
+       if (!IS_ERR(opp_table->clk))
+               clk_put(opp_table->clk);
+
+       opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
+                                  node);
  
-       _remove_list_dev(list_dev, dev_opp);
+       _remove_opp_dev(opp_dev, opp_table);
  
         /* dev_list must be empty now */
-       WARN_ON(!list_empty(&dev_opp->dev_list));
+       WARN_ON(!list_empty(&opp_table->dev_list));
  
-       list_del_rcu(&dev_opp->node);
-       call_srcu(&dev_opp->srcu_head.srcu, &dev_opp->rcu_head,
+       list_del_rcu(&opp_table->node);
+       call_srcu(&opp_table->srcu_head.srcu, &opp_table->rcu_head,
                   _kfree_device_rcu);
  }
  
@@ -577,17 +883,17 @@ static void _kfree_opp_rcu(struct rcu_head *head)
  
  /**
   * _opp_remove()  - Remove an OPP from a table definition
- * @dev_opp:   points back to the device_opp struct this opp belongs to
+ * @opp_table: points back to the opp_table struct this opp belongs to
   * @opp:       pointer to the OPP to remove
   * @notify:    OPP_EVENT_REMOVE notification should be sent or not
   *
- * This function removes an opp definition from the opp list.
+ * This function removes an opp definition from the opp table.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * It is assumed that the caller holds required mutex for an RCU updater
   * strategy.
   */
-static void _opp_remove(struct device_opp *dev_opp,
+static void _opp_remove(struct opp_table *opp_table,
                         struct dev_pm_opp *opp, bool notify)
  {
         /*
@@ -595,21 +901,23 @@ static void _opp_remove(struct device_opp *dev_opp,
          * frequency/voltage list.
          */
         if (notify)
-               srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp);
+               srcu_notifier_call_chain(&opp_table->srcu_head,
+                                        OPP_EVENT_REMOVE, opp);
+       opp_debug_remove_one(opp);
         list_del_rcu(&opp->node);
-       call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+       call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
  
-       _remove_device_opp(dev_opp);
+       _remove_opp_table(opp_table);
  }
  
  /**
- * dev_pm_opp_remove()  - Remove an OPP from OPP list
+ * dev_pm_opp_remove()  - Remove an OPP from OPP table
   * @dev:       device for which we do this operation
   * @freq:      OPP to remove with matching 'freq'
   *
- * This function removes an opp from the opp list.
+ * This function removes an opp from the opp table.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function internally uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -618,17 +926,17 @@ static void _opp_remove(struct device_opp *dev_opp,
  void dev_pm_opp_remove(struct device *dev, unsigned long freq)
  {
         struct dev_pm_opp *opp;
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         bool found = false;
  
-       /* Hold our list modification lock here */
-       mutex_lock(&dev_opp_list_lock);
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
  
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp))
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table))
                 goto unlock;
  
-       list_for_each_entry(opp, &dev_opp->opp_list, node) {
+       list_for_each_entry(opp, &opp_table->opp_list, node) {
                 if (opp->rate == freq) {
                         found = true;
                         break;
@@ -641,14 +949,14 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq)
                 goto unlock;
         }
  
-       _opp_remove(dev_opp, opp, true);
+       _opp_remove(opp_table, opp, true);
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
  
  static struct dev_pm_opp *_allocate_opp(struct device *dev,
-                                       struct device_opp **dev_opp)
+                                       struct opp_table **opp_table)
  {
         struct dev_pm_opp *opp;
  
@@ -659,8 +967,8 @@ static struct dev_pm_opp *_allocate_opp(struct device *dev,
  
         INIT_LIST_HEAD(&opp->node);
  
-       *dev_opp = _add_device_opp(dev);
-       if (!*dev_opp) {
+       *opp_table = _add_opp_table(dev);
+       if (!*opp_table) {
                 kfree(opp);
                 return NULL;
         }
@@ -668,21 +976,38 @@ static struct dev_pm_opp *_allocate_opp(struct device *dev,
         return opp;
  }
  
+static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
+                                        struct opp_table *opp_table)
+{
+       struct regulator *reg = opp_table->regulator;
+
+       if (!IS_ERR(reg) &&
+           !regulator_is_supported_voltage(reg, opp->u_volt_min,
+                                           opp->u_volt_max)) {
+               pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+                       __func__, opp->u_volt_min, opp->u_volt_max);
+               return false;
+       }
+
+       return true;
+}
+
  static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
-                   struct device_opp *dev_opp)
+                   struct opp_table *opp_table)
  {
         struct dev_pm_opp *opp;
-       struct list_head *head = &dev_opp->opp_list;
+       struct list_head *head = &opp_table->opp_list;
+       int ret;
  
         /*
          * Insert new OPP in order of increasing frequency and discard if
          * already present.
          *
-        * Need to use &dev_opp->opp_list in the condition part of the 'for'
+        * Need to use &opp_table->opp_list in the condition part of the 'for'
          * loop, don't replace it with head otherwise it will become an infinite
          * loop.
          */
-       list_for_each_entry_rcu(opp, &dev_opp->opp_list, node) {
+       list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
                 if (new_opp->rate > opp->rate) {
                         head = &opp->node;
                         continue;
@@ -700,9 +1025,20 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
                         0 : -EEXIST;
         }
  
-       new_opp->dev_opp = dev_opp;
+       new_opp->opp_table = opp_table;
         list_add_rcu(&new_opp->node, head);
  
+       ret = opp_debug_create_one(new_opp, opp_table);
+       if (ret)
+               dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+                       __func__, ret);
+
+       if (!_opp_supported_by_regulators(new_opp, opp_table)) {
+               new_opp->available = false;
+               dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
+                        __func__, new_opp->rate);
+       }
+
         return 0;
  }
  
@@ -713,14 +1049,14 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
   * @u_volt:    Voltage in uVolts for this OPP
   * @dynamic:   Dynamically added OPPs.
   *
- * This function adds an opp definition to the opp list and returns status.
+ * This function adds an opp definition to the opp table and returns status.
   * The opp is made available by default and it can be controlled using
   * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
   *
   * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
   * and freed by dev_pm_opp_of_remove_table.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function internally uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -736,14 +1072,15 @@ static int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
  static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
                        bool dynamic)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *new_opp;
+       unsigned long tol;
         int ret;
  
-       /* Hold our list modification lock here */
-       mutex_lock(&dev_opp_list_lock);
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
  
-       new_opp = _allocate_opp(dev, &dev_opp);
+       new_opp = _allocate_opp(dev, &opp_table);
         if (!new_opp) {
                 ret = -ENOMEM;
                 goto unlock;
@@ -751,60 +1088,77 @@ static int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
  
         /* populate the opp table */
         new_opp->rate = freq;
+       tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
         new_opp->u_volt = u_volt;
+       new_opp->u_volt_min = u_volt - tol;
+       new_opp->u_volt_max = u_volt + tol;
         new_opp->available = true;
         new_opp->dynamic = dynamic;
  
-       ret = _opp_add(dev, new_opp, dev_opp);
+       ret = _opp_add(dev, new_opp, opp_table);
         if (ret)
                 goto free_opp;
  
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  
         /*
          * Notify the changes in the availability of the operable
          * frequency/voltage list.
          */
-       srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp);
+       srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
         return 0;
  
  free_opp:
-       _opp_remove(dev_opp, new_opp, false);
+       _opp_remove(opp_table, new_opp, false);
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
         return ret;
  }
  
  /* TODO: Support multiple regulators */
-static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+                             struct opp_table *opp_table)
  {
         u32 microvolt[3] = {0};
         u32 val;
         int count, ret;
+       struct property *prop = NULL;
+       char name[NAME_MAX];
+
+       /* Search for "opp-microvolt-<name>" */
+       if (opp_table->prop_name) {
+               snprintf(name, sizeof(name), "opp-microvolt-%s",
+                        opp_table->prop_name);
+               prop = of_find_property(opp->np, name, NULL);
+       }
  
-       /* Missing property isn't a problem, but an invalid entry is */
-       if (!of_find_property(opp->np, "opp-microvolt", NULL))
-               return 0;
+       if (!prop) {
+               /* Search for "opp-microvolt" */
+               sprintf(name, "opp-microvolt");
+               prop = of_find_property(opp->np, name, NULL);
  
-       count = of_property_count_u32_elems(opp->np, "opp-microvolt");
+               /* Missing property isn't a problem, but an invalid entry is */
+               if (!prop)
+                       return 0;
+       }
+
+       count = of_property_count_u32_elems(opp->np, name);
         if (count < 0) {
-               dev_err(dev, "%s: Invalid opp-microvolt property (%d)\n",
-                       __func__, count);
+               dev_err(dev, "%s: Invalid %s property (%d)\n",
+                       __func__, name, count);
                 return count;
         }
  
         /* There can be one or three elements here */
         if (count != 1 && count != 3) {
-               dev_err(dev, "%s: Invalid number of elements in opp-microvolt property (%d)\n",
-                       __func__, count);
+               dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
+                       __func__, name, count);
                 return -EINVAL;
         }
  
-       ret = of_property_read_u32_array(opp->np, "opp-microvolt", microvolt,
-                                        count);
+       ret = of_property_read_u32_array(opp->np, name, microvolt, count);
         if (ret) {
-               dev_err(dev, "%s: error parsing opp-microvolt: %d\n", __func__,
-                       ret);
+               dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
                 return -EINVAL;
         }
  
@@ -818,22 +1172,391 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
                 opp->u_volt_max = microvolt[2];
         }
  
-       if (!of_property_read_u32(opp->np, "opp-microamp", &val))
+       /* Search for "opp-microamp-<name>" */
+       prop = NULL;
+       if (opp_table->prop_name) {
+               snprintf(name, sizeof(name), "opp-microamp-%s",
+                        opp_table->prop_name);
+               prop = of_find_property(opp->np, name, NULL);
+       }
+
+       if (!prop) {
+               /* Search for "opp-microamp" */
+               sprintf(name, "opp-microamp");
+               prop = of_find_property(opp->np, name, NULL);
+       }
+
+       if (prop && !of_property_read_u32(opp->np, name, &val))
                 opp->u_amp = val;
  
         return 0;
  }
  
+/**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. OPP layer will then enable
+ * OPPs, which are available for those versions, based on its 'opp-supported-hw'
+ * property.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+                               unsigned int count)
+{
+       struct opp_table *opp_table;
+       int ret = 0;
+
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
+
+       opp_table = _add_opp_table(dev);
+       if (!opp_table) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       /* Make sure there are no concurrent readers while updating opp_table */
+       WARN_ON(!list_empty(&opp_table->opp_list));
+
+       /* Do we already have a version hierarchy associated with opp_table? */
+       if (opp_table->supported_hw) {
+               dev_err(dev, "%s: Already have supported hardware list\n",
+                       __func__);
+               ret = -EBUSY;
+               goto err;
+       }
+
+       opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
+                                       GFP_KERNEL);
+       if (!opp_table->supported_hw) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       opp_table->supported_hw_count = count;
+       mutex_unlock(&opp_table_lock);
+       return 0;
+
+err:
+       _remove_opp_table(opp_table);
+unlock:
+       mutex_unlock(&opp_table_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @dev: Device for which supported-hw has to be put.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_supported_hw(struct device *dev)
+{
+       struct opp_table *opp_table;
+
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
+
+       /* Check for existing table for 'dev' first */
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               dev_err(dev, "Failed to find opp_table: %ld\n",
+                       PTR_ERR(opp_table));
+               goto unlock;
+       }
+
+       /* Make sure there are no concurrent readers while updating opp_table */
+       WARN_ON(!list_empty(&opp_table->opp_list));
+
+       if (!opp_table->supported_hw) {
+               dev_err(dev, "%s: Doesn't have supported hardware list\n",
+                       __func__);
+               goto unlock;
+       }
+
+       kfree(opp_table->supported_hw);
+       opp_table->supported_hw = NULL;
+       opp_table->supported_hw_count = 0;
+
+       /* Try freeing opp_table if this was the last blocking resource */
+       _remove_opp_table(opp_table);
+
+unlock:
+       mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extn to be used for certain property names. The properties to
+ * which the extension will apply are opp-microvolt and opp-microamp. OPP core
+ * should postfix the property name with -<name> while looking for them.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+       struct opp_table *opp_table;
+       int ret = 0;
+
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
+
+       opp_table = _add_opp_table(dev);
+       if (!opp_table) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       /* Make sure there are no concurrent readers while updating opp_table */
+       WARN_ON(!list_empty(&opp_table->opp_list));
+
+       /* Do we already have a prop-name associated with opp_table? */
+       if (opp_table->prop_name) {
+               dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+                       opp_table->prop_name);
+               ret = -EBUSY;
+               goto err;
+       }
+
+       opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+       if (!opp_table->prop_name) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       mutex_unlock(&opp_table_lock);
+       return 0;
+
+err:
+       _remove_opp_table(opp_table);
+unlock:
+       mutex_unlock(&opp_table_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @dev: Device for which the prop-name has to be put.
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * will not be freed.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_prop_name(struct device *dev)
+{
+       struct opp_table *opp_table;
+
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
+
+       /* Check for existing table for 'dev' first */
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               dev_err(dev, "Failed to find opp_table: %ld\n",
+                       PTR_ERR(opp_table));
+               goto unlock;
+       }
+
+       /* Make sure there are no concurrent readers while updating opp_table */
+       WARN_ON(!list_empty(&opp_table->opp_list));
+
+       if (!opp_table->prop_name) {
+               dev_err(dev, "%s: Doesn't have a prop-name\n", __func__);
+               goto unlock;
+       }
+
+       kfree(opp_table->prop_name);
+       opp_table->prop_name = NULL;
+
+       /* Try freeing opp_table if this was the last blocking resource */
+       _remove_opp_table(opp_table);
+
+unlock:
+       mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
+/**
+ * dev_pm_opp_set_regulator() - Set regulator name for the device
+ * @dev: Device for which regulator name is being set.
+ * @name: Name of the regulator.
+ *
+ * In order to support OPP switching, OPP layer needs to know the name of the
+ * device's regulator, as the core would be required to switch voltages as well.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+       struct opp_table *opp_table;
+       struct regulator *reg;
+       int ret;
+
+       mutex_lock(&opp_table_lock);
+
+       opp_table = _add_opp_table(dev);
+       if (!opp_table) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       /* This should be called before OPPs are initialized */
+       if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+               ret = -EBUSY;
+               goto err;
+       }
+
+       /* Already have a regulator set */
+       if (WARN_ON(!IS_ERR(opp_table->regulator))) {
+               ret = -EBUSY;
+               goto err;
+       }
+       /* Allocate the regulator */
+       reg = regulator_get_optional(dev, name);
+       if (IS_ERR(reg)) {
+               ret = PTR_ERR(reg);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(dev, "%s: no regulator (%s) found: %d\n",
+                               __func__, name, ret);
+               goto err;
+       }
+
+       opp_table->regulator = reg;
+
+       mutex_unlock(&opp_table_lock);
+       return 0;
+
+err:
+       _remove_opp_table(opp_table);
+unlock:
+       mutex_unlock(&opp_table_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
+
+/**
+ * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
+ * @dev: Device for which regulator was set.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_regulator(struct device *dev)
+{
+       struct opp_table *opp_table;
+
+       mutex_lock(&opp_table_lock);
+
+       /* Check for existing table for 'dev' first */
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               dev_err(dev, "Failed to find opp_table: %ld\n",
+                       PTR_ERR(opp_table));
+               goto unlock;
+       }
+
+       if (IS_ERR(opp_table->regulator)) {
+               dev_err(dev, "%s: Doesn't have regulator set\n", __func__);
+               goto unlock;
+       }
+
+       /* Make sure there are no concurrent readers while updating opp_table */
+       WARN_ON(!list_empty(&opp_table->opp_list));
+
+       regulator_put(opp_table->regulator);
+       opp_table->regulator = ERR_PTR(-ENXIO);
+
+       /* Try freeing opp_table if this was the last blocking resource */
+       _remove_opp_table(opp_table);
+
+unlock:
+       mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator);
+
+static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
+                             struct device_node *np)
+{
+       unsigned int count = opp_table->supported_hw_count;
+       u32 version;
+       int ret;
+
+       if (!opp_table->supported_hw)
+               return true;
+
+       while (count--) {
+               ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+                                                &version);
+               if (ret) {
+                       dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+                                __func__, count, ret);
+                       return false;
+               }
+
+               /* Both of these are bitwise masks of the versions */
+               if (!(version & opp_table->supported_hw[count]))
+                       return false;
+       }
+
+       return true;
+}
+
  /**
   * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
   * @dev:       device for which we do this operation
   * @np:                device node
   *
- * This function adds an opp definition to the opp list and returns status. The
+ * This function adds an opp definition to the opp table and returns status. The
   * opp can be controlled using dev_pm_opp_enable/disable functions and may be
   * removed by dev_pm_opp_remove.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function internally uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -849,16 +1572,16 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev)
   */
  static int _opp_add_static_v2(struct device *dev, struct device_node *np)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *new_opp;
         u64 rate;
         u32 val;
         int ret;
  
-       /* Hold our list modification lock here */
-       mutex_lock(&dev_opp_list_lock);
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
  
-       new_opp = _allocate_opp(dev, &dev_opp);
+       new_opp = _allocate_opp(dev, &opp_table);
         if (!new_opp) {
                 ret = -ENOMEM;
                 goto unlock;
@@ -870,6 +1593,12 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
                 goto free_opp;
         }
  
+       /* Check if the OPP supports hardware's hierarchy of versions or not */
+       if (!_opp_is_supported(dev, opp_table, np)) {
+               dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+               goto free_opp;
+       }
+
         /*
          * Rate is defined as an unsigned long in clk API, and so casting
          * explicitly to its type. Must be fixed once rate is 64 bit
@@ -885,28 +1614,30 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
         if (!of_property_read_u32(np, "clock-latency-ns", &val))
                 new_opp->clock_latency_ns = val;
  
-       ret = opp_parse_supplies(new_opp, dev);
+       ret = opp_parse_supplies(new_opp, dev, opp_table);
         if (ret)
                 goto free_opp;
  
-       ret = _opp_add(dev, new_opp, dev_opp);
+       ret = _opp_add(dev, new_opp, opp_table);
         if (ret)
                 goto free_opp;
  
         /* OPP to select on device suspend */
         if (of_property_read_bool(np, "opp-suspend")) {
-               if (dev_opp->suspend_opp)
+               if (opp_table->suspend_opp) {
                         dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
-                                __func__, dev_opp->suspend_opp->rate,
+                                __func__, opp_table->suspend_opp->rate,
                                  new_opp->rate);
-               else
-                       dev_opp->suspend_opp = new_opp;
+               } else {
+                       new_opp->suspend = true;
+                       opp_table->suspend_opp = new_opp;
+               }
         }
  
-       if (new_opp->clock_latency_ns > dev_opp->clock_latency_ns_max)
-               dev_opp->clock_latency_ns_max = new_opp->clock_latency_ns;
+       if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
+               opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
  
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  
         pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
                  __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
@@ -917,13 +1648,13 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
          * Notify the changes in the availability of the operable
          * frequency/voltage list.
          */
-       srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp);
+       srcu_notifier_call_chain(&opp_table->srcu_head, OPP_EVENT_ADD, new_opp);
         return 0;
  
  free_opp:
-       _opp_remove(dev_opp, new_opp, false);
+       _opp_remove(opp_table, new_opp, false);
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
         return ret;
  }
  
@@ -933,11 +1664,11 @@ unlock:
   * @freq:      Frequency in Hz for this OPP
   * @u_volt:    Voltage in uVolts for this OPP
   *
- * This function adds an opp definition to the opp list and returns status.
+ * This function adds an opp definition to the opp table and returns status.
   * The opp is made available by default and it can be controlled using
   * dev_pm_opp_enable/disable functions.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function internally uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -969,7 +1700,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_add);
   * copy operation, returns 0 if no modification was done OR modification was
   * successful.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function internally uses RCU updater strategy with mutex locks to
   * keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -978,7 +1709,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_add);
  static int _opp_set_availability(struct device *dev, unsigned long freq,
                                  bool availability_req)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *new_opp, *tmp_opp, *opp = ERR_PTR(-ENODEV);
         int r = 0;
  
@@ -987,18 +1718,18 @@ static int _opp_set_availability(struct device *dev, unsigned long freq,
         if (!new_opp)
                 return -ENOMEM;
  
-       mutex_lock(&dev_opp_list_lock);
+       mutex_lock(&opp_table_lock);
  
-       /* Find the device_opp */
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp)) {
-               r = PTR_ERR(dev_opp);
+       /* Find the opp_table */
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               r = PTR_ERR(opp_table);
                 dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
                 goto unlock;
         }
  
         /* Do we have the frequency? */
-       list_for_each_entry(tmp_opp, &dev_opp->opp_list, node) {
+       list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
                 if (tmp_opp->rate == freq) {
                         opp = tmp_opp;
                         break;
@@ -1019,21 +1750,21 @@ static int _opp_set_availability(struct device *dev, unsigned long freq,
         new_opp->available = availability_req;
  
         list_replace_rcu(&opp->node, &new_opp->node);
-       mutex_unlock(&dev_opp_list_lock);
-       call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
+       mutex_unlock(&opp_table_lock);
+       call_srcu(&opp_table->srcu_head.srcu, &opp->rcu_head, _kfree_opp_rcu);
  
         /* Notify the change of the OPP availability */
         if (availability_req)
-               srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ENABLE,
-                                        new_opp);
+               srcu_notifier_call_chain(&opp_table->srcu_head,
+                                        OPP_EVENT_ENABLE, new_opp);
         else
-               srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_DISABLE,
-                                        new_opp);
+               srcu_notifier_call_chain(&opp_table->srcu_head,
+                                        OPP_EVENT_DISABLE, new_opp);
  
         return 0;
  
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
         kfree(new_opp);
         return r;
  }
@@ -1047,7 +1778,7 @@ unlock:
   * corresponding error value. It is meant to be used for users an OPP available
   * after being temporarily made unavailable with dev_pm_opp_disable.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function indirectly uses RCU and mutex locks to keep the
   * integrity of the internal data structures. Callers should ensure that
   * this function is *NOT* called under RCU protection or in contexts where
@@ -1073,7 +1804,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
   * control by users to make this OPP not available until the circumstances are
   * right to make it available again (with a call to dev_pm_opp_enable).
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function indirectly uses RCU and mutex locks to keep the
   * integrity of the internal data structures. Callers should ensure that
   * this function is *NOT* called under RCU protection or in contexts where
@@ -1091,26 +1822,26 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
  
  /**
   * dev_pm_opp_get_notifier() - find notifier_head of the device with opp
- * @dev:       device pointer used to lookup device OPPs.
+ * @dev:       device pointer used to lookup OPP table.
   *
   * Return: pointer to  notifier head if found, otherwise -ENODEV or
   * -EINVAL based on type of error casted as pointer. value must be checked
   *  with IS_ERR to determine valid pointer or error result.
   *
- * Locking: This function must be called under rcu_read_lock(). dev_opp is a RCU
- * protected pointer. The reason for the same is that the opp pointer which is
- * returned will remain valid for use with opp_get_{voltage, freq} only while
+ * Locking: This function must be called under rcu_read_lock(). opp_table is an
+ * RCU protected pointer. The reason for the same is that the opp pointer which
+ * is returned will remain valid for use with opp_get_{voltage, freq} only while
   * under the locked area. The pointer returned must be used prior to unlocking
   * with rcu_read_unlock() to maintain the integrity of the pointer.
   */
  struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev)
  {
-       struct device_opp *dev_opp = _find_device_opp(dev);
+       struct opp_table *opp_table = _find_opp_table(dev);
  
-       if (IS_ERR(dev_opp))
-               return ERR_CAST(dev_opp); /* matching type */
+       if (IS_ERR(opp_table))
+               return ERR_CAST(opp_table); /* matching type */
  
-       return &dev_opp->srcu_head;
+       return &opp_table->srcu_head;
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
  
@@ -1118,11 +1849,11 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
  /**
   * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
   *                               entries
- * @dev:       device pointer used to lookup device OPPs.
+ * @dev:       device pointer used to lookup OPP table.
   *
   * Free OPPs created using static entries present in DT.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function indirectly uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
@@ -1130,38 +1861,38 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_notifier);
   */
  void dev_pm_opp_of_remove_table(struct device *dev)
  {
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct dev_pm_opp *opp, *tmp;
  
-       /* Hold our list modification lock here */
-       mutex_lock(&dev_opp_list_lock);
+       /* Hold our table modification lock here */
+       mutex_lock(&opp_table_lock);
  
-       /* Check for existing list for 'dev' */
-       dev_opp = _find_device_opp(dev);
-       if (IS_ERR(dev_opp)) {
-               int error = PTR_ERR(dev_opp);
+       /* Check for existing table for 'dev' */
+       opp_table = _find_opp_table(dev);
+       if (IS_ERR(opp_table)) {
+               int error = PTR_ERR(opp_table);
  
                 if (error != -ENODEV)
-                       WARN(1, "%s: dev_opp: %d\n",
+                       WARN(1, "%s: opp_table: %d\n",
                              IS_ERR_OR_NULL(dev) ?
                                         "Invalid device" : dev_name(dev),
                              error);
                 goto unlock;
         }
  
-       /* Find if dev_opp manages a single device */
-       if (list_is_singular(&dev_opp->dev_list)) {
+       /* Find if opp_table manages a single device */
+       if (list_is_singular(&opp_table->dev_list)) {
                 /* Free static OPPs */
-               list_for_each_entry_safe(opp, tmp, &dev_opp->opp_list, node) {
+               list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
                         if (!opp->dynamic)
-                               _opp_remove(dev_opp, opp, true);
+                               _opp_remove(opp_table, opp, true);
                 }
         } else {
-               _remove_list_dev(_find_list_dev(dev, dev_opp), dev_opp);
+               _remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
         }
  
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  }
  EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
  
@@ -1182,22 +1913,22 @@ struct device_node *_of_get_opp_desc_node(struct device *dev)
  static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
  {
         struct device_node *np;
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         int ret = 0, count = 0;
  
-       mutex_lock(&dev_opp_list_lock);
+       mutex_lock(&opp_table_lock);
  
-       dev_opp = _managed_opp(opp_np);
-       if (dev_opp) {
+       opp_table = _managed_opp(opp_np);
+       if (opp_table) {
                 /* OPPs are already managed */
-               if (!_add_list_dev(dev, dev_opp))
+               if (!_add_opp_dev(dev, opp_table))
                         ret = -ENOMEM;
-               mutex_unlock(&dev_opp_list_lock);
+               mutex_unlock(&opp_table_lock);
                 return ret;
         }
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  
-       /* We have opp-list node now, iterate over it and add OPPs */
+       /* We have opp-table node now, iterate over it and add OPPs */
         for_each_available_child_of_node(opp_np, np) {
                 count++;
  
@@ -1213,19 +1944,19 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
         if (WARN_ON(!count))
                 return -ENOENT;
  
-       mutex_lock(&dev_opp_list_lock);
+       mutex_lock(&opp_table_lock);
  
-       dev_opp = _find_device_opp(dev);
-       if (WARN_ON(IS_ERR(dev_opp))) {
-               ret = PTR_ERR(dev_opp);
-               mutex_unlock(&dev_opp_list_lock);
+       opp_table = _find_opp_table(dev);
+       if (WARN_ON(IS_ERR(opp_table))) {
+               ret = PTR_ERR(opp_table);
+               mutex_unlock(&opp_table_lock);
                 goto free_table;
         }
  
-       dev_opp->np = opp_np;
-       dev_opp->shared_opp = of_property_read_bool(opp_np, "opp-shared");
+       opp_table->np = opp_np;
+       opp_table->shared_opp = of_property_read_bool(opp_np, "opp-shared");
  
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  
         return 0;
  
@@ -1254,7 +1985,7 @@ static int _of_add_opp_table_v1(struct device *dev)
          */
         nr = prop->length / sizeof(u32);
         if (nr % 2) {
-               dev_err(dev, "%s: Invalid OPP list\n", __func__);
+               dev_err(dev, "%s: Invalid OPP table\n", __func__);
                 return -EINVAL;
         }
  
@@ -1274,11 +2005,11 @@ static int _of_add_opp_table_v1(struct device *dev)
  
  /**
   * dev_pm_opp_of_add_table() - Initialize opp table from device tree
- * @dev:       device pointer used to lookup device OPPs.
+ * @dev:       device pointer used to lookup OPP table.
   *
   * Register the initial OPP table with the OPP library for given device.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Hence this function indirectly uses RCU updater strategy with mutex locks
   * to keep the integrity of the internal data structures. Callers should ensure
   * that this function is *NOT* called under RCU protection or in contexts where
diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c

index 7b445e88a0d559f091918be5e841f6a1380a33da..ba2bdbd932ef3c1ebaff47c6203bddb27fe9c03b 100644 (file)
--- a/drivers/base/power/opp/cpu.c
+++ b/drivers/base/power/opp/cpu.c
@@ -31,7 +31,7 @@
   * @table:     Cpufreq table returned back to caller
   *
   * Generate a cpufreq table for a provided device- this assumes that the
- * opp list is already initialized and ready for usage.
+ * opp table is already initialized and ready for usage.
   *
   * This function allocates required memory for the cpufreq table. It is
   * expected that the caller does the required maintenance such as freeing
@@ -44,7 +44,7 @@
   * WARNING: It is  important for the callers to ensure refreshing their copy of
   * the table if any of the mentioned functions have been invoked in the interim.
   *
- * Locking: The internal device_opp and opp structures are RCU protected.
+ * Locking: The internal opp_table and opp structures are RCU protected.
   * Since we just use the regular accessor functions to access the internal data
   * structures, we use RCU read lock inside this function. As a result, users of
   * this function DONOT need to use explicit locks for invoking.
@@ -122,15 +122,15 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
  /* Required only for V1 bindings, as v2 can manage it from DT itself */
  int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
  {
-       struct device_list_opp *list_dev;
-       struct device_opp *dev_opp;
+       struct opp_device *opp_dev;
+       struct opp_table *opp_table;
         struct device *dev;
         int cpu, ret = 0;
  
-       mutex_lock(&dev_opp_list_lock);
+       mutex_lock(&opp_table_lock);
  
-       dev_opp = _find_device_opp(cpu_dev);
-       if (IS_ERR(dev_opp)) {
+       opp_table = _find_opp_table(cpu_dev);
+       if (IS_ERR(opp_table)) {
                 ret = -EINVAL;
                 goto unlock;
         }
@@ -146,15 +146,15 @@ int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
                         continue;
                 }
  
-               list_dev = _add_list_dev(dev, dev_opp);
-               if (!list_dev) {
-                       dev_err(dev, "%s: failed to add list-dev for cpu%d device\n",
+               opp_dev = _add_opp_dev(dev, opp_table);
+               if (!opp_dev) {
+                       dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
                                 __func__, cpu);
                         continue;
                 }
         }
  unlock:
-       mutex_unlock(&dev_opp_list_lock);
+       mutex_unlock(&opp_table_lock);
  
         return ret;
  }
@@ -214,7 +214,6 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
  /*
   * Works only for OPP v2 bindings.
   *
- * cpumask should be already set to mask of cpu_dev->id.
   * Returns -ENOENT if operating-points-v2 bindings aren't supported.
   */
  int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask)
@@ -230,6 +229,8 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, cpumask_var_t cpumask
                 return -ENOENT;
         }
  
+       cpumask_set_cpu(cpu_dev->id, cpumask);
+
         /* OPPs are shared ? */
         if (!of_property_read_bool(np, "opp-shared"))
                 goto put_cpu_node;
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c

new file mode 100644 (file)

index 0000000..ef1ae6b
--- /dev/null
+++ b/drivers/base/power/opp/debugfs.c
@@ -0,0 +1,218 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+
+#include "opp.h"
+
+static struct dentry *rootdir;
+
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+       if (dev->parent)
+               snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+                        dev_name(dev));
+       else
+               snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+       debugfs_remove_recursive(opp->dentry);
+}
+
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+{
+       struct dentry *pdentry = opp_table->dentry;
+       struct dentry *d;
+       char name[25];  /* 20 chars for 64 bit value + 5 (opp:\0) */
+
+       /* Rate is unique to each OPP, use it to give opp-name */
+       snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+       /* Create per-opp directory */
+       d = debugfs_create_dir(name, pdentry);
+       if (!d)
+               return -ENOMEM;
+
+       if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+               return -ENOMEM;
+
+       if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+               return -ENOMEM;
+
+       if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+               return -ENOMEM;
+
+       if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+               return -ENOMEM;
+
+       if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+                                 &opp->clock_latency_ns))
+               return -ENOMEM;
+
+       opp->dentry = d;
+       return 0;
+}
+
+static int opp_list_debug_create_dir(struct opp_device *opp_dev,
+                                    struct opp_table *opp_table)
+{
+       const struct device *dev = opp_dev->dev;
+       struct dentry *d;
+
+       opp_set_dev_name(dev, opp_table->dentry_name);
+
+       /* Create device specific directory */
+       d = debugfs_create_dir(opp_table->dentry_name, rootdir);
+       if (!d) {
+               dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+               return -ENOMEM;
+       }
+
+       opp_dev->dentry = d;
+       opp_table->dentry = d;
+
+       return 0;
+}
+
+static int opp_list_debug_create_link(struct opp_device *opp_dev,
+                                     struct opp_table *opp_table)
+{
+       const struct device *dev = opp_dev->dev;
+       char name[NAME_MAX];
+       struct dentry *d;
+
+       opp_set_dev_name(opp_dev->dev, name);
+
+       /* Create device specific directory link */
+       d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
+       if (!d) {
+               dev_err(dev, "%s: Failed to create link\n", __func__);
+               return -ENOMEM;
+       }
+
+       opp_dev->dentry = d;
+
+       return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the OPP table being added
+ *
+ * Dynamically adds a device-specific directory in the debugfs 'opp' directory.
+ * If the OPP table is shared with other devices, then links will be created
+ * for all devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+{
+       if (!rootdir) {
+               pr_debug("%s: Uninitialized rootdir\n", __func__);
+               return -EINVAL;
+       }
+
+       if (opp_table->dentry)
+               return opp_list_debug_create_link(opp_dev, opp_table);
+
+       return opp_list_debug_create_dir(opp_dev, opp_table);
+}
+
+static void opp_migrate_dentry(struct opp_device *opp_dev,
+                              struct opp_table *opp_table)
+{
+       struct opp_device *new_dev;
+       const struct device *dev;
+       struct dentry *dentry;
+
+       /* Look for next opp-dev */
+       list_for_each_entry(new_dev, &opp_table->dev_list, node)
+               if (new_dev != opp_dev)
+                       break;
+
+       /* new_dev is guaranteed to be valid here */
+       dev = new_dev->dev;
+       debugfs_remove_recursive(new_dev->dentry);
+
+       opp_set_dev_name(dev, opp_table->dentry_name);
+
+       dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
+                               opp_table->dentry_name);
+       if (!dentry) {
+               dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+                       __func__, dev_name(opp_dev->dev), dev_name(dev));
+               return;
+       }
+
+       new_dev->dentry = dentry;
+       opp_table->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the OPP table being removed
+ *
+ * Dynamically removes device specific directory from debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct opp_device *opp_dev,
+                         struct opp_table *opp_table)
+{
+       if (opp_dev->dentry == opp_table->dentry) {
+               /* Move the real dentry object under another device */
+               if (!list_is_singular(&opp_table->dev_list)) {
+                       opp_migrate_dentry(opp_dev, opp_table);
+                       goto out;
+               }
+               opp_table->dentry = NULL;
+       }
+
+       debugfs_remove_recursive(opp_dev->dentry);
+
+out:
+       opp_dev->dentry = NULL;
+}
+
+static int __init opp_debug_init(void)
+{
+       /* Create /sys/kernel/debug/opp directory */
+       rootdir = debugfs_create_dir("opp", NULL);
+       if (!rootdir) {
+               pr_err("%s: Failed to create root directory\n", __func__);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+core_initcall(opp_debug_init);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h

index 7366b2aa8997897f89890cf99a4990d83e61a5a6..f67f806fcf3ae8f13866336cdc54958bd57f59b9 100644 (file)
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -17,17 +17,21 @@
  #include <linux/device.h>
  #include <linux/kernel.h>
  #include <linux/list.h>
+#include <linux/limits.h>
  #include <linux/pm_opp.h>
  #include <linux/rculist.h>
  #include <linux/rcupdate.h>
  
+struct clk;
+struct regulator;
+
  /* Lock to allow exclusive modification to the device and opp lists */
-extern struct mutex dev_opp_list_lock;
+extern struct mutex opp_table_lock;
  
  /*
   * Internal data structure organization with the OPP layer library is as
   * follows:
- * dev_opp_list (root)
+ * opp_tables (root)
   *     |- device 1 (represents voltage domain 1)
   *     |       |- opp 1 (availability, freq, voltage)
   *     |       |- opp 2 ..
@@ -36,23 +40,24 @@ extern struct mutex dev_opp_list_lock;
   *     |- device 2 (represents the next voltage domain)
   *     ...
   *     `- device m (represents mth voltage domain)
- * device 1, 2.. are represented by dev_opp structure while each opp
+ * device 1, 2.. are represented by opp_table structure while each opp
   * is represented by the opp structure.
   */
  
  /**
   * struct dev_pm_opp - Generic OPP description structure
- * @node:      opp list node. The nodes are maintained throughout the lifetime
+ * @node:      opp table node. The nodes are maintained throughout the lifetime
   *             of boot. It is expected only an optimal set of OPPs are
   *             added to the library by the SoC framework.
- *             RCU usage: opp list is traversed with RCU locks. node
+ *             RCU usage: opp table is traversed with RCU locks. node
   *             modification is possible realtime, hence the modifications
- *             are protected by the dev_opp_list_lock for integrity.
+ *             are protected by the opp_table_lock for integrity.
   *             IMPORTANT: the opp nodes should be maintained in increasing
   *             order.
- * @dynamic:   not-created from static DT entries.
   * @available: true/false - marks if this OPP as available or not
+ * @dynamic:   not-created from static DT entries.
   * @turbo:     true if turbo (boost) OPP
+ * @suspend:   true if suspend OPP
   * @rate:      Frequency in hertz
   * @u_volt:    Target voltage in microvolts corresponding to this OPP
   * @u_volt_min:        Minimum voltage in microvolts corresponding to this OPP
@@ -60,9 +65,10 @@ extern struct mutex dev_opp_list_lock;
   * @u_amp:     Maximum current drawn by the device in microamperes
   * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
   *             frequency from any other OPP's frequency.
- * @dev_opp:   points back to the device_opp struct this opp belongs to
+ * @opp_table: points back to the opp_table struct this opp belongs to
   * @rcu_head:  RCU callback head used for deferred freeing
   * @np:                OPP's device node.
+ * @dentry:    debugfs dentry pointer (per opp)
   *
   * This structure stores the OPP information for a given device.
   */
@@ -72,6 +78,7 @@ struct dev_pm_opp {
         bool available;
         bool dynamic;
         bool turbo;
+       bool suspend;
         unsigned long rate;
  
         unsigned long u_volt;
@@ -80,40 +87,60 @@ struct dev_pm_opp {
         unsigned long u_amp;
         unsigned long clock_latency_ns;
  
-       struct device_opp *dev_opp;
+       struct opp_table *opp_table;
         struct rcu_head rcu_head;
  
         struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *dentry;
+#endif
  };
  
  /**
- * struct device_list_opp - devices managed by 'struct device_opp'
+ * struct opp_device - devices managed by 'struct opp_table'
   * @node:      list node
   * @dev:       device to which the struct object belongs
   * @rcu_head:  RCU callback head used for deferred freeing
+ * @dentry:    debugfs dentry pointer (per device)
   *
- * This is an internal data structure maintaining the list of devices that are
- * managed by 'struct device_opp'.
+ * This is an internal data structure maintaining the devices that are managed
+ * by 'struct opp_table'.
   */
-struct device_list_opp {
+struct opp_device {
         struct list_head node;
         const struct device *dev;
         struct rcu_head rcu_head;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *dentry;
+#endif
  };
  
  /**
- * struct device_opp - Device opp structure
- * @node:      list node - contains the devices with OPPs that
+ * struct opp_table - Device opp structure
+ * @node:      table node - contains the devices with OPPs that
   *             have been registered. Nodes once added are not modified in this
- *             list.
- *             RCU usage: nodes are not modified in the list of device_opp,
- *             however addition is possible and is secured by dev_opp_list_lock
+ *             table.
+ *             RCU usage: nodes are not modified in the list of opp_tables,
+ *             however addition is possible and is secured by opp_table_lock
   * @srcu_head: notifier head to notify the OPP availability changes.
   * @rcu_head:  RCU callback head used for deferred freeing
   * @dev_list:  list of devices that share these OPPs
- * @opp_list:  list of opps
+ * @opp_list:  table of opps
   * @np:                struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
   * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of version numbers to support.
+ * @supported_hw_count: Number of elements in supported_hw array.
+ * @prop_name: A name to postfix to many DT properties, while parsing them.
+ * @clk: Device's clock handle
+ * @regulator: Supply regulator
+ * @dentry:    debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
+ *
+ * @voltage_tolerance_v1: In percentage, for v1 bindings only.
   *
   * This is an internal data structure maintaining the link to opps attached to
   * a device. This structure is not meant to be shared to users as it is
@@ -123,7 +150,7 @@ struct device_list_opp {
   * need to wait for the grace period of both of them before freeing any
   * resources. And so we have used kfree_rcu() from within call_srcu() handlers.
   */
-struct device_opp {
+struct opp_table {
         struct list_head node;
  
         struct srcu_notifier_head srcu_head;
@@ -133,14 +160,48 @@ struct device_opp {
  
         struct device_node *np;
         unsigned long clock_latency_ns_max;
+
+       /* For backward compatibility with v1 bindings */
+       unsigned int voltage_tolerance_v1;
+
         bool shared_opp;
         struct dev_pm_opp *suspend_opp;
+
+       unsigned int *supported_hw;
+       unsigned int supported_hw_count;
+       const char *prop_name;
+       struct clk *clk;
+       struct regulator *regulator;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *dentry;
+       char dentry_name[NAME_MAX];
+#endif
  };
  
  /* Routines internal to opp core */
-struct device_opp *_find_device_opp(struct device *dev);
-struct device_list_opp *_add_list_dev(const struct device *dev,
-                                     struct device_opp *dev_opp);
+struct opp_table *_find_opp_table(struct device *dev);
+struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
  struct device_node *_of_get_opp_desc_node(struct device *dev);
  
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+                                      struct opp_table *opp_table)
+{ return 0; }
+static inline int opp_debug_register(struct opp_device *opp_dev,
+                                    struct opp_table *opp_table)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct opp_device *opp_dev,
+                                       struct opp_table *opp_table)
+{ }
+#endif         /* DEBUG_FS */
+
  #endif         /* __DRIVER_OPP_H__ */
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c

index 90d64081ddb34ee8ba7a06372a269defdcf07a97..f951f911786e086b2b6dc9d615018eb235347dbe 100644 (file)
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -31,9 +31,8 @@
  
  struct private_data {
         struct device *cpu_dev;
-       struct regulator *cpu_reg;
         struct thermal_cooling_device *cdev;
-       unsigned int voltage_tolerance; /* in percentage */
+       const char *reg_name;
  };
  
  static struct freq_attr *cpufreq_dt_attr[] = {
@@ -44,175 +43,128 @@ static struct freq_attr *cpufreq_dt_attr[] = {
  
  static int set_target(struct cpufreq_policy *policy, unsigned int index)
  {
-       struct dev_pm_opp *opp;
-       struct cpufreq_frequency_table *freq_table = policy->freq_table;
-       struct clk *cpu_clk = policy->clk;
         struct private_data *priv = policy->driver_data;
-       struct device *cpu_dev = priv->cpu_dev;
-       struct regulator *cpu_reg = priv->cpu_reg;
-       unsigned long volt = 0, volt_old = 0, tol = 0;
-       unsigned int old_freq, new_freq;
-       long freq_Hz, freq_exact;
-       int ret;
-
-       freq_Hz = clk_round_rate(cpu_clk, freq_table[index].frequency * 1000);
-       if (freq_Hz <= 0)
-               freq_Hz = freq_table[index].frequency * 1000;
  
-       freq_exact = freq_Hz;
-       new_freq = freq_Hz / 1000;
-       old_freq = clk_get_rate(cpu_clk) / 1000;
+       return dev_pm_opp_set_rate(priv->cpu_dev,
+                                  policy->freq_table[index].frequency * 1000);
+}
  
-       if (!IS_ERR(cpu_reg)) {
-               unsigned long opp_freq;
+/*
+ * An earlier version of opp-v1 bindings used to name the regulator
+ * "cpu0-supply", we still need to handle that for backwards compatibility.
+ */
+static const char *find_supply_name(struct device *dev)
+{
+       struct device_node *np;
+       struct property *pp;
+       int cpu = dev->id;
+       const char *name = NULL;
  
-               rcu_read_lock();
-               opp = dev_pm_opp_find_freq_ceil(cpu_dev, &freq_Hz);
-               if (IS_ERR(opp)) {
-                       rcu_read_unlock();
-                       dev_err(cpu_dev, "failed to find OPP for %ld\n",
-                               freq_Hz);
-                       return PTR_ERR(opp);
-               }
-               volt = dev_pm_opp_get_voltage(opp);
-               opp_freq = dev_pm_opp_get_freq(opp);
-               rcu_read_unlock();
-               tol = volt * priv->voltage_tolerance / 100;
-               volt_old = regulator_get_voltage(cpu_reg);
-               dev_dbg(cpu_dev, "Found OPP: %ld kHz, %ld uV\n",
-                       opp_freq / 1000, volt);
-       }
+       np = of_node_get(dev->of_node);
  
-       dev_dbg(cpu_dev, "%u MHz, %ld mV --> %u MHz, %ld mV\n",
-               old_freq / 1000, (volt_old > 0) ? volt_old / 1000 : -1,
-               new_freq / 1000, volt ? volt / 1000 : -1);
+       /* This must be valid for sure */
+       if (WARN_ON(!np))
+               return NULL;
  
-       /* scaling up?  scale voltage before frequency */
-       if (!IS_ERR(cpu_reg) && new_freq > old_freq) {
-               ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
-               if (ret) {
-                       dev_err(cpu_dev, "failed to scale voltage up: %d\n",
-                               ret);
-                       return ret;
+       /* Try "cpu0" for older DTs */
+       if (!cpu) {
+               pp = of_find_property(np, "cpu0-supply", NULL);
+               if (pp) {
+                       name = "cpu0";
+                       goto node_put;
                 }
         }
  
-       ret = clk_set_rate(cpu_clk, freq_exact);
-       if (ret) {
-               dev_err(cpu_dev, "failed to set clock rate: %d\n", ret);
-               if (!IS_ERR(cpu_reg) && volt_old > 0)
-                       regulator_set_voltage_tol(cpu_reg, volt_old, tol);
-               return ret;
+       pp = of_find_property(np, "cpu-supply", NULL);
+       if (pp) {
+               name = "cpu";
+               goto node_put;
         }
  
-       /* scaling down?  scale voltage after frequency */
-       if (!IS_ERR(cpu_reg) && new_freq < old_freq) {
-               ret = regulator_set_voltage_tol(cpu_reg, volt, tol);
-               if (ret) {
-                       dev_err(cpu_dev, "failed to scale voltage down: %d\n",
-                               ret);
-                       clk_set_rate(cpu_clk, old_freq * 1000);
-               }
-       }
-
-       return ret;
+       dev_dbg(dev, "no regulator for cpu%d\n", cpu);
+node_put:
+       of_node_put(np);
+       return name;
  }
  
-static int allocate_resources(int cpu, struct device **cdev,
-                             struct regulator **creg, struct clk **cclk)
+static int resources_available(void)
  {
         struct device *cpu_dev;
         struct regulator *cpu_reg;
         struct clk *cpu_clk;
         int ret = 0;
-       char *reg_cpu0 = "cpu0", *reg_cpu = "cpu", *reg;
+       const char *name;
  
-       cpu_dev = get_cpu_device(cpu);
+       cpu_dev = get_cpu_device(0);
         if (!cpu_dev) {
-               pr_err("failed to get cpu%d device\n", cpu);
+               pr_err("failed to get cpu0 device\n");
                 return -ENODEV;
         }
  
-       /* Try "cpu0" for older DTs */
-       if (!cpu)
-               reg = reg_cpu0;
-       else
-               reg = reg_cpu;
-
-try_again:
-       cpu_reg = regulator_get_optional(cpu_dev, reg);
-       if (IS_ERR(cpu_reg)) {
+       cpu_clk = clk_get(cpu_dev, NULL);
+       ret = PTR_ERR_OR_ZERO(cpu_clk);
+       if (ret) {
                 /*
-                * If cpu's regulator supply node is present, but regulator is
-                * not yet registered, we should try defering probe.
+                * If cpu's clk node is present, but clock is not yet
+                * registered, we should try defering probe.
                  */
-               if (PTR_ERR(cpu_reg) == -EPROBE_DEFER) {
-                       dev_dbg(cpu_dev, "cpu%d regulator not ready, retry\n",
-                               cpu);
-                       return -EPROBE_DEFER;
-               }
-
-               /* Try with "cpu-supply" */
-               if (reg == reg_cpu0) {
-                       reg = reg_cpu;
-                       goto try_again;
-               }
+               if (ret == -EPROBE_DEFER)
+                       dev_dbg(cpu_dev, "clock not ready, retry\n");
+               else
+                       dev_err(cpu_dev, "failed to get clock: %d\n", ret);
  
-               dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n",
-                       cpu, PTR_ERR(cpu_reg));
+               return ret;
         }
  
-       cpu_clk = clk_get(cpu_dev, NULL);
-       if (IS_ERR(cpu_clk)) {
-               /* put regulator */
-               if (!IS_ERR(cpu_reg))
-                       regulator_put(cpu_reg);
+       clk_put(cpu_clk);
  
-               ret = PTR_ERR(cpu_clk);
+       name = find_supply_name(cpu_dev);
+       /* Platform doesn't require regulator */
+       if (!name)
+               return 0;
  
+       cpu_reg = regulator_get_optional(cpu_dev, name);
+       ret = PTR_ERR_OR_ZERO(cpu_reg);
+       if (ret) {
                 /*
-                * If cpu's clk node is present, but clock is not yet
-                * registered, we should try defering probe.
+                * If cpu's regulator supply node is present, but regulator is
+                * not yet registered, we should try defering probe.
                  */
                 if (ret == -EPROBE_DEFER)
-                       dev_dbg(cpu_dev, "cpu%d clock not ready, retry\n", cpu);
+                       dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n");
                 else
-                       dev_err(cpu_dev, "failed to get cpu%d clock: %d\n", cpu,
-                               ret);
-       } else {
-               *cdev = cpu_dev;
-               *creg = cpu_reg;
-               *cclk = cpu_clk;
+                       dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret);
+
+               return ret;
         }
  
-       return ret;
+       regulator_put(cpu_reg);
+       return 0;
  }
  
  static int cpufreq_init(struct cpufreq_policy *policy)
  {
         struct cpufreq_frequency_table *freq_table;
-       struct device_node *np;
         struct private_data *priv;
         struct device *cpu_dev;
-       struct regulator *cpu_reg;
         struct clk *cpu_clk;
         struct dev_pm_opp *suspend_opp;
-       unsigned long min_uV = ~0, max_uV = 0;
         unsigned int transition_latency;
-       bool need_update = false;
+       bool opp_v1 = false;
+       const char *name;
         int ret;
  
-       ret = allocate_resources(policy->cpu, &cpu_dev, &cpu_reg, &cpu_clk);
-       if (ret) {
-               pr_err("%s: Failed to allocate resources: %d\n", __func__, ret);
-               return ret;
+       cpu_dev = get_cpu_device(policy->cpu);
+       if (!cpu_dev) {
+               pr_err("failed to get cpu%d device\n", policy->cpu);
+               return -ENODEV;
         }
  
-       np = of_node_get(cpu_dev->of_node);
-       if (!np) {
-               dev_err(cpu_dev, "failed to find cpu%d node\n", policy->cpu);
-               ret = -ENOENT;
-               goto out_put_reg_clk;
+       cpu_clk = clk_get(cpu_dev, NULL);
+       if (IS_ERR(cpu_clk)) {
+               ret = PTR_ERR(cpu_clk);
+               dev_err(cpu_dev, "%s: failed to get clk: %d\n", __func__, ret);
+               return ret;
         }
  
         /* Get OPP-sharing information from "operating-points-v2" bindings */
@@ -223,9 +175,23 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                  * finding shared-OPPs for backward compatibility.
                  */
                 if (ret == -ENOENT)
-                       need_update = true;
+                       opp_v1 = true;
                 else
-                       goto out_node_put;
+                       goto out_put_clk;
+       }
+
+       /*
+        * OPP layer will be taking care of regulators now, but it needs to know
+        * the name of the regulator first.
+        */
+       name = find_supply_name(cpu_dev);
+       if (name) {
+               ret = dev_pm_opp_set_regulator(cpu_dev, name);
+               if (ret) {
+                       dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
+                               policy->cpu, ret);
+                       goto out_put_clk;
+               }
         }
  
         /*
@@ -246,12 +212,12 @@ static int cpufreq_init(struct cpufreq_policy *policy)
          */
         ret = dev_pm_opp_get_opp_count(cpu_dev);
         if (ret <= 0) {
-               pr_debug("OPP table is not ready, deferring probe\n");
+               dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n");
                 ret = -EPROBE_DEFER;
                 goto out_free_opp;
         }
  
-       if (need_update) {
+       if (opp_v1) {
                 struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
  
                 if (!pd || !pd->independent_clocks)
@@ -265,10 +231,6 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                 if (ret)
                         dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
                                 __func__, ret);
-
-               of_property_read_u32(np, "clock-latency", &transition_latency);
-       } else {
-               transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev);
         }
  
         priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -277,62 +239,16 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                 goto out_free_opp;
         }
  
-       of_property_read_u32(np, "voltage-tolerance", &priv->voltage_tolerance);
-
-       if (!transition_latency)
-               transition_latency = CPUFREQ_ETERNAL;
-
-       if (!IS_ERR(cpu_reg)) {
-               unsigned long opp_freq = 0;
-
-               /*
-                * Disable any OPPs where the connected regulator isn't able to
-                * provide the specified voltage and record minimum and maximum
-                * voltage levels.
-                */
-               while (1) {
-                       struct dev_pm_opp *opp;
-                       unsigned long opp_uV, tol_uV;
-
-                       rcu_read_lock();
-                       opp = dev_pm_opp_find_freq_ceil(cpu_dev, &opp_freq);
-                       if (IS_ERR(opp)) {
-                               rcu_read_unlock();
-                               break;
-                       }
-                       opp_uV = dev_pm_opp_get_voltage(opp);
-                       rcu_read_unlock();
-
-                       tol_uV = opp_uV * priv->voltage_tolerance / 100;
-                       if (regulator_is_supported_voltage(cpu_reg,
-                                                          opp_uV - tol_uV,
-                                                          opp_uV + tol_uV)) {
-                               if (opp_uV < min_uV)
-                                       min_uV = opp_uV;
-                               if (opp_uV > max_uV)
-                                       max_uV = opp_uV;
-                       } else {
-                               dev_pm_opp_disable(cpu_dev, opp_freq);
-                       }
-
-                       opp_freq++;
-               }
-
-               ret = regulator_set_voltage_time(cpu_reg, min_uV, max_uV);
-               if (ret > 0)
-                       transition_latency += ret * 1000;
-       }
+       priv->reg_name = name;
  
         ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
         if (ret) {
-               pr_err("failed to init cpufreq table: %d\n", ret);
+               dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret);
                 goto out_free_priv;
         }
  
         priv->cpu_dev = cpu_dev;
-       priv->cpu_reg = cpu_reg;
         policy->driver_data = priv;
-
         policy->clk = cpu_clk;
  
         rcu_read_lock();
@@ -357,9 +273,11 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                 cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs;
         }
  
-       policy->cpuinfo.transition_latency = transition_latency;
+       transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev);
+       if (!transition_latency)
+               transition_latency = CPUFREQ_ETERNAL;
  
-       of_node_put(np);
+       policy->cpuinfo.transition_latency = transition_latency;
  
         return 0;
  
@@ -369,12 +287,10 @@ out_free_priv:
         kfree(priv);
  out_free_opp:
         dev_pm_opp_of_cpumask_remove_table(policy->cpus);
-out_node_put:
-       of_node_put(np);
-out_put_reg_clk:
+       if (name)
+               dev_pm_opp_put_regulator(cpu_dev);
+out_put_clk:
         clk_put(cpu_clk);
-       if (!IS_ERR(cpu_reg))
-               regulator_put(cpu_reg);
  
         return ret;
  }
@@ -386,9 +302,10 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
         cpufreq_cooling_unregister(priv->cdev);
         dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
         dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
+       if (priv->reg_name)
+               dev_pm_opp_put_regulator(priv->cpu_dev);
+
         clk_put(policy->clk);
-       if (!IS_ERR(priv->cpu_reg))
-               regulator_put(priv->cpu_reg);
         kfree(priv);
  
         return 0;
@@ -407,8 +324,13 @@ static void cpufreq_ready(struct cpufreq_policy *policy)
          * thermal DT code takes care of matching them.
          */
         if (of_find_property(np, "#cooling-cells", NULL)) {
-               priv->cdev = of_cpufreq_cooling_register(np,
-                                                        policy->related_cpus);
+               u32 power_coefficient = 0;
+
+               of_property_read_u32(np, "dynamic-power-coefficient",
+                                    &power_coefficient);
+
+               priv->cdev = of_cpufreq_power_cooling_register(np,
+                               policy->related_cpus, power_coefficient, NULL);
                 if (IS_ERR(priv->cdev)) {
                         dev_err(priv->cpu_dev,
                                 "running cpufreq without cooling device: %ld\n",
@@ -436,9 +358,6 @@ static struct cpufreq_driver dt_cpufreq_driver = {
  
  static int dt_cpufreq_probe(struct platform_device *pdev)
  {
-       struct device *cpu_dev;
-       struct regulator *cpu_reg;
-       struct clk *cpu_clk;
         int ret;
  
         /*
@@ -448,19 +367,15 @@ static int dt_cpufreq_probe(struct platform_device *pdev)
          *
          * FIXME: Is checking this only for CPU0 sufficient ?
          */
-       ret = allocate_resources(0, &cpu_dev, &cpu_reg, &cpu_clk);
+       ret = resources_available();
         if (ret)
                 return ret;
  
-       clk_put(cpu_clk);
-       if (!IS_ERR(cpu_reg))
-               regulator_put(cpu_reg);
-
         dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev);
  
         ret = cpufreq_register_driver(&dt_cpufreq_driver);
         if (ret)
-               dev_err(cpu_dev, "failed register driver: %d\n", ret);
+               dev_err(&pdev->dev, "failed register driver: %d\n", ret);
  
         return ret;
  }
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig

index cf478fe6b335bc2cde8da7ae7f6695f9576f38ef..49a3a1185bb607ed45297fe4dea5ff2c6344f37f 100644 (file)
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -173,6 +173,9 @@ config QCOM_SCM_64
         def_bool y
         depends on QCOM_SCM && ARM64
  
+config HAVE_ARM_SMCCC
+       bool
+
  source "drivers/firmware/broadcom/Kconfig"
  source "drivers/firmware/google/Kconfig"
  source "drivers/firmware/efi/Kconfig"
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile

index 3c0467d3688cff14df877fea66d61c3fbb3279bd..c4098748e1fe8d5096a7161c2b08e54f80ae1d71 100644 (file)
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -8,7 +8,7 @@ cflags-$(CONFIG_X86_32)         := -march=i386
  cflags-$(CONFIG_X86_64)                := -mcmodel=small
  cflags-$(CONFIG_X86)           += -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \
                                    -fPIC -fno-strict-aliasing -mno-red-zone \
-                                  -mno-mmx -mno-sse -DDISABLE_BRANCH_PROFILING
+                                  -mno-mmx -mno-sse
  
  cflags-$(CONFIG_ARM64)         := $(subst -pg,,$(KBUILD_CFLAGS))
  cflags-$(CONFIG_ARM)           := $(subst -pg,,$(KBUILD_CFLAGS)) \
@@ -16,7 +16,7 @@ cflags-$(CONFIG_ARM)          := $(subst -pg,,$(KBUILD_CFLAGS)) \
  
  cflags-$(CONFIG_EFI_ARMSTUB)   += -I$(srctree)/scripts/dtc/libfdt
  
-KBUILD_CFLAGS                  := $(cflags-y) \
+KBUILD_CFLAGS                  := $(cflags-y) -DDISABLE_BRANCH_PROFILING \
                                    $(call cc-option,-ffreestanding) \
                                    $(call cc-option,-fno-stack-protector)
  
@@ -34,7 +34,8 @@ $(obj)/lib-%.o: $(srctree)/lib/%.c FORCE
  lib-$(CONFIG_EFI_ARMSTUB)      += arm-stub.o fdt.o string.o \
                                    $(patsubst %.c,lib-%.o,$(arm-deps))
  
-lib-$(CONFIG_ARM64)            += arm64-stub.o
+lib-$(CONFIG_ARM)              += arm32-stub.o
+lib-$(CONFIG_ARM64)            += arm64-stub.o random.o
  CFLAGS_arm64-stub.o            := -DTEXT_OFFSET=$(TEXT_OFFSET)
  
  #
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c

index 950c87f5d279335210088e4154eda135b24304d5..d5aa1d16154f5cb100e865a1bfbb8e0e49fb542c 100644 (file)
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -18,6 +18,8 @@
  
  #include "efistub.h"
  
+bool __nokaslr;
+
  static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg)
  {
         static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID;
@@ -207,14 +209,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
                 pr_efi_err(sys_table, "Failed to find DRAM base\n");
                 goto fail;
         }
-       status = handle_kernel_image(sys_table, image_addr, &image_size,
-                                    &reserve_addr,
-                                    &reserve_size,
-                                    dram_base, image);
-       if (status != EFI_SUCCESS) {
-               pr_efi_err(sys_table, "Failed to relocate kernel\n");
-               goto fail;
-       }
  
         /*
          * Get the command line from EFI, using the LOADED_IMAGE
@@ -224,7 +218,28 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
         cmdline_ptr = efi_convert_cmdline(sys_table, image, &cmdline_size);
         if (!cmdline_ptr) {
                 pr_efi_err(sys_table, "getting command line via LOADED_IMAGE_PROTOCOL\n");
-               goto fail_free_image;
+               goto fail;
+       }
+
+       /* check whether 'nokaslr' was passed on the command line */
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+               static const u8 default_cmdline[] = CONFIG_CMDLINE;
+               const u8 *str, *cmdline = cmdline_ptr;
+
+               if (IS_ENABLED(CONFIG_CMDLINE_FORCE))
+                       cmdline = default_cmdline;
+               str = strstr(cmdline, "nokaslr");
+               if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
+                       __nokaslr = true;
+       }
+
+       status = handle_kernel_image(sys_table, image_addr, &image_size,
+                                    &reserve_addr,
+                                    &reserve_size,
+                                    dram_base, image);
+       if (status != EFI_SUCCESS) {
+               pr_efi_err(sys_table, "Failed to relocate kernel\n");
+               goto fail_free_cmdline;
         }
  
         status = efi_parse_options(cmdline_ptr);
@@ -244,7 +259,7 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
  
                 if (status != EFI_SUCCESS) {
                         pr_efi_err(sys_table, "Failed to load device tree!\n");
-                       goto fail_free_cmdline;
+                       goto fail_free_image;
                 }
         }
  
@@ -286,12 +301,11 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
         efi_free(sys_table, initrd_size, initrd_addr);
         efi_free(sys_table, fdt_size, fdt_addr);
  
-fail_free_cmdline:
-       efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr);
-
  fail_free_image:
         efi_free(sys_table, image_size, *image_addr);
         efi_free(sys_table, reserve_size, reserve_addr);
+fail_free_cmdline:
+       efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr);
  fail:
         return EFI_ERROR;
  }
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c

index 78dfbd34b6bffd2fa36312da89dc6ca43f036c3c..377d935a33803402c0c8a8d1085023e364368713 100644 (file)
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -13,6 +13,10 @@
  #include <asm/efi.h>
  #include <asm/sections.h>
  
+#include "efistub.h"
+
+extern bool __nokaslr;
+
  efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg,
                                         unsigned long *image_addr,
                                         unsigned long *image_size,
@@ -23,26 +27,61 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg,
  {
         efi_status_t status;
         unsigned long kernel_size, kernel_memsize = 0;
-       unsigned long nr_pages;
         void *old_image_addr = (void *)*image_addr;
         unsigned long preferred_offset;
+       u64 phys_seed = 0;
+
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+               if (!__nokaslr) {
+                       status = efi_get_random_bytes(sys_table_arg,
+                                                     sizeof(phys_seed),
+                                                     (u8 *)&phys_seed);
+                       if (status == EFI_NOT_FOUND) {
+                               pr_efi(sys_table_arg, "EFI_RNG_PROTOCOL unavailable, no randomness supplied\n");
+                       } else if (status != EFI_SUCCESS) {
+                               pr_efi_err(sys_table_arg, "efi_get_random_bytes() failed\n");
+                               return status;
+                       }
+               } else {
+                       pr_efi(sys_table_arg, "KASLR disabled on kernel command line\n");
+               }
+       }
  
         /*
          * The preferred offset of the kernel Image is TEXT_OFFSET bytes beyond
          * a 2 MB aligned base, which itself may be lower than dram_base, as
          * long as the resulting offset equals or exceeds it.
          */
-       preferred_offset = round_down(dram_base, SZ_2M) + TEXT_OFFSET;
+       preferred_offset = round_down(dram_base, MIN_KIMG_ALIGN) + TEXT_OFFSET;
         if (preferred_offset < dram_base)
-               preferred_offset += SZ_2M;
+               preferred_offset += MIN_KIMG_ALIGN;
  
-       /* Relocate the image, if required. */
         kernel_size = _edata - _text;
-       if (*image_addr != preferred_offset) {
-               kernel_memsize = kernel_size + (_end - _edata);
+       kernel_memsize = kernel_size + (_end - _edata);
+
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) {
+               /*
+                * If CONFIG_DEBUG_ALIGN_RODATA is not set, produce a
+                * displacement in the interval [0, MIN_KIMG_ALIGN) that
+                * is a multiple of the minimal segment alignment (SZ_64K)
+                */
+               u32 mask = (MIN_KIMG_ALIGN - 1) & ~(SZ_64K - 1);
+               u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ?
+                            (phys_seed >> 32) & mask : TEXT_OFFSET;
+
+               /*
+                * If KASLR is enabled, and we have some randomness available,
+                * locate the kernel at a randomized offset in physical memory.
+                */
+               *reserve_size = kernel_memsize + offset;
+               status = efi_random_alloc(sys_table_arg, *reserve_size,
+                                         MIN_KIMG_ALIGN, reserve_addr,
+                                         (u32)phys_seed);
  
+               *image_addr = *reserve_addr + offset;
+       } else {
                 /*
-                * First, try a straight allocation at the preferred offset.
+                * Else, try a straight allocation at the preferred offset.
                  * This will work around the issue where, if dram_base == 0x0,
                  * efi_low_alloc() refuses to allocate at 0x0 (to prevent the
                  * address of the allocation to be mistaken for a FAIL return
@@ -52,27 +91,31 @@ efi_status_t __init handle_kernel_image(efi_system_table_t *sys_table_arg,
                  * Mustang), we can still place the kernel at the address
                  * 'dram_base + TEXT_OFFSET'.
                  */
+               if (*image_addr == preferred_offset)
+                       return EFI_SUCCESS;
+
                 *image_addr = *reserve_addr = preferred_offset;
-               nr_pages = round_up(kernel_memsize, EFI_ALLOC_ALIGN) /
-                          EFI_PAGE_SIZE;
+               *reserve_size = round_up(kernel_memsize, EFI_ALLOC_ALIGN);
+
                 status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS,
-                                       EFI_LOADER_DATA, nr_pages,
+                                       EFI_LOADER_DATA,
+                                       *reserve_size / EFI_PAGE_SIZE,
                                         (efi_physical_addr_t *)reserve_addr);
-               if (status != EFI_SUCCESS) {
-                       kernel_memsize += TEXT_OFFSET;
-                       status = efi_low_alloc(sys_table_arg, kernel_memsize,
-                                              SZ_2M, reserve_addr);
+       }
  
-                       if (status != EFI_SUCCESS) {
-                               pr_efi_err(sys_table_arg, "Failed to relocate kernel\n");
-                               return status;
-                       }
-                       *image_addr = *reserve_addr + TEXT_OFFSET;
+       if (status != EFI_SUCCESS) {
+               *reserve_size = kernel_memsize + TEXT_OFFSET;
+               status = efi_low_alloc(sys_table_arg, *reserve_size,
+                                      MIN_KIMG_ALIGN, reserve_addr);
+
+               if (status != EFI_SUCCESS) {
+                       pr_efi_err(sys_table_arg, "Failed to relocate kernel\n");
+                       *reserve_size = 0;
+                       return status;
                 }
-               memcpy((void *)*image_addr, old_image_addr, kernel_size);
-               *reserve_size = kernel_memsize;
+               *image_addr = *reserve_addr + TEXT_OFFSET;
         }
-
+       memcpy((void *)*image_addr, old_image_addr, kernel_size);
  
         return EFI_SUCCESS;
  }
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c

index f07d4a67fa76b3a3cb542e31a24a093c6f7aff97..29ed2f9b218ca9892bfcc72da2d91ba4750f4c97 100644 (file)
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -649,6 +649,10 @@ static u8 *efi_utf16_to_utf8(u8 *dst, const u16 *src, int n)
         return dst;
  }
  
+#ifndef MAX_CMDLINE_ADDRESS
+#define MAX_CMDLINE_ADDRESS    ULONG_MAX
+#endif
+
  /*
   * Convert the unicode UEFI command line to ASCII to pass to kernel.
   * Size of memory allocated return in *cmd_line_len.
@@ -684,7 +688,8 @@ char *efi_convert_cmdline(efi_system_table_t *sys_table_arg,
  
         options_bytes++;        /* NUL termination */
  
-       status = efi_low_alloc(sys_table_arg, options_bytes, 0, &cmdline_addr);
+       status = efi_high_alloc(sys_table_arg, options_bytes, 0,
+                               &cmdline_addr, MAX_CMDLINE_ADDRESS);
         if (status != EFI_SUCCESS)
                 return NULL;
  
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h

index 6b6548fda0895ecb0ca7e9a60699d7100e335566..5ed3d3f3816637cd10d007f381797320f0567305 100644 (file)
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -43,4 +43,11 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
                      unsigned long desc_size, efi_memory_desc_t *runtime_map,
                      int *count);
  
+efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table,
+                                 unsigned long size, u8 *out);
+
+efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg,
+                             unsigned long size, unsigned long align,
+                             unsigned long *addr, unsigned long random_seed);
+
  #endif
diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c

index b62e2f5dcab3b2d95074b534de803915145dc20c..b1c22cf18f7d39f531bca06d99ac44ff1d6a9e32 100644 (file)
--- a/drivers/firmware/efi/libstub/fdt.c
+++ b/drivers/firmware/efi/libstub/fdt.c
@@ -147,6 +147,20 @@ efi_status_t update_fdt(efi_system_table_t *sys_table, void *orig_fdt,
         if (status)
                 goto fdt_set_fail;
  
+       if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+               efi_status_t efi_status;
+
+               efi_status = efi_get_random_bytes(sys_table, sizeof(fdt_val64),
+                                                 (u8 *)&fdt_val64);
+               if (efi_status == EFI_SUCCESS) {
+                       status = fdt_setprop(fdt, node, "kaslr-seed",
+                                            &fdt_val64, sizeof(fdt_val64));
+                       if (status)
+                               goto fdt_set_fail;
+               } else if (efi_status != EFI_NOT_FOUND) {
+                       return efi_status;
+               }
+       }
         return EFI_SUCCESS;
  
  fdt_set_fail:
diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c

new file mode 100644 (file)

index 0000000..53f6d3f
--- /dev/null
+++ b/drivers/firmware/efi/libstub/random.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2016 Linaro Ltd;  <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+
+#include "efistub.h"
+
+struct efi_rng_protocol {
+       efi_status_t (*get_info)(struct efi_rng_protocol *,
+                                unsigned long *, efi_guid_t *);
+       efi_status_t (*get_rng)(struct efi_rng_protocol *,
+                               efi_guid_t *, unsigned long, u8 *out);
+};
+
+efi_status_t efi_get_random_bytes(efi_system_table_t *sys_table_arg,
+                                 unsigned long size, u8 *out)
+{
+       efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID;
+       efi_status_t status;
+       struct efi_rng_protocol *rng;
+
+       status = efi_call_early(locate_protocol, &rng_proto, NULL,
+                               (void **)&rng);
+       if (status != EFI_SUCCESS)
+               return status;
+
+       return rng->get_rng(rng, NULL, size, out);
+}
+
+/*
+ * Return the number of slots covered by this entry, i.e., the number of
+ * suitably aligned addresses inside it that leave room for 'size' bytes.
+ * Entries that are not conventional memory or are too small yield 0.
+ */
+static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
+                                        unsigned long size,
+                                        unsigned long align)
+{
+       u64 start, end;
+
+       /* the size check keeps the subtraction below from wrapping */
+       if (md->type != EFI_CONVENTIONAL_MEMORY ||
+           md->num_pages * EFI_PAGE_SIZE < size)
+               return 0;
+       start = round_up(md->phys_addr, align);
+       end = round_down(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - size,
+                        align);
+       if (start > end)
+               return 0;
+       /* start and end are both valid slots, hence the + 1 */
+       return (end - start) / align + 1;
+}
+
+/*
+ * The UEFI memory descriptors have a virtual address field that is only used
+ * when installing the virtual mapping using SetVirtualAddressMap(). Since it
+ * is unused here, we can reuse it to keep track of each descriptor's slot
+ * count.
+ */
+#define MD_NUM_SLOTS(md)       ((md)->virt_addr)
+
+/*
+ * Allocate 'size' bytes of EFI_LOADER_DATA at a randomly chosen address,
+ * aligned to at least 'align', and store it in *addr. Returns
+ * EFI_OUT_OF_RESOURCES when no memory map entry has a suitable slot.
+ */
+efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg,
+                             unsigned long size, unsigned long align,
+                             unsigned long *addr, unsigned long random_seed)
+{
+       unsigned long map_size, desc_size, total_slots = 0, target_slot;
+       efi_status_t status;
+       efi_memory_desc_t *memory_map;
+       int map_offset;
+
+       status = efi_get_memory_map(sys_table_arg, &memory_map, &map_size,
+                                   &desc_size, NULL, NULL);
+       if (status != EFI_SUCCESS)
+               return status;
+
+       if (align < EFI_ALLOC_ALIGN)
+               align = EFI_ALLOC_ALIGN;
+
+       /* count the suitable slots in each memory map entry */
+       for (map_offset = 0; map_offset < map_size; map_offset += desc_size) {
+               efi_memory_desc_t *md = (void *)memory_map + map_offset;
+               unsigned long slots;
+
+               slots = get_entry_num_slots(md, size, align);
+               MD_NUM_SLOTS(md) = slots;
+               total_slots += slots;
+       }
+
+       /* scale the low 16 bits of the seed into the range [0, total_slots) */
+       target_slot = (total_slots * (u16)random_seed) >> 16;
+
+       /*
+        * Walk the memory map again, subtracting each entry's slot count
+        * until we reach the entry covering target_slot; the residual value
+        * then selects the address, which is allocated directly using
+        * EFI_ALLOCATE_ADDRESS. If total_slots is zero no entry matches, so
+        * start from a failure status rather than the stale EFI_SUCCESS.
+        */
+       status = EFI_OUT_OF_RESOURCES;
+       for (map_offset = 0; map_offset < map_size; map_offset += desc_size) {
+               efi_memory_desc_t *md = (void *)memory_map + map_offset;
+               efi_physical_addr_t target;
+               unsigned long pages;
+
+               if (target_slot >= MD_NUM_SLOTS(md)) {
+                       target_slot -= MD_NUM_SLOTS(md);
+                       continue;
+               }
+
+               target = round_up(md->phys_addr, align) + target_slot * align;
+               pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+
+               status = efi_call_early(allocate_pages, EFI_ALLOCATE_ADDRESS,
+                                       EFI_LOADER_DATA, pages, &target);
+               if (status == EFI_SUCCESS)
+                       *addr = target;
+               break;
+       }
+
+       efi_call_early(free_pool, memory_map);
+
+       return status;
+}
diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c

index d24f35d74b27079afeae5c08d601c3bcd899dee6..11bfee8b79a9f65418bcbe53dfc708edb220e69d 100644 (file)
--- a/drivers/firmware/psci.c
+++ b/drivers/firmware/psci.c
@@ -13,6 +13,8 @@
  
  #define pr_fmt(fmt) "psci: " fmt
  
+#include <linux/arm-smccc.h>
+#include <linux/cpuidle.h>
  #include <linux/errno.h>
  #include <linux/linkage.h>
  #include <linux/of.h>
@@ -20,10 +22,12 @@
  #include <linux/printk.h>
  #include <linux/psci.h>
  #include <linux/reboot.h>
+#include <linux/slab.h>
  #include <linux/suspend.h>
  
  #include <uapi/linux/psci.h>
  
+#include <asm/cpuidle.h>
  #include <asm/cputype.h>
  #include <asm/system_misc.h>
  #include <asm/smp_plat.h>
@@ -58,8 +62,6 @@ struct psci_operations psci_ops;
  
  typedef unsigned long (psci_fn)(unsigned long, unsigned long,
                                 unsigned long, unsigned long);
-asmlinkage psci_fn __invoke_psci_fn_hvc;
-asmlinkage psci_fn __invoke_psci_fn_smc;
  static psci_fn *invoke_psci_fn;
  
  enum psci_function {
@@ -107,6 +109,26 @@ bool psci_power_state_is_valid(u32 state)
         return !(state & ~valid_mask);
  }
  
+static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
+                       unsigned long arg0, unsigned long arg1,
+                       unsigned long arg2)
+{
+       struct arm_smccc_res res;
+
+       arm_smccc_hvc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
+       return res.a0;
+}
+
+static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
+                       unsigned long arg0, unsigned long arg1,
+                       unsigned long arg2)
+{
+       struct arm_smccc_res res;
+
+       arm_smccc_smc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
+       return res.a0;
+}
+
  static int psci_to_linux_errno(int errno)
  {
         switch (errno) {
@@ -225,6 +247,123 @@ static int __init psci_features(u32 psci_func_id)
                               psci_func_id, 0, 0);
  }
  
+#ifdef CONFIG_CPU_IDLE
+static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
+
+static int psci_dt_cpu_init_idle(struct device_node *cpu_node, int cpu)
+{
+       int i, ret, count = 0;
+       u32 *psci_states;
+       struct device_node *state_node;
+
+       /*
+        * If the PSCI cpu_suspend function hook has not been initialized
+        * idle states must not be enabled, so bail out
+        */
+       if (!psci_ops.cpu_suspend)
+               return -EOPNOTSUPP;
+
+       /* Count idle states */
+       while ((state_node = of_parse_phandle(cpu_node, "cpu-idle-states",
+                                             count))) {
+               count++;
+               of_node_put(state_node);
+       }
+
+       if (!count)
+               return -ENODEV;
+
+       psci_states = kcalloc(count, sizeof(*psci_states), GFP_KERNEL);
+       if (!psci_states)
+               return -ENOMEM;
+
+       for (i = 0; i < count; i++) {
+               u32 state;
+
+               state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
+
+               ret = of_property_read_u32(state_node,
+                                          "arm,psci-suspend-param",
+                                          &state);
+               if (ret) {
+                       pr_warn(" * %s missing arm,psci-suspend-param property\n",
+                               state_node->full_name);
+                       of_node_put(state_node);
+                       goto free_mem;
+               }
+
+               of_node_put(state_node);
+               pr_debug("psci-power-state %#x index %d\n", state, i);
+               if (!psci_power_state_is_valid(state)) {
+                       pr_warn("Invalid PSCI power state %#x\n", state);
+                       ret = -EINVAL;
+                       goto free_mem;
+               }
+               psci_states[i] = state;
+       }
+       /* Idle states parsed correctly, initialize per-cpu pointer */
+       per_cpu(psci_power_state, cpu) = psci_states;
+       return 0;
+
+free_mem:
+       kfree(psci_states);
+       return ret;
+}
+
+int psci_cpu_init_idle(unsigned int cpu)
+{
+       struct device_node *cpu_node;
+       int ret;
+
+       cpu_node = of_get_cpu_node(cpu, NULL);
+       if (!cpu_node)
+               return -ENODEV;
+
+       ret = psci_dt_cpu_init_idle(cpu_node, cpu);
+
+       of_node_put(cpu_node);
+
+       return ret;
+}
+
+static int psci_suspend_finisher(unsigned long index)
+{
+       u32 *state = __this_cpu_read(psci_power_state);
+
+       return psci_ops.cpu_suspend(state[index - 1],
+                                   virt_to_phys(cpu_resume));
+}
+
+int psci_cpu_suspend_enter(unsigned long index)
+{
+       int ret;
+       u32 *state = __this_cpu_read(psci_power_state);
+       /*
+        * idle state index 0 corresponds to wfi, should never be called
+        * from the cpu_suspend operations
+        */
+       if (WARN_ON_ONCE(!index))
+               return -EINVAL;
+
+       if (!psci_power_state_loses_context(state[index - 1]))
+               ret = psci_ops.cpu_suspend(state[index - 1], 0);
+       else
+               ret = cpu_suspend(index, psci_suspend_finisher);
+
+       return ret;
+}
+
+/* ARM specific CPU idle operations */
+#ifdef CONFIG_ARM
+static struct cpuidle_ops psci_cpuidle_ops __initdata = {
+       .suspend = psci_cpu_suspend_enter,
+       .init = psci_dt_cpu_init_idle,
+};
+
+CPUIDLE_METHOD_OF_DECLARE(psci, "arm,psci", &psci_cpuidle_ops);
+#endif
+#endif
+
  static int psci_system_suspend(unsigned long unused)
  {
         return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c

index 6ed7d63a0688384830894f4a455463bd7e9e060c..201947b4377c769d9393d718fc5b35c5000328ed 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -513,9 +513,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma,
                                 return ret;
  
                         if (r->presumed_offset != offset &&
-                           __copy_to_user_inatomic(&user_relocs->presumed_offset,
-                                                   &r->presumed_offset,
-                                                   sizeof(r->presumed_offset))) {
+                           __put_user(r->presumed_offset, &user_relocs->presumed_offset)) {
                                 return -EFAULT;
                         }
  
diff --git a/drivers/hwtracing/coresight/Kconfig b/drivers/hwtracing/coresight/Kconfig

index 6c8921140f024c300bf34234de4c8545496953a5..130cb21140592bd995e9e5c1c1bd10e7156ef7e7 100644 (file)
--- a/drivers/hwtracing/coresight/Kconfig
+++ b/drivers/hwtracing/coresight/Kconfig
@@ -4,11 +4,12 @@
  menuconfig CORESIGHT
         bool "CoreSight Tracing Support"
         select ARM_AMBA
+       select PERF_EVENTS
         help
           This framework provides a kernel interface for the CoreSight debug
           and trace drivers to register themselves with. It's intended to build
           a topological view of the CoreSight components based on a DT
-         specification and configure the right serie of components when a
+         specification and configure the right series of components when a
           trace source gets enabled.
  
  if CORESIGHT
@@ -77,4 +78,15 @@ config CORESIGHT_QCOM_REPLICATOR
           programmable ATB replicator sends the ATB trace stream from the
           ETB/ETF to the TPIUi and ETR.
  
+config CORESIGHT_STM
+       bool "CoreSight System Trace Macrocell driver"
+       depends on (ARM && !(CPU_32v3 || CPU_32v4 || CPU_32v4T)) || ARM64
+       select CORESIGHT_LINKS_AND_SINKS
+       select STM
+       help
+         This driver provides support for hardware assisted software
+         instrumentation based tracing. This is primarily used for
+         logging useful software events or data coming from various entities
+         in the system, possibly running different OSs.
+
  endif
diff --git a/drivers/hwtracing/coresight/Makefile b/drivers/hwtracing/coresight/Makefile

index 99f8e5f6256e25c438862c2f93b80f4e59eb0f64..af480d9c1441ae30d41269f5c9f1e7588acd14e3 100644 (file)
--- a/drivers/hwtracing/coresight/Makefile
+++ b/drivers/hwtracing/coresight/Makefile
@@ -1,13 +1,18 @@
  #
  # Makefile for CoreSight drivers.
  #
-obj-$(CONFIG_CORESIGHT) += coresight.o
+obj-$(CONFIG_CORESIGHT) += coresight.o coresight-etm-perf.o
  obj-$(CONFIG_OF) += of_coresight.o
-obj-$(CONFIG_CORESIGHT_LINK_AND_SINK_TMC) += coresight-tmc.o
+obj-$(CONFIG_CORESIGHT_LINK_AND_SINK_TMC) += coresight-tmc.o \
+                                            coresight-tmc-etf.o \
+                                            coresight-tmc-etr.o
  obj-$(CONFIG_CORESIGHT_SINK_TPIU) += coresight-tpiu.o
  obj-$(CONFIG_CORESIGHT_SINK_ETBV10) += coresight-etb10.o
  obj-$(CONFIG_CORESIGHT_LINKS_AND_SINKS) += coresight-funnel.o \
                                            coresight-replicator.o
-obj-$(CONFIG_CORESIGHT_SOURCE_ETM3X) += coresight-etm3x.o coresight-etm-cp14.o
-obj-$(CONFIG_CORESIGHT_SOURCE_ETM4X) += coresight-etm4x.o
+obj-$(CONFIG_CORESIGHT_SOURCE_ETM3X) += coresight-etm3x.o coresight-etm-cp14.o \
+                                       coresight-etm3x-sysfs.o
+obj-$(CONFIG_CORESIGHT_SOURCE_ETM4X) += coresight-etm4x.o \
+                                       coresight-etm4x-sysfs.o
  obj-$(CONFIG_CORESIGHT_QCOM_REPLICATOR) += coresight-replicator-qcom.o
+obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c

index 77d0f9c1118dfdfcc29a2d0435f3311a12793414..4d20b0be0c0b6a337cdec7cbbfd17f8ce94e2a63 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Embedded Trace Buffer driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -10,8 +12,8 @@
   * GNU General Public License for more details.
   */
  
+#include <asm/local.h>
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
@@ -27,6 +29,11 @@
  #include <linux/coresight.h>
  #include <linux/amba/bus.h>
  #include <linux/clk.h>
+#include <linux/circ_buf.h>
+#include <linux/mm.h>
+#include <linux/perf_event.h>
+
+#include <asm/local.h>
  
  #include "coresight-priv.h"
  
@@ -71,10 +78,10 @@
   * @csdev:     component vitals needed by the framework.
   * @miscdev:   specifics to handle "/dev/xyz.etb" entry.
   * @spinlock:  only one at a time pls.
- * @in_use:    synchronise user space access to etb buffer.
+ * @reading:   synchronise user space access to etb buffer.
+ * @mode:      how this ETB is being used (disabled, sysFS or perf).
   * @buf:       area of memory where ETB buffer content gets sent.
   * @buffer_depth: size of @buf.
- * @enable:    this ETB is being used.
   * @trigger_cntr: amount of words to store after a trigger.
   */
  struct etb_drvdata {
@@ -84,10 +91,10 @@ struct etb_drvdata {
         struct coresight_device *csdev;
         struct miscdevice       miscdev;
         spinlock_t              spinlock;
-       atomic_t                in_use;
+       local_t                 reading;
+       local_t                 mode;
         u8                      *buf;
         u32                     buffer_depth;
-       bool                    enable;
         u32                     trigger_cntr;
  };
  
@@ -132,18 +139,31 @@ static void etb_enable_hw(struct etb_drvdata *drvdata)
         CS_LOCK(drvdata->base);
  }
  
-static int etb_enable(struct coresight_device *csdev)
+static int etb_enable(struct coresight_device *csdev, u32 mode)
  {
-       struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       u32 val;
         unsigned long flags;
+       struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_get_sync(drvdata->dev);
+       val = local_cmpxchg(&drvdata->mode,
+                           CS_MODE_DISABLED, mode);
+       /*
+        * When accessing from Perf, a HW buffer can be handled
+        * by a single trace entity.  In sysFS mode many tracers
+        * can be logging to the same HW buffer.
+        */
+       if (val == CS_MODE_PERF)
+               return -EBUSY;
+
+       /* Nothing to do, the tracer is already enabled. */
+       if (val == CS_MODE_SYSFS)
+               goto out;
  
         spin_lock_irqsave(&drvdata->spinlock, flags);
         etb_enable_hw(drvdata);
-       drvdata->enable = true;
         spin_unlock_irqrestore(&drvdata->spinlock, flags);
  
+out:
         dev_info(drvdata->dev, "ETB enabled\n");
         return 0;
  }
@@ -244,17 +264,226 @@ static void etb_disable(struct coresight_device *csdev)
         spin_lock_irqsave(&drvdata->spinlock, flags);
         etb_disable_hw(drvdata);
         etb_dump_hw(drvdata);
-       drvdata->enable = false;
         spin_unlock_irqrestore(&drvdata->spinlock, flags);
  
-       pm_runtime_put(drvdata->dev);
+       local_set(&drvdata->mode, CS_MODE_DISABLED);
  
         dev_info(drvdata->dev, "ETB disabled\n");
  }
  
+static void *etb_alloc_buffer(struct coresight_device *csdev, int cpu,
+                             void **pages, int nr_pages, bool overwrite)
+{
+       int node;
+       struct cs_buffers *buf;
+
+       if (cpu == -1)
+               cpu = smp_processor_id();
+       node = cpu_to_node(cpu);
+
+       buf = kzalloc_node(sizeof(struct cs_buffers), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->snapshot = overwrite;
+       buf->nr_pages = nr_pages;
+       buf->data_pages = pages;
+
+       return buf;
+}
+
+static void etb_free_buffer(void *config)
+{
+       struct cs_buffers *buf = config;
+
+       kfree(buf);
+}
+
+static int etb_set_buffer(struct coresight_device *csdev,
+                         struct perf_output_handle *handle,
+                         void *sink_config)
+{
+       int ret = 0;
+       unsigned long head;
+       struct cs_buffers *buf = sink_config;
+
+       /* wrap head around to the amount of space we have */
+       head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+
+       /* find the page to write to */
+       buf->cur = head / PAGE_SIZE;
+
+       /* and offset within that page */
+       buf->offset = head % PAGE_SIZE;
+
+       local_set(&buf->data_size, 0);
+
+       return ret;
+}
+
+static unsigned long etb_reset_buffer(struct coresight_device *csdev,
+                                     struct perf_output_handle *handle,
+                                     void *sink_config, bool *lost)
+{
+       unsigned long size = 0;
+       struct cs_buffers *buf = sink_config;
+
+       if (buf) {
+               /*
+                * In snapshot mode ->data_size holds the new address of the
+                * ring buffer's head.  The size itself is the whole address
+                * range since we want the latest information.
+                */
+               if (buf->snapshot)
+                       handle->head = local_xchg(&buf->data_size,
+                                                 buf->nr_pages << PAGE_SHIFT);
+
+               /*
+                * Tell the tracer PMU how much we got in this run and if
+                * something went wrong along the way.  Nobody else can use
+                * this cs_buffers instance until we are done.  As such
+                * resetting parameters here and squaring off with the ring
+                * buffer API in the tracer PMU is fine.
+                */
+               *lost = !!local_xchg(&buf->lost, 0);
+               size = local_xchg(&buf->data_size, 0);
+       }
+
+       return size;
+}
+
+static void etb_update_buffer(struct coresight_device *csdev,
+                             struct perf_output_handle *handle,
+                             void *sink_config)
+{
+       int i, cur;
+       u8 *buf_ptr;
+       u32 read_ptr, write_ptr, capacity;
+       u32 status, read_data, to_read;
+       unsigned long offset;
+       struct cs_buffers *buf = sink_config;
+       struct etb_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       if (!buf)
+               return;
+
+       capacity = drvdata->buffer_depth * ETB_FRAME_SIZE_WORDS;
+
+       CS_UNLOCK(drvdata->base);
+       etb_disable_hw(drvdata);
+
+       /* unit is in words, not bytes */
+       read_ptr = readl_relaxed(drvdata->base + ETB_RAM_READ_POINTER);
+       write_ptr = readl_relaxed(drvdata->base + ETB_RAM_WRITE_POINTER);
+
+       /*
+        * Entries should be aligned to the frame size.  If they are not
+        * go back to the last alignment point to give decoding tools a
+        * chance to fix things.
+        */
+       if (write_ptr % ETB_FRAME_SIZE_WORDS) {
+               dev_err(drvdata->dev,
+                       "write_ptr: %lu not aligned to formatter frame size\n",
+                       (unsigned long)write_ptr);
+
+               write_ptr &= ~(ETB_FRAME_SIZE_WORDS - 1);
+               local_inc(&buf->lost);
+       }
+
+       /*
+        * Get a hold of the status register and see if a wrap around
+        * has occurred.  If so adjust things accordingly.  Otherwise
+        * start at the beginning and go until the write pointer has
+        * been reached.
+        */
+       status = readl_relaxed(drvdata->base + ETB_STATUS_REG);
+       if (status & ETB_STATUS_RAM_FULL) {
+               local_inc(&buf->lost);
+               to_read = capacity;
+               read_ptr = write_ptr;
+       } else {
+               to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->buffer_depth);
+               to_read *= ETB_FRAME_SIZE_WORDS;
+       }
+
+       /*
+        * Make sure we don't overwrite data that hasn't been consumed yet.
+        * It is entirely possible that the HW buffer has more data than the
+        * ring buffer can currently handle.  If so adjust the start address
+        * to take only the last traces.
+        *
+        * In snapshot mode we are looking to get the latest traces only and as
+        * such, we don't care about not overwriting data that hasn't been
+        * processed by user space.
+        */
+       if (!buf->snapshot && to_read > handle->size) {
+               u32 mask = ~(ETB_FRAME_SIZE_WORDS - 1);
+
+               /* The new read pointer must be frame size aligned */
+               to_read = handle->size & mask;
+               /*
+                * Move the RAM read pointer up, keeping in mind that
+                * everything is in frame size units.
+                */
+               read_ptr = (write_ptr + drvdata->buffer_depth) -
+                                       to_read / ETB_FRAME_SIZE_WORDS;
+               /* Wrap around if need be*/
+               if (read_ptr > (drvdata->buffer_depth - 1))
+                       read_ptr -= drvdata->buffer_depth;
+               /* let the decoder know we've skipped ahead */
+               local_inc(&buf->lost);
+       }
+
+       /* finally tell HW where we want to start reading from */
+       writel_relaxed(read_ptr, drvdata->base + ETB_RAM_READ_POINTER);
+
+       cur = buf->cur;
+       offset = buf->offset;
+       for (i = 0; i < to_read; i += 4) {
+               buf_ptr = buf->data_pages[cur] + offset;
+               read_data = readl_relaxed(drvdata->base +
+                                         ETB_RAM_READ_DATA_REG);
+               *buf_ptr++ = read_data >> 0;
+               *buf_ptr++ = read_data >> 8;
+               *buf_ptr++ = read_data >> 16;
+               *buf_ptr++ = read_data >> 24;
+
+               offset += 4;
+               if (offset >= PAGE_SIZE) {
+                       offset = 0;
+                       cur++;
+                       /* wrap around at the end of the buffer */
+                       cur &= buf->nr_pages - 1;
+               }
+       }
+
+       /* reset ETB buffer for next run */
+       writel_relaxed(0x0, drvdata->base + ETB_RAM_READ_POINTER);
+       writel_relaxed(0x0, drvdata->base + ETB_RAM_WRITE_POINTER);
+
+       /*
+        * In snapshot mode all we have to do is communicate to
+        * perf_aux_output_end() the address of the current head.  In full
+        * trace mode the same function expects a size to move rb->aux_head
+        * forward.
+        */
+       if (buf->snapshot)
+               local_set(&buf->data_size, (cur * PAGE_SIZE) + offset);
+       else
+               local_add(to_read, &buf->data_size);
+
+       etb_enable_hw(drvdata);
+       CS_LOCK(drvdata->base);
+}
+
  static const struct coresight_ops_sink etb_sink_ops = {
         .enable         = etb_enable,
         .disable        = etb_disable,
+       .alloc_buffer   = etb_alloc_buffer,
+       .free_buffer    = etb_free_buffer,
+       .set_buffer     = etb_set_buffer,
+       .reset_buffer   = etb_reset_buffer,
+       .update_buffer  = etb_update_buffer,
  };
  
  static const struct coresight_ops etb_cs_ops = {
@@ -266,7 +495,7 @@ static void etb_dump(struct etb_drvdata *drvdata)
         unsigned long flags;
  
         spin_lock_irqsave(&drvdata->spinlock, flags);
-       if (drvdata->enable) {
+       if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
                 etb_disable_hw(drvdata);
                 etb_dump_hw(drvdata);
                 etb_enable_hw(drvdata);
@@ -281,7 +510,7 @@ static int etb_open(struct inode *inode, struct file *file)
         struct etb_drvdata *drvdata = container_of(file->private_data,
                                                    struct etb_drvdata, miscdev);
  
-       if (atomic_cmpxchg(&drvdata->in_use, 0, 1))
+       if (local_cmpxchg(&drvdata->reading, 0, 1))
                 return -EBUSY;
  
         dev_dbg(drvdata->dev, "%s: successfully opened\n", __func__);
@@ -317,7 +546,7 @@ static int etb_release(struct inode *inode, struct file *file)
  {
         struct etb_drvdata *drvdata = container_of(file->private_data,
                                                    struct etb_drvdata, miscdev);
-       atomic_set(&drvdata->in_use, 0);
+       local_set(&drvdata->reading, 0);
  
         dev_dbg(drvdata->dev, "%s: released\n", __func__);
         return 0;
@@ -331,47 +560,29 @@ static const struct file_operations etb_fops = {
         .llseek         = no_llseek,
  };
  
-static ssize_t status_show(struct device *dev,
-                          struct device_attribute *attr, char *buf)
-{
-       unsigned long flags;
-       u32 etb_rdr, etb_sr, etb_rrp, etb_rwp;
-       u32 etb_trg, etb_cr, etb_ffsr, etb_ffcr;
-       struct etb_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       pm_runtime_get_sync(drvdata->dev);
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       CS_UNLOCK(drvdata->base);
-
-       etb_rdr = readl_relaxed(drvdata->base + ETB_RAM_DEPTH_REG);
-       etb_sr = readl_relaxed(drvdata->base + ETB_STATUS_REG);
-       etb_rrp = readl_relaxed(drvdata->base + ETB_RAM_READ_POINTER);
-       etb_rwp = readl_relaxed(drvdata->base + ETB_RAM_WRITE_POINTER);
-       etb_trg = readl_relaxed(drvdata->base + ETB_TRG);
-       etb_cr = readl_relaxed(drvdata->base + ETB_CTL_REG);
-       etb_ffsr = readl_relaxed(drvdata->base + ETB_FFSR);
-       etb_ffcr = readl_relaxed(drvdata->base + ETB_FFCR);
-
-       CS_LOCK(drvdata->base);
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-
-       pm_runtime_put(drvdata->dev);
-
-       return sprintf(buf,
-                      "Depth:\t\t0x%x\n"
-                      "Status:\t\t0x%x\n"
-                      "RAM read ptr:\t0x%x\n"
-                      "RAM wrt ptr:\t0x%x\n"
-                      "Trigger cnt:\t0x%x\n"
-                      "Control:\t0x%x\n"
-                      "Flush status:\t0x%x\n"
-                      "Flush ctrl:\t0x%x\n",
-                      etb_rdr, etb_sr, etb_rrp, etb_rwp,
-                      etb_trg, etb_cr, etb_ffsr, etb_ffcr);
-
-       return -EINVAL;
-}
-static DEVICE_ATTR_RO(status);
+#define coresight_etb10_simple_func(name, offset)                       \
+       coresight_simple_func(struct etb_drvdata, name, offset)
+
+coresight_etb10_simple_func(rdp, ETB_RAM_DEPTH_REG);
+coresight_etb10_simple_func(sts, ETB_STATUS_REG);
+coresight_etb10_simple_func(rrp, ETB_RAM_READ_POINTER);
+coresight_etb10_simple_func(rwp, ETB_RAM_WRITE_POINTER);
+coresight_etb10_simple_func(trg, ETB_TRG);
+coresight_etb10_simple_func(ctl, ETB_CTL_REG);
+coresight_etb10_simple_func(ffsr, ETB_FFSR);
+coresight_etb10_simple_func(ffcr, ETB_FFCR);
+
+static struct attribute *coresight_etb_mgmt_attrs[] = {
+       &dev_attr_rdp.attr,
+       &dev_attr_sts.attr,
+       &dev_attr_rrp.attr,
+       &dev_attr_rwp.attr,
+       &dev_attr_trg.attr,
+       &dev_attr_ctl.attr,
+       &dev_attr_ffsr.attr,
+       &dev_attr_ffcr.attr,
+       NULL,
+};
  
  static ssize_t trigger_cntr_show(struct device *dev,
                             struct device_attribute *attr, char *buf)
@@ -401,10 +612,23 @@ static DEVICE_ATTR_RW(trigger_cntr);
  
  static struct attribute *coresight_etb_attrs[] = {
         &dev_attr_trigger_cntr.attr,
-       &dev_attr_status.attr,
         NULL,
  };
-ATTRIBUTE_GROUPS(coresight_etb);
+
+static const struct attribute_group coresight_etb_group = {
+       .attrs = coresight_etb_attrs,
+};
+
+static const struct attribute_group coresight_etb_mgmt_group = {
+       .attrs = coresight_etb_mgmt_attrs,
+       .name = "mgmt",
+};
+
+const struct attribute_group *coresight_etb_groups[] = {
+       &coresight_etb_group,
+       &coresight_etb_mgmt_group,
+       NULL,
+};
  
  static int etb_probe(struct amba_device *adev, const struct amba_id *id)
  {
@@ -481,7 +705,6 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id)
         if (ret)
                 goto err_misc_register;
  
-       dev_info(dev, "ETB initialized\n");
         return 0;
  
  err_misc_register:
@@ -489,15 +712,6 @@ err_misc_register:
         return ret;
  }
  
-static int etb_remove(struct amba_device *adev)
-{
-       struct etb_drvdata *drvdata = amba_get_drvdata(adev);
-
-       misc_deregister(&drvdata->miscdev);
-       coresight_unregister(drvdata->csdev);
-       return 0;
-}
-
  #ifdef CONFIG_PM
  static int etb_runtime_suspend(struct device *dev)
  {
@@ -537,14 +751,10 @@ static struct amba_driver etb_driver = {
                 .name   = "coresight-etb10",
                 .owner  = THIS_MODULE,
                 .pm     = &etb_dev_pm_ops,
+               .suppress_bind_attrs = true,
  
         },
         .probe          = etb_probe,
-       .remove         = etb_remove,
         .id_table       = etb_ids,
  };
-
-module_amba_driver(etb_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Embedded Trace Buffer driver");
+builtin_amba_driver(etb_driver);
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c

new file mode 100644 (file)

index 0000000..8fbb1dd
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -0,0 +1,510 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/coresight.h>
+#include <linux/coresight-pmu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/parser.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "coresight-priv.h"
+
+static struct pmu etm_pmu;
+static bool etm_perf_up;
+
+/**
+ * struct etm_event_data - Coresight specifics associated to an event
+ * @work:              Work item used to release this structure, and what it
+ *                     references, outside IRQ context (see etm_free_aux()).
+ * @mask:              Hold the CPU(s) this event was set for.
+ * @snk_config:                The sink configuration, as returned by the sink's
+ *                     alloc_buffer() operation.
+ * @path:              An array of paths indexed by CPU number, one slot per
+ *                     CPU.
+ */
+struct etm_event_data {
+       struct work_struct work;
+       cpumask_t mask;
+       void *snk_config;
+       struct list_head **path;
+};
+
+/**
+ * struct perf_pmu_drv_config - Driver specific configuration needed
+ *                             before a session can start.
+ * @sink:              The name of the sink this session should use.  The
+ *                     string is heap allocated and released along with the
+ *                     entry in etm_free_drv_configs().
+ * @entry:             Hook to the event->drv_configs list.
+ */
+struct perf_pmu_drv_config {
+       char *sink;
+       struct list_head entry;
+};
+
+/* Per-CPU AUX output handle used between etm_event_start() and _stop() */
+static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle);
+/* Per-CPU tracer (source) device, set and cleared by etm_perf_symlink() */
+static DEFINE_PER_CPU(struct coresight_device *, csdev_src);
+
+/* ETMv3.5/PTM's ETMCR is 'config' */
+PMU_FORMAT_ATTR(cycacc,                "config:" __stringify(ETM_OPT_CYCACC));
+PMU_FORMAT_ATTR(timestamp,     "config:" __stringify(ETM_OPT_TS));
+
+/* NULL-terminated list of the format attributes declared above */
+static struct attribute *etm_config_formats_attr[] = {
+       &format_attr_cycacc.attr,
+       &format_attr_timestamp.attr,
+       NULL,
+};
+
+/* Groups the format attributes under the "format" sysfs directory */
+static struct attribute_group etm_pmu_format_group = {
+       .name   = "format",
+       .attrs  = etm_config_formats_attr,
+};
+
+/* Attribute groups handed to the PMU in etm_perf_init() */
+static const struct attribute_group *etm_pmu_attr_groups[] = {
+       &etm_pmu_format_group,
+       NULL,
+};
+
+/* PMU read callback (see etm_perf_init()); intentionally does nothing. */
+static void etm_event_read(struct perf_event *event) {}
+
+/*
+ * PMU event_init callback: accept the event only when it was created for
+ * the ETM PMU.  Returns 0 on a type match and -ENOENT otherwise.
+ */
+static int etm_event_init(struct perf_event *event)
+{
+       return event->attr.type == etm_pmu.type ? 0 : -ENOENT;
+}
+
+/*
+ * Work handler releasing everything attached to a trace session: the sink
+ * buffer configuration, the per-CPU paths, the path array and finally the
+ * session data itself.
+ */
+static void free_event_data(struct work_struct *work)
+{
+       struct etm_event_data *session =
+               container_of(work, struct etm_event_data, work);
+       cpumask_t *cpus = &session->mask;
+       int cpu;
+
+       /*
+        * The sink configuration goes first.  See comment in
+        * etm_setup_aux() about why we take the first available path.
+        */
+       if (session->snk_config) {
+               struct coresight_device *sink;
+
+               cpu = cpumask_first(cpus);
+               sink = coresight_get_sink(session->path[cpu]);
+               if (sink_ops(sink)->free_buffer)
+                       sink_ops(sink)->free_buffer(session->snk_config);
+       }
+
+       /* Then every path that was successfully built */
+       for_each_cpu(cpu, cpus) {
+               struct list_head *path = session->path[cpu];
+
+               if (!path)
+                       continue;
+
+               coresight_release_path(path);
+       }
+
+       kfree(session->path);
+       kfree(session);
+}
+
+/*
+ * alloc_event_data - allocate the bookkeeping for a trace session
+ * @cpu:       the CPU the event is bound to, or -1 to cover every CPU that
+ *             is online at the time of the call.
+ *
+ * Returns a zeroed struct etm_event_data with @mask populated and an empty
+ * @path array, or NULL if memory could not be allocated.
+ */
+static void *alloc_event_data(int cpu)
+{
+       cpumask_t *mask;
+       struct etm_event_data *event_data;
+
+       /* First get memory for the session's data */
+       event_data = kzalloc(sizeof(struct etm_event_data), GFP_KERNEL);
+       if (!event_data)
+               return NULL;
+
+       /* Make sure nothing disappears under us */
+       get_online_cpus();
+
+       mask = &event_data->mask;
+       if (cpu != -1)
+               cpumask_set_cpu(cpu, mask);
+       else
+               cpumask_copy(mask, cpu_online_mask);
+       put_online_cpus();
+
+       /*
+        * Each CPU has a single path between source and destination.  As such
+        * allocate an array using CPU numbers as indexes.  That way a path
+        * for any CPU can easily be accessed at any given time.  We proceed
+        * the same way for sessions involving a single CPU.  The cost of
+        * unused memory when dealing with single CPU trace scenarios is small
+        * compared to the cost of searching through an optimized array.
+        *
+        * Because slots are indexed by CPU number the array must span every
+        * possible CPU id, not just the count of online CPUs: with a sparse
+        * online mask (say CPUs 0 and 3) an array of num_online_cpus()
+        * entries would be overrun when slot 3 is accessed.
+        */
+       event_data->path = kcalloc(nr_cpu_ids,
+                                  sizeof(struct list_head *), GFP_KERNEL);
+       if (!event_data->path) {
+               kfree(event_data);
+               return NULL;
+       }
+
+       return event_data;
+}
+
+/*
+ * PMU free_aux callback.  The actual release is deferred to a work item
+ * (free_event_data()) rather than being done in the caller's context.
+ */
+static void etm_free_aux(void *data)
+{
+       struct work_struct *cleanup = &((struct etm_event_data *)data)->work;
+
+       schedule_work(cleanup);
+}
+
+/*
+ * etm_setup_aux - PMU setup_aux callback
+ *
+ * Allocates the session data, builds (without enabling) a path from the
+ * tracer to the sink for every CPU in the session and asks the sink to
+ * set up its buffer configuration from the AUX pages.
+ *
+ * Returns the session data on success or NULL on any failure, in which case
+ * whatever was acquired is handed to etm_free_aux() for release.
+ */
+static void *etm_setup_aux(struct perf_event *event, void **pages,
+                          int nr_pages, bool overwrite)
+{
+       int cpu;
+       char *sink_def = NULL;
+       cpumask_t *mask;
+       struct coresight_device *sink;
+       struct etm_event_data *event_data = NULL;
+       struct perf_pmu_drv_config *drv_config;
+
+       /*
+        * Search the driver configurables looking for a sink.  If more than
+        * one sink was specified the last one is taken: entries are pushed
+        * at the head of the list by etm_get_drv_configs(), hence the first
+        * match found here is the most recently added one.
+        */
+       list_for_each_entry(drv_config, &event->drv_configs, entry) {
+               if (drv_config && drv_config->sink) {
+                       sink_def = drv_config->sink;
+                       break;
+               }
+       }
+
+       event_data = alloc_event_data(event->cpu);
+       if (!event_data)
+               return NULL;
+
+       /* Must be initialised before any 'goto err', which schedules it */
+       INIT_WORK(&event_data->work, free_event_data);
+
+       mask = &event_data->mask;
+
+       /* Setup the path for each CPU in a trace session */
+       for_each_cpu(cpu, mask) {
+               struct coresight_device *csdev;
+
+               /* No tracer registered for this CPU, the session can't work */
+               csdev = per_cpu(csdev_src, cpu);
+               if (!csdev)
+                       goto err;
+
+               /*
+                * Building a path doesn't enable it, it simply builds a
+                * list of devices from source to sink that can be
+                * referenced later when the path is actually needed.
+                */
+               event_data->path[cpu] = coresight_build_path(csdev, sink_def);
+               if (!event_data->path[cpu])
+                       goto err;
+       }
+
+       /*
+        * In theory nothing prevents tracers in a trace session from being
+        * associated with different sinks, nor having a sink per tracer.  But
+        * until we have HW with this kind of topology and a way to convey
+        * sink assignment from the perf cmd line we need to assume tracers
+        * in a trace session are using the same sink.  Therefore pick the sink
+        * found at the end of the first available path.
+        */
+       cpu = cpumask_first(mask);
+       /* Grab the sink at the end of the path */
+       sink = coresight_get_sink(event_data->path[cpu]);
+       if (!sink)
+               goto err;
+
+       /* A sink that can't allocate a buffer is of no use here */
+       if (!sink_ops(sink)->alloc_buffer)
+               goto err;
+
+       /* Get the AUX specific data from the sink buffer */
+       event_data->snk_config =
+                       sink_ops(sink)->alloc_buffer(sink, cpu, pages,
+                                                    nr_pages, overwrite);
+       if (!event_data->snk_config)
+               goto err;
+
+out:
+       return event_data;
+
+err:
+       /* Release is deferred to free_event_data() through the work item */
+       etm_free_aux(event_data);
+       event_data = NULL;
+       goto out;
+}
+
+/*
+ * etm_event_start - PMU start callback
+ * @event:     the perf event being started on the current CPU.
+ * @flags:     unused.
+ *
+ * Opens the AUX output, configures the sink, enables the path and finally
+ * the tracer for the current CPU.  On failure every step already taken is
+ * undone and event->hw.state is set to PERF_HES_STOPPED so that callers
+ * (see etm_event_add()) can detect the error.
+ */
+static void etm_event_start(struct perf_event *event, int flags)
+{
+       int cpu = smp_processor_id();
+       struct etm_event_data *event_data;
+       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
+       struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
+
+       if (!csdev)
+               goto fail;
+
+       /*
+        * Deal with the ring buffer API and get a handle on the
+        * session's information.
+        */
+       event_data = perf_aux_output_begin(handle, event);
+       if (!event_data)
+               goto fail;
+
+       /* We need a sink, no need to continue without one */
+       sink = coresight_get_sink(event_data->path[cpu]);
+       if (WARN_ON_ONCE(!sink || !sink_ops(sink)->set_buffer))
+               goto fail_end_stop;
+
+       /* Configure the sink */
+       if (sink_ops(sink)->set_buffer(sink, handle,
+                                      event_data->snk_config))
+               goto fail_end_stop;
+
+       /* Nothing will happen without a path */
+       if (coresight_enable_path(event_data->path[cpu], CS_MODE_PERF))
+               goto fail_end_stop;
+
+       /* Tell the perf core the event is alive */
+       event->hw.state = 0;
+
+       /* Finally enable the tracer */
+       if (source_ops(csdev)->enable(csdev, &event->attr, CS_MODE_PERF))
+               goto fail_disable_path;
+
+out:
+       return;
+
+fail_disable_path:
+       /*
+        * The path was enabled above.  Disable it again, otherwise its
+        * elements stay claimed and unavailable to other sessions.
+        */
+       coresight_disable_path(event_data->path[cpu]);
+fail_end_stop:
+       perf_aux_output_end(handle, 0, true);
+fail:
+       event->hw.state = PERF_HES_STOPPED;
+       goto out;
+}
+
+/*
+ * etm_event_stop - PMU stop callback
+ * @event:     the perf event being stopped on the current CPU.
+ * @mode:      when PERF_EF_UPDATE is set the sink is asked to update and
+ *             reset its buffer and the AUX output is closed.
+ *
+ * Disables the tracer, marks the event as stopped and disables the path.
+ * Does nothing if the event is already stopped.
+ */
+static void etm_event_stop(struct perf_event *event, int mode)
+{
+       bool lost;
+       int cpu = smp_processor_id();
+       unsigned long size;
+       struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
+       struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle);
+       struct etm_event_data *event_data = perf_get_aux(handle);
+
+       if (event->hw.state == PERF_HES_STOPPED)
+               return;
+
+       if (!csdev)
+               return;
+
+       /*
+        * NOTE(review): event_data is dereferenced without a NULL check;
+        * presumably perf_get_aux() can't return NULL for an event that
+        * isn't in the stopped state - confirm.
+        */
+       sink = coresight_get_sink(event_data->path[cpu]);
+       if (!sink)
+               return;
+
+       /* stop tracer */
+       source_ops(csdev)->disable(csdev);
+
+       /* tell the core */
+       event->hw.state = PERF_HES_STOPPED;
+
+       if (mode & PERF_EF_UPDATE) {
+               /*
+                * NOTE(review): the early returns in this branch leave
+                * without calling perf_aux_output_end() nor
+                * coresight_disable_path() - confirm this is intended.
+                */
+               if (WARN_ON_ONCE(handle->event != event))
+                       return;
+
+               /* update trace information */
+               if (!sink_ops(sink)->update_buffer)
+                       return;
+
+               sink_ops(sink)->update_buffer(sink, handle,
+                                             event_data->snk_config);
+
+               if (!sink_ops(sink)->reset_buffer)
+                       return;
+
+               /* The sink reports the size to commit and whether data was lost */
+               size = sink_ops(sink)->reset_buffer(sink, handle,
+                                                   event_data->snk_config,
+                                                   &lost);
+
+               perf_aux_output_end(handle, size, lost);
+       }
+
+       /* Disabling the path make its elements available to other sessions */
+       coresight_disable_path(event_data->path[cpu]);
+}
+
+/*
+ * PMU add callback.  Without PERF_EF_START the event is simply marked as
+ * stopped.  Otherwise the event is started and -EINVAL is returned if it
+ * ends up stopped, i.e. if etm_event_start() failed.
+ */
+static int etm_event_add(struct perf_event *event, int mode)
+{
+       struct hw_perf_event *hw = &event->hw;
+
+       if (!(mode & PERF_EF_START)) {
+               hw->state = PERF_HES_STOPPED;
+               return 0;
+       }
+
+       etm_event_start(event, 0);
+
+       return (hw->state & PERF_HES_STOPPED) ? -EINVAL : 0;
+}
+
+/*
+ * PMU del callback: stop the event, always requesting a buffer update
+ * (PERF_EF_UPDATE) regardless of @mode.
+ */
+static void etm_event_del(struct perf_event *event, int mode)
+{
+       etm_event_stop(event, PERF_EF_UPDATE);
+}
+
+/* Tokens identifying the driver configuration formats understood below */
+enum {
+       ETM_TOKEN_SINK_CPU,
+       ETM_TOKEN_SINK,
+       ETM_TOKEN_ERR,
+};
+
+/*
+ * Accepted driver configuration strings, used by etm_get_drv_configs():
+ *   "sink=cpu<N>:<name>"      a sink for a specific CPU
+ *   "sink=<name>"             a sink without a CPU
+ * The more specific pattern is listed first; presumably so that it is tried
+ * before the generic one - confirm against match_token().
+ */
+static const match_table_t drv_cfg_tokens = {
+       {ETM_TOKEN_SINK_CPU, "sink=cpu%d:%s"},
+       {ETM_TOKEN_SINK, "sink=%s"},
+       {ETM_TOKEN_ERR, NULL},
+};
+
+/*
+ * etm_get_drv_configs - PMU get_drv_configs callback
+ * @event:     the perf event the configuration is meant for.
+ * @arg:       user space string, see @drv_cfg_tokens for the valid formats.
+ *
+ * Parses the sink selection coming from user space and, when it applies to
+ * @event's CPU, queues it on event->drv_configs.
+ *
+ * Returns 0 on success, including when the configuration targets another
+ * CPU and is therefore ignored, -EINVAL if the string can't be parsed,
+ * -ENOMEM on allocation failure, or the error reported by strndup_user().
+ */
+static int etm_get_drv_configs(struct perf_event *event, void __user *arg)
+{
+       char *config, *sink = NULL;
+       int cpu = -1, token, ret = 0;
+       substring_t args[MAX_OPT_ARGS];
+       struct perf_pmu_drv_config *drv_config;
+
+       /* Make user supplied input usable */
+       config = strndup_user(arg, PAGE_SIZE);
+       if (IS_ERR(config))
+               return PTR_ERR(config);
+
+       /* See above declared @drv_cfg_tokens for the usable formats */
+       token = match_token(config, drv_cfg_tokens, args);
+       switch (token) {
+       case ETM_TOKEN_SINK:
+               /* Just a sink has been specified */
+               sink = match_strdup(&args[0]);
+               break;
+       case ETM_TOKEN_SINK_CPU:
+               /* We have a sink and a CPU */
+               if (match_int(&args[0], &cpu)) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+               sink = match_strdup(&args[1]);
+               break;
+       default:
+               ret = -EINVAL;
+               goto err;
+       }
+
+       /* match_strdup() reports failure with NULL, not with an ERR_PTR() */
+       if (!sink) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       /* If the CPUs don't match the sink is destined to another path */
+       if (event->cpu != cpu)
+               goto err;
+
+       /*
+        * We have a valid configuration, allocate memory and add to the list
+        * of driver configurables.  kzalloc() also reports failure with NULL.
+        */
+       drv_config = kzalloc(sizeof(*drv_config), GFP_KERNEL);
+       if (!drv_config) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       /* Ownership of @sink moves to the list entry */
+       drv_config->sink = sink;
+       list_add(&drv_config->entry, &event->drv_configs);
+
+out:
+       kfree(config);
+       return ret;
+
+err:
+       kfree(sink);
+       goto out;
+}
+
+/*
+ * PMU free_drv_configs callback: unlink and free every driver configuration
+ * queued on the event, along with the sink name each one carries.
+ */
+static void etm_free_drv_configs(struct perf_event *event)
+{
+       struct perf_pmu_drv_config *cfg, *next;
+
+       /* The _safe variant is needed since entries are freed while walking */
+       list_for_each_entry_safe(cfg, next, &event->drv_configs, entry) {
+               list_del(&cfg->entry);
+               kfree(cfg->sink);
+               kfree(cfg);
+       }
+}
+
+/*
+ * etm_perf_symlink - link or unlink a tracer under the ETM PMU device
+ * @csdev:     the tracer (source) device.
+ * @link:      true to create the "cpu<N>" symlink and record the tracer for
+ *             its CPU, false to remove the link and forget the tracer.
+ *
+ * Returns 0 on success, -EPROBE_DEFER if the PMU hasn't been registered yet
+ * or the error returned by sysfs_create_link().
+ */
+int etm_perf_symlink(struct coresight_device *csdev, bool link)
+{
+       char entry[sizeof("cpu9999999")];
+       int ret = 0, cpu = source_ops(csdev)->cpu_id(csdev);
+       struct device *pmu_dev = etm_pmu.dev;
+       struct device *cs_dev = &csdev->dev;
+
+       if (!etm_perf_up)
+               return -EPROBE_DEFER;
+
+       /*
+        * Bounded on purpose: @cpu comes from the source driver and @entry
+        * only has room for 7 digits, so never write past it.
+        */
+       snprintf(entry, sizeof(entry), "cpu%d", cpu);
+
+       if (link) {
+               ret = sysfs_create_link(&pmu_dev->kobj, &cs_dev->kobj, entry);
+               if (ret)
+                       return ret;
+               per_cpu(csdev_src, cpu) = csdev;
+       } else {
+               sysfs_remove_link(&pmu_dev->kobj, entry);
+               per_cpu(csdev_src, cpu) = NULL;
+       }
+
+       return 0;
+}
+
+/*
+ * Fill in the ETM PMU description and register it with the perf core.
+ * etm_perf_up is only raised once registration succeeded, which is what
+ * etm_perf_symlink() checks before touching the PMU device.
+ */
+static int __init etm_perf_init(void)
+{
+       int err;
+
+       /* General properties */
+       etm_pmu.capabilities    = PERF_PMU_CAP_EXCLUSIVE;
+       etm_pmu.task_ctx_nr     = perf_sw_context;
+       etm_pmu.attr_groups     = etm_pmu_attr_groups;
+
+       /* Event life cycle */
+       etm_pmu.event_init      = etm_event_init;
+       etm_pmu.add             = etm_event_add;
+       etm_pmu.del             = etm_event_del;
+       etm_pmu.start           = etm_event_start;
+       etm_pmu.stop            = etm_event_stop;
+       etm_pmu.read            = etm_event_read;
+
+       /* AUX buffer and driver configuration management */
+       etm_pmu.setup_aux       = etm_setup_aux;
+       etm_pmu.free_aux        = etm_free_aux;
+       etm_pmu.get_drv_configs = etm_get_drv_configs;
+       etm_pmu.free_drv_configs
+                               = etm_free_drv_configs;
+
+       err = perf_pmu_register(&etm_pmu, CORESIGHT_ETM_PMU_NAME, -1);
+       if (err)
+               return err;
+
+       etm_perf_up = true;
+       return 0;
+}
+device_initcall(etm_perf_init);
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.h b/drivers/hwtracing/coresight/coresight-etm-perf.h

new file mode 100644 (file)

index 0000000..87f5a13
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _CORESIGHT_ETM_PERF_H
+#define _CORESIGHT_ETM_PERF_H
+
+struct coresight_device;
+
+#ifdef CONFIG_CORESIGHT
+/* Create (@link true) or remove the tracer's link under the ETM PMU device */
+int etm_perf_symlink(struct coresight_device *csdev, bool link);
+
+#else
+/* Stub used when CONFIG_CORESIGHT is not set: always fails with -EINVAL */
+static inline int etm_perf_symlink(struct coresight_device *csdev, bool link)
+{ return -EINVAL; }
+
+#endif /* CONFIG_CORESIGHT */
+
+#endif
diff --git a/drivers/hwtracing/coresight/coresight-etm.h b/drivers/hwtracing/coresight/coresight-etm.h

index b4481eb29304a1ddf4a2b5a86d9244daa3c05f14..51597cb2c08af69293c0b17d8698019fc8af949e 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-etm.h
+++ b/drivers/hwtracing/coresight/coresight-etm.h
@@ -13,6 +13,7 @@
  #ifndef _CORESIGHT_CORESIGHT_ETM_H
  #define _CORESIGHT_CORESIGHT_ETM_H
  
+#include <asm/local.h>
  #include <linux/spinlock.h>
  #include "coresight-priv.h"
  
@@ -109,7 +110,10 @@
  #define ETM_MODE_STALL         BIT(2)
  #define ETM_MODE_TIMESTAMP     BIT(3)
  #define ETM_MODE_CTXID         BIT(4)
-#define ETM_MODE_ALL           0x1f
+#define ETM_MODE_ALL           (ETM_MODE_EXCLUDE | ETM_MODE_CYCACC | \
+                                ETM_MODE_STALL | ETM_MODE_TIMESTAMP | \
+                                ETM_MODE_CTXID | ETM_MODE_EXCL_KERN | \
+                                ETM_MODE_EXCL_USER)
  
  #define ETM_SQR_MASK           0x3
  #define ETM_TRACEID_MASK       0x3f
@@ -136,35 +140,16 @@
  #define ETM_DEFAULT_EVENT_VAL  (ETM_HARD_WIRE_RES_A    |       \
                                  ETM_ADD_COMP_0         |       \
                                  ETM_EVENT_NOT_A)
+
  /**
- * struct etm_drvdata - specifics associated to an ETM component
- * @base:      memory mapped base address for this component.
- * @dev:       the device entity associated to this component.
- * @atclk:     optional clock for the core parts of the ETM.
- * @csdev:     component vitals needed by the framework.
- * @spinlock:  only one at a time pls.
- * @cpu:       the cpu this component is affined to.
- * @port_size: port size as reported by ETMCR bit 4-6 and 21.
- * @arch:      ETM/PTM version number.
- * @use_cpu14: true if management registers need to be accessed via CP14.
- * @enable:    is this ETM/PTM currently tracing.
- * @sticky_enable: true if ETM base configuration has been done.
- * @boot_enable:true if we should start tracing at boot time.
- * @os_unlock: true if access to management registers is allowed.
- * @nr_addr_cmp:Number of pairs of address comparators as found in ETMCCR.
- * @nr_cntr:   Number of counters as found in ETMCCR bit 13-15.
- * @nr_ext_inp:        Number of external input as found in ETMCCR bit 17-19.
- * @nr_ext_out:        Number of external output as found in ETMCCR bit 20-22.
- * @nr_ctxid_cmp: Number of contextID comparators as found in ETMCCR bit 24-25.
- * @etmccr:    value of register ETMCCR.
- * @etmccer:   value of register ETMCCER.
- * @traceid:   value of the current ID for this component.
+ * struct etm_config - configuration information related to an ETM
   * @mode:      controls various modes supported by this ETM/PTM.
   * @ctrl:      used in conjunction with @mode.
   * @trigger_event: setting for register ETMTRIGGER.
   * @startstop_ctrl: setting for register ETMTSSCR.
   * @enable_event: setting for register ETMTEEVR.
   * @enable_ctrl1: setting for register ETMTECR1.
+ * @enable_ctrl2: setting for register ETMTECR2.
   * @fifofull_level: setting for register ETMFFLR.
   * @addr_idx:  index for the address comparator selection.
   * @addr_val:  value for address comparator register.
@@ -189,36 +174,16 @@
   * @ctxid_mask: mask applicable to all the context IDs.
   * @sync_freq: Synchronisation frequency.
   * @timestamp_event: Defines an event that requests the insertion
-                    of a timestamp into the trace stream.
+ *                  of a timestamp into the trace stream.
   */
-struct etm_drvdata {
-       void __iomem                    *base;
-       struct device                   *dev;
-       struct clk                      *atclk;
-       struct coresight_device         *csdev;
-       spinlock_t                      spinlock;
-       int                             cpu;
-       int                             port_size;
-       u8                              arch;
-       bool                            use_cp14;
-       bool                            enable;
-       bool                            sticky_enable;
-       bool                            boot_enable;
-       bool                            os_unlock;
-       u8                              nr_addr_cmp;
-       u8                              nr_cntr;
-       u8                              nr_ext_inp;
-       u8                              nr_ext_out;
-       u8                              nr_ctxid_cmp;
-       u32                             etmccr;
-       u32                             etmccer;
-       u32                             traceid;
+struct etm_config {
         u32                             mode;
         u32                             ctrl;
         u32                             trigger_event;
         u32                             startstop_ctrl;
         u32                             enable_event;
         u32                             enable_ctrl1;
+       u32                             enable_ctrl2;
         u32                             fifofull_level;
         u8                              addr_idx;
         u32                             addr_val[ETM_MAX_ADDR_CMP];
@@ -244,6 +209,56 @@ struct etm_drvdata {
         u32                             timestamp_event;
  };
  
+/**
+ * struct etm_drvdata - specifics associated to an ETM component
+ * @base:      memory mapped base address for this component.
+ * @dev:       the device entity associated to this component.
+ * @atclk:     optional clock for the core parts of the ETM.
+ * @csdev:     component vitals needed by the framework.
+ * @spinlock:  only one at a time pls.
+ * @cpu:       the cpu this component is affined to.
+ * @port_size: port size as reported by ETMCR bit 4-6 and 21.
+ * @arch:      ETM/PTM version number.
+ * @use_cp14:  true if management registers need to be accessed via CP14.
+ * @mode:      this tracer's mode, i.e sysFS, Perf or disabled.
+ * @sticky_enable: true if ETM base configuration has been done.
+ * @boot_enable:true if we should start tracing at boot time.
+ * @os_unlock: true if access to management registers is allowed.
+ * @nr_addr_cmp:Number of pairs of address comparators as found in ETMCCR.
+ * @nr_cntr:   Number of counters as found in ETMCCR bit 13-15.
+ * @nr_ext_inp:        Number of external input as found in ETMCCR bit 17-19.
+ * @nr_ext_out:        Number of external output as found in ETMCCR bit 20-22.
+ * @nr_ctxid_cmp: Number of contextID comparators as found in ETMCCR bit 24-25.
+ * @etmccr:    value of register ETMCCR.
+ * @etmccer:   value of register ETMCCER.
+ * @traceid:   value of the current ID for this component.
+ * @config:    structure holding configuration parameters.
+ */
+struct etm_drvdata {
+       void __iomem                    *base;
+       struct device                   *dev;
+       struct clk                      *atclk;
+       struct coresight_device         *csdev;
+       spinlock_t                      spinlock;
+       int                             cpu;
+       int                             port_size;
+       u8                              arch;
+       bool                            use_cp14;
+       local_t                         mode;
+       bool                            sticky_enable;
+       bool                            boot_enable;
+       bool                            os_unlock;
+       u8                              nr_addr_cmp;
+       u8                              nr_cntr;
+       u8                              nr_ext_inp;
+       u8                              nr_ext_out;
+       u8                              nr_ctxid_cmp;
+       u32                             etmccr;
+       u32                             etmccer;
+       u32                             traceid;
+       struct etm_config               config;
+};
+
  enum etm_addr_type {
         ETM_ADDR_TYPE_NONE,
         ETM_ADDR_TYPE_SINGLE,
@@ -251,4 +266,39 @@ enum etm_addr_type {
         ETM_ADDR_TYPE_START,
         ETM_ADDR_TYPE_STOP,
  };
+
+/*
+ * Write @val to the ETM register at offset @off, through memory mapped I/O
+ * or through CP14 when drvdata->use_cp14 is set.  A failed CP14 access is
+ * logged and otherwise ignored.
+ */
+static inline void etm_writel(struct etm_drvdata *drvdata,
+                             u32 val, u32 off)
+{
+       if (!drvdata->use_cp14) {
+               writel_relaxed(val, drvdata->base + off);
+               return;
+       }
+
+       if (etm_writel_cp14(off, val))
+               dev_err(drvdata->dev,
+                       "invalid CP14 access to ETM reg: %#x", off);
+}
+
+/*
+ * Read the ETM register at offset @off, through memory mapped I/O or through
+ * CP14 when drvdata->use_cp14 is set.
+ *
+ * Returns the register value.  A failed CP14 access is logged and 0 is
+ * returned rather than an indeterminate value.
+ */
+static inline unsigned int etm_readl(struct etm_drvdata *drvdata, u32 off)
+{
+       /* Well defined result should the CP14 access below fail */
+       u32 val = 0;
+
+       if (drvdata->use_cp14) {
+               if (etm_readl_cp14(off, &val)) {
+                       dev_err(drvdata->dev,
+                               "invalid CP14 access to ETM reg: %#x", off);
+               }
+       } else {
+               val = readl_relaxed(drvdata->base + off);
+       }
+
+       return val;
+}
+
+extern const struct attribute_group *coresight_etm_groups[];
+int etm_get_trace_id(struct etm_drvdata *drvdata);
+void etm_set_default(struct etm_config *config);
+void etm_config_trace_mode(struct etm_config *config);
+struct etm_config *get_etm_config(struct etm_drvdata *drvdata);
  #endif
diff --git a/drivers/hwtracing/coresight/coresight-etm3x-sysfs.c b/drivers/hwtracing/coresight/coresight-etm3x-sysfs.c

new file mode 100644 (file)

index 0000000..02d4b62
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-etm3x-sysfs.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/pm_runtime.h>
+#include <linux/sysfs.h>
+#include "coresight-etm.h"
+
+/* sysfs read: print drvdata->nr_addr_cmp in hexadecimal ("%#lx"). */
+static ssize_t nr_addr_cmp_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n", (unsigned long)drvdata->nr_addr_cmp);
+}
+static DEVICE_ATTR_RO(nr_addr_cmp);
+
+/* sysfs read: print drvdata->nr_cntr in hexadecimal ("%#lx"). */
+static ssize_t nr_cntr_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n", (unsigned long)drvdata->nr_cntr);
+}
+static DEVICE_ATTR_RO(nr_cntr);
+
+/* sysfs read: print drvdata->nr_ctxid_cmp in hexadecimal ("%#lx"). */
+static ssize_t nr_ctxid_cmp_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n", (unsigned long)drvdata->nr_ctxid_cmp);
+}
+static DEVICE_ATTR_RO(nr_ctxid_cmp);
+
+/*
+ * sysfs read: sample the ETMSR register and print it in hexadecimal.
+ *
+ * The read is done with a runtime-PM reference held and under the driver
+ * spinlock with interrupts saved, bracketed by CS_UNLOCK()/CS_LOCK().
+ */
+static ssize_t etmsr_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long irqflags;
+       unsigned long status;
+
+       pm_runtime_get_sync(drvdata->dev);
+       spin_lock_irqsave(&drvdata->spinlock, irqflags);
+
+       CS_UNLOCK(drvdata->base);
+       status = etm_readl(drvdata, ETMSR);
+       CS_LOCK(drvdata->base);
+
+       spin_unlock_irqrestore(&drvdata->spinlock, irqflags);
+       pm_runtime_put(drvdata->dev);
+
+       return sprintf(buf, "%#lx\n", status);
+}
+static DEVICE_ATTR_RO(etmsr);
+
+/*
+ * sysfs write: a non-zero hexadecimal value resets the trace configuration.
+ *
+ * Under the spinlock the whole etm_config is zeroed, the mode is set to
+ * ETM_MODE_EXCLUDE, the trigger event to ETM_DEFAULT_EVENT_VAL, every address
+ * comparator type to ETM_ADDR_TYPE_NONE, and etm_set_default() is applied.
+ * Writing zero is accepted and changes nothing.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t reset_store(struct device *dev,
+                          struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long request;
+       int cmp, err;
+
+       err = kstrtoul(buf, 16, &request);
+       if (err)
+               return err;
+
+       if (!request)
+               return size;
+
+       spin_lock(&drvdata->spinlock);
+
+       memset(config, 0, sizeof(*config));
+       config->mode = ETM_MODE_EXCLUDE;
+       config->trigger_event = ETM_DEFAULT_EVENT_VAL;
+       for (cmp = 0; cmp < drvdata->nr_addr_cmp; cmp++)
+               config->addr_type[cmp] = ETM_ADDR_TYPE_NONE;
+       etm_set_default(config);
+
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_WO(reset);
+
+/* sysfs read: print the configured mode flags (config->mode) in hex. */
+static ssize_t mode_show(struct device *dev,
+                        struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n", (unsigned long)drvdata->config.mode);
+}
+
+/*
+ * sysfs write: set the trace mode flags from a hexadecimal value.
+ *
+ * The value is masked with ETM_MODE_ALL and each flag is mirrored into the
+ * matching bit of config->ctrl / config->enable_ctrl1.  ETM_MODE_STALL needs
+ * ETMCCR_FIFOFULL in drvdata->etmccr and ETM_MODE_TIMESTAMP needs
+ * ETMCCER_TIMESTAMP in drvdata->etmccer.
+ *
+ * Both capability checks run before anything is modified, so a rejected
+ * request leaves the configuration exactly as it was.  Previously the new
+ * mode and some control bits were already committed when -EINVAL was
+ * returned.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when an unsupported option is requested.
+ */
+static ssize_t mode_store(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf, size_t size)
+{
+       int ret;
+       unsigned long val, mode;
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret)
+               return ret;
+
+       mode = val & ETM_MODE_ALL;
+
+       spin_lock(&drvdata->spinlock);
+
+       /* Validate first: nothing below may fail once we start updating. */
+       if ((mode & ETM_MODE_STALL) &&
+           !(drvdata->etmccr & ETMCCR_FIFOFULL)) {
+               dev_warn(drvdata->dev, "stall mode not supported\n");
+               ret = -EINVAL;
+               goto err_unlock;
+       }
+
+       if ((mode & ETM_MODE_TIMESTAMP) &&
+           !(drvdata->etmccer & ETMCCER_TIMESTAMP)) {
+               dev_warn(drvdata->dev, "timestamp not supported\n");
+               ret = -EINVAL;
+               goto err_unlock;
+       }
+
+       config->mode = mode;
+
+       if (mode & ETM_MODE_EXCLUDE)
+               config->enable_ctrl1 |= ETMTECR1_INC_EXC;
+       else
+               config->enable_ctrl1 &= ~ETMTECR1_INC_EXC;
+
+       if (mode & ETM_MODE_CYCACC)
+               config->ctrl |= ETMCR_CYC_ACC;
+       else
+               config->ctrl &= ~ETMCR_CYC_ACC;
+
+       if (mode & ETM_MODE_STALL)
+               config->ctrl |= ETMCR_STALL_MODE;
+       else
+               config->ctrl &= ~ETMCR_STALL_MODE;
+
+       if (mode & ETM_MODE_TIMESTAMP)
+               config->ctrl |= ETMCR_TIMESTAMP_EN;
+       else
+               config->ctrl &= ~ETMCR_TIMESTAMP_EN;
+
+       if (mode & ETM_MODE_CTXID)
+               config->ctrl |= ETMCR_CTXID_SIZE;
+       else
+               config->ctrl &= ~ETMCR_CTXID_SIZE;
+
+       /* config->mode is already updated for etm_config_trace_mode(). */
+       if (mode & (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER))
+               etm_config_trace_mode(config);
+
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+
+err_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return ret;
+}
+static DEVICE_ATTR_RW(mode);
+
+/* sysfs read: print config->trigger_event in hexadecimal. */
+static ssize_t trigger_event_show(struct device *dev,
+                                 struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.trigger_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->trigger_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t trigger_event_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.trigger_event = event & ETM_EVENT_MASK;
+
+       return size;
+}
+static DEVICE_ATTR_RW(trigger_event);
+
+/* sysfs read: print config->enable_event in hexadecimal. */
+static ssize_t enable_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.enable_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->enable_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t enable_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.enable_event = event & ETM_EVENT_MASK;
+
+       return size;
+}
+static DEVICE_ATTR_RW(enable_event);
+
+/* sysfs read: print config->fifofull_level in hexadecimal. */
+static ssize_t fifofull_level_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.fifofull_level);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it unmodified as
+ * config->fifofull_level.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t fifofull_level_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long level;
+       int err;
+
+       err = kstrtoul(buf, 16, &level);
+       if (err)
+               return err;
+
+       drvdata->config.fifofull_level = level;
+
+       return size;
+}
+static DEVICE_ATTR_RW(fifofull_level);
+
+/* sysfs read: print the selected address comparator index in hex. */
+static ssize_t addr_idx_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.addr_idx);
+}
+
+/*
+ * sysfs write: select the address comparator that the other addr_*
+ * attributes operate on.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when the index is not below drvdata->nr_addr_cmp.
+ */
+static ssize_t addr_idx_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long index;
+       int err;
+
+       err = kstrtoul(buf, 16, &index);
+       if (err)
+               return err;
+
+       if (index >= drvdata->nr_addr_cmp)
+               return -EINVAL;
+
+       /*
+        * Other handlers read the index several times inside one locked
+        * section; taking the same lock here keeps it stable for them.
+        */
+       spin_lock(&drvdata->spinlock);
+       drvdata->config.addr_idx = index;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(addr_idx);
+
+/*
+ * sysfs read: print the value of the selected address comparator.
+ *
+ * Return: number of bytes written, or -EINVAL when the comparator type is
+ * neither ETM_ADDR_TYPE_NONE nor ETM_ADDR_TYPE_SINGLE.
+ */
+static ssize_t addr_single_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long addr;
+       u8 sel;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (config->addr_type[sel] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[sel] != ETM_ADDR_TYPE_SINGLE) {
+               spin_unlock(&drvdata->spinlock);
+               return -EINVAL;
+       }
+       addr = config->addr_val[sel];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", addr);
+}
+
+/*
+ * sysfs write: program the selected comparator with a single address and
+ * mark it ETM_ADDR_TYPE_SINGLE.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when the comparator type is neither ETM_ADDR_TYPE_NONE nor
+ * ETM_ADDR_TYPE_SINGLE.
+ */
+static ssize_t addr_single_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long addr;
+       int err;
+       u8 sel;
+
+       err = kstrtoul(buf, 16, &addr);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (config->addr_type[sel] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[sel] != ETM_ADDR_TYPE_SINGLE) {
+               spin_unlock(&drvdata->spinlock);
+               return -EINVAL;
+       }
+       config->addr_val[sel] = addr;
+       config->addr_type[sel] = ETM_ADDR_TYPE_SINGLE;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(addr_single);
+
+/*
+ * sysfs read: print the address pair held by the selected comparator and the
+ * one following it.
+ *
+ * Return: number of bytes written, or -EPERM when the selected index is odd
+ * or the two comparators are not both ETM_ADDR_TYPE_NONE or both
+ * ETM_ADDR_TYPE_RANGE.
+ */
+static ssize_t addr_range_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long low, high;
+       bool unused, ranged;
+       u8 sel;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (sel % 2 != 0)
+               goto err_unlock;
+
+       unused = config->addr_type[sel] == ETM_ADDR_TYPE_NONE &&
+                config->addr_type[sel + 1] == ETM_ADDR_TYPE_NONE;
+       ranged = config->addr_type[sel] == ETM_ADDR_TYPE_RANGE &&
+                config->addr_type[sel + 1] == ETM_ADDR_TYPE_RANGE;
+       if (!unused && !ranged)
+               goto err_unlock;
+
+       low = config->addr_val[sel];
+       high = config->addr_val[sel + 1];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx %#lx\n", low, high);
+
+err_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return -EPERM;
+}
+
+/*
+ * sysfs write: program an address range from two hexadecimal values.
+ *
+ * The selected comparator receives the first value and the following one the
+ * second; both are marked ETM_ADDR_TYPE_RANGE and bit (index / 2) is set in
+ * config->enable_ctrl1.
+ *
+ * Return: @size on success; -EINVAL when two values cannot be parsed or the
+ * first is greater than the second; -EPERM when the selected index is odd or
+ * the two comparators are not both ETM_ADDR_TYPE_NONE or both
+ * ETM_ADDR_TYPE_RANGE.
+ */
+static ssize_t addr_range_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long low, high;
+       bool unused, ranged;
+       u8 sel;
+
+       if (sscanf(buf, "%lx %lx", &low, &high) != 2)
+               return -EINVAL;
+
+       /* The lower comparator must not hold the higher address. */
+       if (low > high)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (sel % 2 != 0)
+               goto err_unlock;
+
+       unused = config->addr_type[sel] == ETM_ADDR_TYPE_NONE &&
+                config->addr_type[sel + 1] == ETM_ADDR_TYPE_NONE;
+       ranged = config->addr_type[sel] == ETM_ADDR_TYPE_RANGE &&
+                config->addr_type[sel + 1] == ETM_ADDR_TYPE_RANGE;
+       if (!unused && !ranged)
+               goto err_unlock;
+
+       config->addr_val[sel] = low;
+       config->addr_type[sel] = ETM_ADDR_TYPE_RANGE;
+       config->addr_val[sel + 1] = high;
+       config->addr_type[sel + 1] = ETM_ADDR_TYPE_RANGE;
+       config->enable_ctrl1 |= (1 << (sel/2));
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+
+err_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return -EPERM;
+}
+static DEVICE_ATTR_RW(addr_range);
+
+/*
+ * sysfs read: print the value of the selected address comparator.
+ *
+ * Return: number of bytes written, or -EPERM when the comparator type is
+ * neither ETM_ADDR_TYPE_NONE nor ETM_ADDR_TYPE_START.
+ */
+static ssize_t addr_start_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long addr;
+       u8 sel;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (config->addr_type[sel] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[sel] != ETM_ADDR_TYPE_START) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+       addr = config->addr_val[sel];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", addr);
+}
+
+/*
+ * sysfs write: program the selected comparator as a trace start address.
+ *
+ * Stores the address, marks the comparator ETM_ADDR_TYPE_START, sets bit
+ * <index> in config->startstop_ctrl and sets ETMTECR1_START_STOP in
+ * config->enable_ctrl1.  The named constant replaces a bare BIT(25) so this
+ * handler matches addr_stop_store(), which sets the same bit.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EPERM when the comparator type is neither ETM_ADDR_TYPE_NONE nor
+ * ETM_ADDR_TYPE_START.
+ */
+static ssize_t addr_start_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       u8 idx;
+       int ret;
+       unsigned long val;
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret)
+               return ret;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (!(config->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
+             config->addr_type[idx] == ETM_ADDR_TYPE_START)) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+
+       config->addr_val[idx] = val;
+       config->addr_type[idx] = ETM_ADDR_TYPE_START;
+       config->startstop_ctrl |= (1 << idx);
+       config->enable_ctrl1 |= ETMTECR1_START_STOP;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(addr_start);
+
+/*
+ * sysfs read: print the value of the selected address comparator.
+ *
+ * Return: number of bytes written, or -EPERM when the comparator type is
+ * neither ETM_ADDR_TYPE_NONE nor ETM_ADDR_TYPE_STOP.
+ */
+static ssize_t addr_stop_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long addr;
+       u8 sel;
+
+       spin_lock(&drvdata->spinlock);
+       sel = config->addr_idx;
+       if (config->addr_type[sel] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[sel] != ETM_ADDR_TYPE_STOP) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+       addr = config->addr_val[sel];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", addr);
+}
+
+/*
+ * sysfs write: program the selected comparator as a trace stop address.
+ *
+ * Stores the address, marks the comparator ETM_ADDR_TYPE_STOP, sets bit
+ * <index + 16> in config->startstop_ctrl and sets ETMTECR1_START_STOP in
+ * config->enable_ctrl1.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EPERM when the comparator type is neither ETM_ADDR_TYPE_NONE nor
+ * ETM_ADDR_TYPE_STOP.
+ */
+static ssize_t addr_stop_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       u8 idx;
+       int ret;
+       unsigned long val;
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret)
+               return ret;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (!(config->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
+             config->addr_type[idx] == ETM_ADDR_TYPE_STOP)) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+
+       config->addr_val[idx] = val;
+       config->addr_type[idx] = ETM_ADDR_TYPE_STOP;
+       /*
+        * BIT() shifts an unsigned constant; a plain "1 << (idx + 16)" would
+        * shift into the sign bit of an int should idx ever reach 15.
+        */
+       config->startstop_ctrl |= BIT(idx + 16);
+       config->enable_ctrl1 |= ETMTECR1_START_STOP;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(addr_stop);
+
+/* sysfs read: print the access type of the selected address comparator. */
+static ssize_t addr_acctype_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long acctype;
+
+       /* Index and array element are read under one lock hold. */
+       spin_lock(&drvdata->spinlock);
+       acctype = config->addr_acctype[config->addr_idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", acctype);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it as the access type of
+ * the selected address comparator.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t addr_acctype_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long acctype;
+       int err;
+
+       err = kstrtoul(buf, 16, &acctype);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       config->addr_acctype[config->addr_idx] = acctype;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(addr_acctype);
+
+/* sysfs read: print the selected counter index in hexadecimal. */
+static ssize_t cntr_idx_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.cntr_idx);
+}
+
+/*
+ * sysfs write: select the counter that the other cntr_* attributes operate
+ * on.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when the index is not below drvdata->nr_cntr.
+ */
+static ssize_t cntr_idx_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long index;
+       int err;
+
+       err = kstrtoul(buf, 16, &index);
+       if (err)
+               return err;
+
+       if (index >= drvdata->nr_cntr)
+               return -EINVAL;
+
+       /*
+        * Other handlers read the index several times inside one locked
+        * section; taking the same lock here keeps it stable for them.
+        */
+       spin_lock(&drvdata->spinlock);
+       drvdata->config.cntr_idx = index;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_idx);
+
+/* sysfs read: print the reload value of the selected counter in hex. */
+static ssize_t cntr_rld_val_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long reload;
+
+       /* Index and array element are read under one lock hold. */
+       spin_lock(&drvdata->spinlock);
+       reload = config->cntr_rld_val[config->cntr_idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", reload);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it as the reload value of
+ * the selected counter.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t cntr_rld_val_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long reload;
+       int err;
+
+       err = kstrtoul(buf, 16, &reload);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       config->cntr_rld_val[config->cntr_idx] = reload;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_rld_val);
+
+/* sysfs read: print config->cntr_event[] for the selected counter in hex. */
+static ssize_t cntr_event_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long event;
+
+       /* Index and array element are read under one lock hold. */
+       spin_lock(&drvdata->spinlock);
+       event = config->cntr_event[config->cntr_idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, in config->cntr_event[] for the selected counter.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t cntr_event_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       config->cntr_event[config->cntr_idx] = event & ETM_EVENT_MASK;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_event);
+
+/* sysfs read: print the reload event of the selected counter in hex. */
+static ssize_t cntr_rld_event_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long event;
+
+       /* Index and array element are read under one lock hold. */
+       spin_lock(&drvdata->spinlock);
+       event = config->cntr_rld_event[config->cntr_idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as the reload event of the selected counter.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t cntr_rld_event_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       config->cntr_rld_event[config->cntr_idx] = event & ETM_EVENT_MASK;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_rld_event);
+
+/*
+ * sysfs read: print one "counter N: value" line for each of the
+ * drvdata->nr_cntr counters.
+ *
+ * When drvdata->mode reads as zero the values cached in config->cntr_val[]
+ * are reported under the spinlock; otherwise each ETMCNTVRn register is read
+ * through etm_readl().
+ *
+ * Each line is appended at buf + ret.  Formatting every line at the start of
+ * @buf kept only the last counter while still returning the summed length of
+ * all lines, so the reported size did not match the buffer content.
+ *
+ * Return: total number of bytes written to @buf.
+ */
+static ssize_t cntr_val_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       int i, ret = 0;
+       u32 val;
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+
+       if (!local_read(&drvdata->mode)) {
+               spin_lock(&drvdata->spinlock);
+               for (i = 0; i < drvdata->nr_cntr; i++)
+                       ret += sprintf(buf + ret, "counter %d: %x\n",
+                                      i, config->cntr_val[i]);
+               spin_unlock(&drvdata->spinlock);
+               return ret;
+       }
+
+       for (i = 0; i < drvdata->nr_cntr; i++) {
+               val = etm_readl(drvdata, ETMCNTVRn(i));
+               ret += sprintf(buf + ret, "counter %d: %x\n", i, val);
+       }
+
+       return ret;
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it in config->cntr_val[]
+ * for the selected counter.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t cntr_val_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
+       unsigned long count;
+       int err;
+
+       err = kstrtoul(buf, 16, &count);
+       if (err)
+               return err;
+
+       spin_lock(&drvdata->spinlock);
+       config->cntr_val[config->cntr_idx] = count;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_val);
+
+/* sysfs read: print config->seq_12_event in hexadecimal. */
+static ssize_t seq_12_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_12_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_12_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_12_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_12_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_12_event);
+
+/* sysfs read: print config->seq_21_event in hexadecimal. */
+static ssize_t seq_21_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_21_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_21_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_21_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_21_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_21_event);
+
+/* sysfs read: print config->seq_23_event in hexadecimal. */
+static ssize_t seq_23_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_23_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_23_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_23_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_23_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_23_event);
+
+/* sysfs read: print config->seq_31_event in hexadecimal. */
+static ssize_t seq_31_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_31_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_31_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_31_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_31_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_31_event);
+
+/* sysfs read: print config->seq_32_event in hexadecimal. */
+static ssize_t seq_32_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_32_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_32_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_32_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_32_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_32_event);
+
+/* sysfs read: print config->seq_13_event in hexadecimal. */
+static ssize_t seq_13_event_show(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.seq_13_event);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it, masked with
+ * ETM_EVENT_MASK, as config->seq_13_event.
+ *
+ * Return: @size on success, or the kstrtoul() error code.
+ */
+static ssize_t seq_13_event_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.seq_13_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_13_event);
+
+/*
+ * sysfs read: print the sequencer state in hexadecimal.
+ *
+ * When drvdata->mode reads as zero the cached config->seq_curr_state is
+ * reported.  Otherwise ETMSQR is read (masked with ETM_SQR_MASK) with a
+ * runtime-PM reference held, under the spinlock with interrupts saved, and
+ * bracketed by CS_UNLOCK()/CS_LOCK().
+ */
+static ssize_t seq_curr_state_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long irqflags;
+       unsigned long state;
+
+       if (local_read(&drvdata->mode)) {
+               pm_runtime_get_sync(drvdata->dev);
+               spin_lock_irqsave(&drvdata->spinlock, irqflags);
+
+               CS_UNLOCK(drvdata->base);
+               state = (etm_readl(drvdata, ETMSQR) & ETM_SQR_MASK);
+               CS_LOCK(drvdata->base);
+
+               spin_unlock_irqrestore(&drvdata->spinlock, irqflags);
+               pm_runtime_put(drvdata->dev);
+       } else {
+               state = drvdata->config.seq_curr_state;
+       }
+
+       return sprintf(buf, "%#lx\n", state);
+}
+
+/*
+ * sysfs write: parse a hexadecimal value and store it as
+ * config->seq_curr_state.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when the value exceeds ETM_SEQ_STATE_MAX_VAL.
+ */
+static ssize_t seq_curr_state_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long state;
+       int err;
+
+       err = kstrtoul(buf, 16, &state);
+       if (err)
+               return err;
+
+       if (state > ETM_SEQ_STATE_MAX_VAL)
+               return -EINVAL;
+
+       drvdata->config.seq_curr_state = state;
+
+       return size;
+}
+static DEVICE_ATTR_RW(seq_curr_state);
+
+/* sysfs read: print the selected context ID comparator index in hex. */
+static ssize_t ctxid_idx_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return sprintf(buf, "%#lx\n",
+                      (unsigned long)drvdata->config.ctxid_idx);
+}
+
+/*
+ * sysfs write: select the context ID comparator that the other ctxid_*
+ * attributes operate on.
+ *
+ * Return: @size on success, the kstrtoul() error code on a parse failure, or
+ * -EINVAL when the index is not below drvdata->nr_ctxid_cmp.
+ */
+static ssize_t ctxid_idx_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long index;
+       int err;
+
+       err = kstrtoul(buf, 16, &index);
+       if (err)
+               return err;
+
+       if (index >= drvdata->nr_ctxid_cmp)
+               return -EINVAL;
+
+       /*
+        * Other handlers read the index several times inside one locked
+        * section; taking the same lock here keeps it stable for them.
+        */
+       spin_lock(&drvdata->spinlock);
+       drvdata->config.ctxid_idx = index;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_idx);
+
+/*
+ * Print, in hexadecimal, the ctxid_vpid entry selected by
+ * config->ctxid_idx.
+ */
+static ssize_t ctxid_pid_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *cfg = &drvdata->config;
+       unsigned long vpid;
+
+       /* Index and table entry are read together under the lock. */
+       spin_lock(&drvdata->spinlock);
+       vpid = cfg->ctxid_vpid[cfg->ctxid_idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return sprintf(buf, "%#lx\n", vpid);
+}
+
+/*
+ * Parse a hexadecimal vpid from @buf, translate it with
+ * coresight_vpid_to_pid() and record both values in the ctxid_vpid and
+ * ctxid_pid entries selected by config->ctxid_idx.  Returns @size on
+ * success or the kstrtoul() error.
+ */
+static ssize_t ctxid_pid_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *cfg = &drvdata->config;
+       unsigned long vpid, pid;
+       int err;
+
+       err = kstrtoul(buf, 16, &vpid);
+       if (err)
+               return err;
+
+       /* The translation is done before the spinlock is taken. */
+       pid = coresight_vpid_to_pid(vpid);
+
+       spin_lock(&drvdata->spinlock);
+       cfg->ctxid_vpid[cfg->ctxid_idx] = vpid;
+       cfg->ctxid_pid[cfg->ctxid_idx] = pid;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_pid);
+
+/* Print config->ctxid_mask in hexadecimal. */
+static ssize_t ctxid_mask_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long mask = drvdata->config.ctxid_mask;
+
+       return sprintf(buf, "%#lx\n", mask);
+}
+
+/*
+ * Parse a hexadecimal value from @buf and store it, unmodified, in
+ * config->ctxid_mask.  Returns @size on success or the kstrtoul() error.
+ */
+static ssize_t ctxid_mask_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long mask;
+       int err;
+
+       err = kstrtoul(buf, 16, &mask);
+       if (err)
+               return err;
+
+       drvdata->config.ctxid_mask = mask;
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_mask);
+
+/* Print config->sync_freq in hexadecimal. */
+static ssize_t sync_freq_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long freq = drvdata->config.sync_freq;
+
+       return sprintf(buf, "%#lx\n", freq);
+}
+
+/*
+ * Parse a hexadecimal value from @buf and store it in config->sync_freq
+ * after masking with ETM_SYNC_MASK.  Returns @size on success or the
+ * kstrtoul() error.
+ */
+static ssize_t sync_freq_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long freq;
+       int err;
+
+       err = kstrtoul(buf, 16, &freq);
+       if (err)
+               return err;
+
+       drvdata->config.sync_freq = freq & ETM_SYNC_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(sync_freq);
+
+/* Print config->timestamp_event in hexadecimal. */
+static ssize_t timestamp_event_show(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event = drvdata->config.timestamp_event;
+
+       return sprintf(buf, "%#lx\n", event);
+}
+
+/*
+ * Parse a hexadecimal value from @buf and store it in
+ * config->timestamp_event after masking with ETM_EVENT_MASK.  Returns
+ * @size on success or the kstrtoul() error.
+ */
+static ssize_t timestamp_event_store(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event;
+       int err;
+
+       err = kstrtoul(buf, 16, &event);
+       if (err)
+               return err;
+
+       drvdata->config.timestamp_event = event & ETM_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(timestamp_event);
+
+/* Print drvdata->cpu in decimal; output is bounded to PAGE_SIZE. */
+static ssize_t cpu_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       int cpu = drvdata->cpu;
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", cpu);
+}
+static DEVICE_ATTR_RO(cpu);
+
+/* Print the value returned by etm_get_trace_id() in hexadecimal. */
+static ssize_t traceid_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long id = etm_get_trace_id(drvdata);
+
+       return sprintf(buf, "%#lx\n", id);
+}
+
+/*
+ * Parse a hexadecimal value from @buf and store it in drvdata->traceid
+ * after masking with ETM_TRACEID_MASK.  Returns @size on success or the
+ * kstrtoul() error.
+ */
+static ssize_t traceid_store(struct device *dev,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long id;
+       int err;
+
+       err = kstrtoul(buf, 16, &id);
+       if (err)
+               return err;
+
+       drvdata->traceid = id & ETM_TRACEID_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(traceid);
+
+/*
+ * NULL-terminated list of the device attributes that make up
+ * coresight_etm_group.
+ */
+static struct attribute *coresight_etm_attrs[] = {
+       &dev_attr_nr_addr_cmp.attr,
+       &dev_attr_nr_cntr.attr,
+       &dev_attr_nr_ctxid_cmp.attr,
+       &dev_attr_etmsr.attr,
+       &dev_attr_reset.attr,
+       &dev_attr_mode.attr,
+       &dev_attr_trigger_event.attr,
+       &dev_attr_enable_event.attr,
+       &dev_attr_fifofull_level.attr,
+       &dev_attr_addr_idx.attr,
+       &dev_attr_addr_single.attr,
+       &dev_attr_addr_range.attr,
+       &dev_attr_addr_start.attr,
+       &dev_attr_addr_stop.attr,
+       &dev_attr_addr_acctype.attr,
+       &dev_attr_cntr_idx.attr,
+       &dev_attr_cntr_rld_val.attr,
+       &dev_attr_cntr_event.attr,
+       &dev_attr_cntr_rld_event.attr,
+       &dev_attr_cntr_val.attr,
+       &dev_attr_seq_12_event.attr,
+       &dev_attr_seq_21_event.attr,
+       &dev_attr_seq_23_event.attr,
+       &dev_attr_seq_31_event.attr,
+       &dev_attr_seq_32_event.attr,
+       &dev_attr_seq_13_event.attr,
+       &dev_attr_seq_curr_state.attr,
+       &dev_attr_ctxid_idx.attr,
+       &dev_attr_ctxid_pid.attr,
+       &dev_attr_ctxid_mask.attr,
+       &dev_attr_sync_freq.attr,
+       &dev_attr_timestamp_event.attr,
+       &dev_attr_traceid.attr,
+       &dev_attr_cpu.attr,
+       NULL,
+};
+
+/*
+ * Shorthand for coresight_simple_func() with the driver data type fixed
+ * to struct etm_drvdata.  Each invocation below is expected to provide
+ * the dev_attr_<name> object listed in coresight_etm_mgmt_attrs; the
+ * exact expansion lives in coresight_simple_func(), which is defined
+ * elsewhere.
+ *
+ * NOTE(review): the attribute named etmtssvr is paired with the ETMTSSCR
+ * offset, so name and register do not match.  Confirm whether this is
+ * intentional before renaming, since the name may be exposed to user
+ * space as a sysfs file.
+ */
+#define coresight_etm3x_simple_func(name, offset)                      \
+       coresight_simple_func(struct etm_drvdata, name, offset)
+
+coresight_etm3x_simple_func(etmccr, ETMCCR);
+coresight_etm3x_simple_func(etmccer, ETMCCER);
+coresight_etm3x_simple_func(etmscr, ETMSCR);
+coresight_etm3x_simple_func(etmidr, ETMIDR);
+coresight_etm3x_simple_func(etmcr, ETMCR);
+coresight_etm3x_simple_func(etmtraceidr, ETMTRACEIDR);
+coresight_etm3x_simple_func(etmteevr, ETMTEEVR);
+coresight_etm3x_simple_func(etmtssvr, ETMTSSCR);
+coresight_etm3x_simple_func(etmtecr1, ETMTECR1);
+coresight_etm3x_simple_func(etmtecr2, ETMTECR2);
+
+/*
+ * NULL-terminated list of the register attributes that make up
+ * coresight_etm_mgmt_group.
+ */
+static struct attribute *coresight_etm_mgmt_attrs[] = {
+       &dev_attr_etmccr.attr,
+       &dev_attr_etmccer.attr,
+       &dev_attr_etmscr.attr,
+       &dev_attr_etmidr.attr,
+       &dev_attr_etmcr.attr,
+       &dev_attr_etmtraceidr.attr,
+       &dev_attr_etmteevr.attr,
+       &dev_attr_etmtssvr.attr,
+       &dev_attr_etmtecr1.attr,
+       &dev_attr_etmtecr2.attr,
+       NULL,
+};
+
+/* Unnamed attribute group wrapping coresight_etm_attrs. */
+static const struct attribute_group coresight_etm_group = {
+       .attrs = coresight_etm_attrs,
+};
+
+/* Attribute group named "mgmt" wrapping coresight_etm_mgmt_attrs. */
+static const struct attribute_group coresight_etm_mgmt_group = {
+       .attrs = coresight_etm_mgmt_attrs,
+       .name = "mgmt",
+};
+
+/*
+ * NULL-terminated array holding both attribute groups.  It has external
+ * linkage (no 'static'), so it can be referenced from other source files.
+ */
+const struct attribute_group *coresight_etm_groups[] = {
+       &coresight_etm_group,
+       &coresight_etm_mgmt_group,
+       NULL,
+};
diff --git a/drivers/hwtracing/coresight/coresight-etm3x.c b/drivers/hwtracing/coresight/coresight-etm3x.c

index d630b7ece73521ccf8cd7b320ebd75ecc92eb1d3..d83ab82672e4e136ffb443d033c5c23ccba697f5 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-etm3x.c
+++ b/drivers/hwtracing/coresight/coresight-etm3x.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Program Flow Trace driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -11,7 +13,7 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
@@ -27,14 +29,21 @@
  #include <linux/cpu.h>
  #include <linux/of.h>
  #include <linux/coresight.h>
+#include <linux/coresight-pmu.h>
  #include <linux/amba/bus.h>
  #include <linux/seq_file.h>
  #include <linux/uaccess.h>
  #include <linux/clk.h>
+#include <linux/perf_event.h>
  #include <asm/sections.h>
  
  #include "coresight-etm.h"
+#include "coresight-etm-perf.h"
  
+/*
+ * Not really modular but using module_param is the easiest way to
+ * remain consistent with existing use cases for now.
+ */
  static int boot_enable;
  module_param_named(boot_enable, boot_enable, int, S_IRUGO);
  
@@ -42,45 +51,16 @@ module_param_named(boot_enable, boot_enable, int, S_IRUGO);
  static int etm_count;
  static struct etm_drvdata *etmdrvdata[NR_CPUS];
  
-static inline void etm_writel(struct etm_drvdata *drvdata,
-                             u32 val, u32 off)
-{
-       if (drvdata->use_cp14) {
-               if (etm_writel_cp14(off, val)) {
-                       dev_err(drvdata->dev,
-                               "invalid CP14 access to ETM reg: %#x", off);
-               }
-       } else {
-               writel_relaxed(val, drvdata->base + off);
-       }
-}
-
-static inline unsigned int etm_readl(struct etm_drvdata *drvdata, u32 off)
-{
-       u32 val;
-
-       if (drvdata->use_cp14) {
-               if (etm_readl_cp14(off, &val)) {
-                       dev_err(drvdata->dev,
-                               "invalid CP14 access to ETM reg: %#x", off);
-               }
-       } else {
-               val = readl_relaxed(drvdata->base + off);
-       }
-
-       return val;
-}
-
  /*
   * Memory mapped writes to clear os lock are not supported on some processors
   * and OS lock must be unlocked before any memory mapped access on such
   * processors, otherwise memory mapped reads/writes will be invalid.
   */
-static void etm_os_unlock(void *info)
+static void etm_os_unlock(struct etm_drvdata *drvdata)
  {
-       struct etm_drvdata *drvdata = (struct etm_drvdata *)info;
         /* Writing any value to ETMOSLAR unlocks the trace registers */
         etm_writel(drvdata, 0x0, ETMOSLAR);
+       drvdata->os_unlock = true;
         isb();
  }
  
@@ -215,1431 +195,450 @@ static void etm_clr_prog(struct etm_drvdata *drvdata)
         }
  }
  
-static void etm_set_default(struct etm_drvdata *drvdata)
-{
-       int i;
-
-       drvdata->trigger_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->enable_event = ETM_HARD_WIRE_RES_A;
-
-       drvdata->seq_12_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->seq_21_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->seq_23_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->seq_31_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->seq_32_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->seq_13_event = ETM_DEFAULT_EVENT_VAL;
-       drvdata->timestamp_event = ETM_DEFAULT_EVENT_VAL;
-
-       for (i = 0; i < drvdata->nr_cntr; i++) {
-               drvdata->cntr_rld_val[i] = 0x0;
-               drvdata->cntr_event[i] = ETM_DEFAULT_EVENT_VAL;
-               drvdata->cntr_rld_event[i] = ETM_DEFAULT_EVENT_VAL;
-               drvdata->cntr_val[i] = 0x0;
-       }
-
-       drvdata->seq_curr_state = 0x0;
-       drvdata->ctxid_idx = 0x0;
-       for (i = 0; i < drvdata->nr_ctxid_cmp; i++) {
-               drvdata->ctxid_pid[i] = 0x0;
-               drvdata->ctxid_vpid[i] = 0x0;
-       }
-
-       drvdata->ctxid_mask = 0x0;
-}
-
-static void etm_enable_hw(void *info)
+void etm_set_default(struct etm_config *config)
  {
         int i;
-       u32 etmcr;
-       struct etm_drvdata *drvdata = info;
  
-       CS_UNLOCK(drvdata->base);
-
-       /* Turn engine on */
-       etm_clr_pwrdwn(drvdata);
-       /* Apply power to trace registers */
-       etm_set_pwrup(drvdata);
-       /* Make sure all registers are accessible */
-       etm_os_unlock(drvdata);
-
-       etm_set_prog(drvdata);
-
-       etmcr = etm_readl(drvdata, ETMCR);
-       etmcr &= (ETMCR_PWD_DWN | ETMCR_ETM_PRG);
-       etmcr |= drvdata->port_size;
-       etm_writel(drvdata, drvdata->ctrl | etmcr, ETMCR);
-       etm_writel(drvdata, drvdata->trigger_event, ETMTRIGGER);
-       etm_writel(drvdata, drvdata->startstop_ctrl, ETMTSSCR);
-       etm_writel(drvdata, drvdata->enable_event, ETMTEEVR);
-       etm_writel(drvdata, drvdata->enable_ctrl1, ETMTECR1);
-       etm_writel(drvdata, drvdata->fifofull_level, ETMFFLR);
-       for (i = 0; i < drvdata->nr_addr_cmp; i++) {
-               etm_writel(drvdata, drvdata->addr_val[i], ETMACVRn(i));
-               etm_writel(drvdata, drvdata->addr_acctype[i], ETMACTRn(i));
-       }
-       for (i = 0; i < drvdata->nr_cntr; i++) {
-               etm_writel(drvdata, drvdata->cntr_rld_val[i], ETMCNTRLDVRn(i));
-               etm_writel(drvdata, drvdata->cntr_event[i], ETMCNTENRn(i));
-               etm_writel(drvdata, drvdata->cntr_rld_event[i],
-                          ETMCNTRLDEVRn(i));
-               etm_writel(drvdata, drvdata->cntr_val[i], ETMCNTVRn(i));
-       }
-       etm_writel(drvdata, drvdata->seq_12_event, ETMSQ12EVR);
-       etm_writel(drvdata, drvdata->seq_21_event, ETMSQ21EVR);
-       etm_writel(drvdata, drvdata->seq_23_event, ETMSQ23EVR);
-       etm_writel(drvdata, drvdata->seq_31_event, ETMSQ31EVR);
-       etm_writel(drvdata, drvdata->seq_32_event, ETMSQ32EVR);
-       etm_writel(drvdata, drvdata->seq_13_event, ETMSQ13EVR);
-       etm_writel(drvdata, drvdata->seq_curr_state, ETMSQR);
-       for (i = 0; i < drvdata->nr_ext_out; i++)
-               etm_writel(drvdata, ETM_DEFAULT_EVENT_VAL, ETMEXTOUTEVRn(i));
-       for (i = 0; i < drvdata->nr_ctxid_cmp; i++)
-               etm_writel(drvdata, drvdata->ctxid_pid[i], ETMCIDCVRn(i));
-       etm_writel(drvdata, drvdata->ctxid_mask, ETMCIDCMR);
-       etm_writel(drvdata, drvdata->sync_freq, ETMSYNCFR);
-       /* No external input selected */
-       etm_writel(drvdata, 0x0, ETMEXTINSELR);
-       etm_writel(drvdata, drvdata->timestamp_event, ETMTSEVR);
-       /* No auxiliary control selected */
-       etm_writel(drvdata, 0x0, ETMAUXCR);
-       etm_writel(drvdata, drvdata->traceid, ETMTRACEIDR);
-       /* No VMID comparator value selected */
-       etm_writel(drvdata, 0x0, ETMVMIDCVR);
-
-       /* Ensures trace output is enabled from this ETM */
-       etm_writel(drvdata, drvdata->ctrl | ETMCR_ETM_EN | etmcr, ETMCR);
-
-       etm_clr_prog(drvdata);
-       CS_LOCK(drvdata->base);
-
-       dev_dbg(drvdata->dev, "cpu: %d enable smp call done\n", drvdata->cpu);
-}
-
-static int etm_trace_id(struct coresight_device *csdev)
-{
-       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-       unsigned long flags;
-       int trace_id = -1;
-
-       if (!drvdata->enable)
-               return drvdata->traceid;
-       pm_runtime_get_sync(csdev->dev.parent);
-
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-
-       CS_UNLOCK(drvdata->base);
-       trace_id = (etm_readl(drvdata, ETMTRACEIDR) & ETM_TRACEID_MASK);
-       CS_LOCK(drvdata->base);
-
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-       pm_runtime_put(csdev->dev.parent);
-
-       return trace_id;
-}
-
-static int etm_enable(struct coresight_device *csdev)
-{
-       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-       int ret;
-
-       pm_runtime_get_sync(csdev->dev.parent);
-       spin_lock(&drvdata->spinlock);
+       if (WARN_ON_ONCE(!config))
+               return;
  
         /*
-        * Configure the ETM only if the CPU is online.  If it isn't online
-        * hw configuration will take place when 'CPU_STARTING' is received
-        * in @etm_cpu_callback.
+        * Taken verbatim from the TRM:
+        *
+        * To trace all memory:
+        *  set bit [24] in register 0x009, the ETMTECR1, to 1
+        *  set all other bits in register 0x009, the ETMTECR1, to 0
+        *  set all bits in register 0x007, the ETMTECR2, to 0
+        *  set register 0x008, the ETMTEEVR, to 0x6F (TRUE).
          */
-       if (cpu_online(drvdata->cpu)) {
-               ret = smp_call_function_single(drvdata->cpu,
-                                              etm_enable_hw, drvdata, 1);
-               if (ret)
-                       goto err;
-       }
-
-       drvdata->enable = true;
-       drvdata->sticky_enable = true;
+       config->enable_ctrl1 = BIT(24);
+       config->enable_ctrl2 = 0x0;
+       config->enable_event = ETM_HARD_WIRE_RES_A;
  
-       spin_unlock(&drvdata->spinlock);
-
-       dev_info(drvdata->dev, "ETM tracing enabled\n");
-       return 0;
-err:
-       spin_unlock(&drvdata->spinlock);
-       pm_runtime_put(csdev->dev.parent);
-       return ret;
-}
+       config->trigger_event = ETM_DEFAULT_EVENT_VAL;
+       config->enable_event = ETM_HARD_WIRE_RES_A;
  
-static void etm_disable_hw(void *info)
-{
-       int i;
-       struct etm_drvdata *drvdata = info;
-
-       CS_UNLOCK(drvdata->base);
-       etm_set_prog(drvdata);
-
-       /* Program trace enable to low by using always false event */
-       etm_writel(drvdata, ETM_HARD_WIRE_RES_A | ETM_EVENT_NOT_A, ETMTEEVR);
-
-       /* Read back sequencer and counters for post trace analysis */
-       drvdata->seq_curr_state = (etm_readl(drvdata, ETMSQR) & ETM_SQR_MASK);
-
-       for (i = 0; i < drvdata->nr_cntr; i++)
-               drvdata->cntr_val[i] = etm_readl(drvdata, ETMCNTVRn(i));
-
-       etm_set_pwrdwn(drvdata);
-       CS_LOCK(drvdata->base);
-
-       dev_dbg(drvdata->dev, "cpu: %d disable smp call done\n", drvdata->cpu);
-}
-
-static void etm_disable(struct coresight_device *csdev)
-{
-       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       /*
-        * Taking hotplug lock here protects from clocks getting disabled
-        * with tracing being left on (crash scenario) if user disable occurs
-        * after cpu online mask indicates the cpu is offline but before the
-        * DYING hotplug callback is serviced by the ETM driver.
-        */
-       get_online_cpus();
-       spin_lock(&drvdata->spinlock);
-
-       /*
-        * Executing etm_disable_hw on the cpu whose ETM is being disabled
-        * ensures that register writes occur when cpu is powered.
-        */
-       smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1);
-       drvdata->enable = false;
-
-       spin_unlock(&drvdata->spinlock);
-       put_online_cpus();
-       pm_runtime_put(csdev->dev.parent);
-
-       dev_info(drvdata->dev, "ETM tracing disabled\n");
-}
-
-static const struct coresight_ops_source etm_source_ops = {
-       .trace_id       = etm_trace_id,
-       .enable         = etm_enable,
-       .disable        = etm_disable,
-};
-
-static const struct coresight_ops etm_cs_ops = {
-       .source_ops     = &etm_source_ops,
-};
-
-static ssize_t nr_addr_cmp_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_addr_cmp;
-       return sprintf(buf, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_addr_cmp);
-
-static ssize_t nr_cntr_show(struct device *dev,
-                           struct device_attribute *attr, char *buf)
-{      unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_cntr;
-       return sprintf(buf, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_cntr);
-
-static ssize_t nr_ctxid_cmp_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_ctxid_cmp;
-       return sprintf(buf, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_ctxid_cmp);
-
-static ssize_t etmsr_show(struct device *dev,
-                         struct device_attribute *attr, char *buf)
-{
-       unsigned long flags, val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       pm_runtime_get_sync(drvdata->dev);
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       CS_UNLOCK(drvdata->base);
-
-       val = etm_readl(drvdata, ETMSR);
-
-       CS_LOCK(drvdata->base);
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-       pm_runtime_put(drvdata->dev);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(etmsr);
-
-static ssize_t reset_store(struct device *dev,
-                          struct device_attribute *attr,
-                          const char *buf, size_t size)
-{
-       int i, ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       if (val) {
-               spin_lock(&drvdata->spinlock);
-               drvdata->mode = ETM_MODE_EXCLUDE;
-               drvdata->ctrl = 0x0;
-               drvdata->trigger_event = ETM_DEFAULT_EVENT_VAL;
-               drvdata->startstop_ctrl = 0x0;
-               drvdata->addr_idx = 0x0;
-               for (i = 0; i < drvdata->nr_addr_cmp; i++) {
-                       drvdata->addr_val[i] = 0x0;
-                       drvdata->addr_acctype[i] = 0x0;
-                       drvdata->addr_type[i] = ETM_ADDR_TYPE_NONE;
-               }
-               drvdata->cntr_idx = 0x0;
-
-               etm_set_default(drvdata);
-               spin_unlock(&drvdata->spinlock);
-       }
-
-       return size;
-}
-static DEVICE_ATTR_WO(reset);
-
-static ssize_t mode_show(struct device *dev,
-                        struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->mode;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t mode_store(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->mode = val & ETM_MODE_ALL;
-
-       if (drvdata->mode & ETM_MODE_EXCLUDE)
-               drvdata->enable_ctrl1 |= ETMTECR1_INC_EXC;
-       else
-               drvdata->enable_ctrl1 &= ~ETMTECR1_INC_EXC;
-
-       if (drvdata->mode & ETM_MODE_CYCACC)
-               drvdata->ctrl |= ETMCR_CYC_ACC;
-       else
-               drvdata->ctrl &= ~ETMCR_CYC_ACC;
-
-       if (drvdata->mode & ETM_MODE_STALL) {
-               if (!(drvdata->etmccr & ETMCCR_FIFOFULL)) {
-                       dev_warn(drvdata->dev, "stall mode not supported\n");
-                       ret = -EINVAL;
-                       goto err_unlock;
-               }
-               drvdata->ctrl |= ETMCR_STALL_MODE;
-        } else
-               drvdata->ctrl &= ~ETMCR_STALL_MODE;
-
-       if (drvdata->mode & ETM_MODE_TIMESTAMP) {
-               if (!(drvdata->etmccer & ETMCCER_TIMESTAMP)) {
-                       dev_warn(drvdata->dev, "timestamp not supported\n");
-                       ret = -EINVAL;
-                       goto err_unlock;
-               }
-               drvdata->ctrl |= ETMCR_TIMESTAMP_EN;
-       } else
-               drvdata->ctrl &= ~ETMCR_TIMESTAMP_EN;
-
-       if (drvdata->mode & ETM_MODE_CTXID)
-               drvdata->ctrl |= ETMCR_CTXID_SIZE;
-       else
-               drvdata->ctrl &= ~ETMCR_CTXID_SIZE;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-
-err_unlock:
-       spin_unlock(&drvdata->spinlock);
-       return ret;
-}
-static DEVICE_ATTR_RW(mode);
-
-static ssize_t trigger_event_show(struct device *dev,
-                                 struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->trigger_event;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t trigger_event_store(struct device *dev,
-                                  struct device_attribute *attr,
-                                  const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       drvdata->trigger_event = val & ETM_EVENT_MASK;
-
-       return size;
-}
-static DEVICE_ATTR_RW(trigger_event);
-
-static ssize_t enable_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->enable_event;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t enable_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       drvdata->enable_event = val & ETM_EVENT_MASK;
-
-       return size;
-}
-static DEVICE_ATTR_RW(enable_event);
-
-static ssize_t fifofull_level_show(struct device *dev,
-                                  struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->fifofull_level;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t fifofull_level_store(struct device *dev,
-                                   struct device_attribute *attr,
-                                   const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       drvdata->fifofull_level = val;
-
-       return size;
-}
-static DEVICE_ATTR_RW(fifofull_level);
-
-static ssize_t addr_idx_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->addr_idx;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t addr_idx_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       if (val >= drvdata->nr_addr_cmp)
-               return -EINVAL;
-
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->addr_idx = val;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_idx);
-
-static ssize_t addr_single_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_SINGLE)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EINVAL;
-       }
-
-       val = drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t addr_single_store(struct device *dev,
-                                struct device_attribute *attr,
-                                const char *buf, size_t size)
-{
-       u8 idx;
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_SINGLE)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EINVAL;
-       }
-
-       drvdata->addr_val[idx] = val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_SINGLE;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_single);
-
-static ssize_t addr_range_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
-{
-       u8 idx;
-       unsigned long val1, val2;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (idx % 2 != 0) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-       if (!((drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE) ||
-             (drvdata->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE))) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val1 = drvdata->addr_val[idx];
-       val2 = drvdata->addr_val[idx + 1];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx %#lx\n", val1, val2);
-}
-
-static ssize_t addr_range_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val1, val2;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
-               return -EINVAL;
-       /* Lower address comparator cannot have a higher address value */
-       if (val1 > val2)
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (idx % 2 != 0) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-       if (!((drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE) ||
-             (drvdata->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE))) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = val1;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_RANGE;
-       drvdata->addr_val[idx + 1] = val2;
-       drvdata->addr_type[idx + 1] = ETM_ADDR_TYPE_RANGE;
-       drvdata->enable_ctrl1 |= (1 << (idx/2));
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_range);
-
-static ssize_t addr_start_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_START)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val = drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t addr_start_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       u8 idx;
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_START)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_START;
-       drvdata->startstop_ctrl |= (1 << idx);
-       drvdata->enable_ctrl1 |= BIT(25);
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_start);
-
-static ssize_t addr_stop_show(struct device *dev,
-                             struct device_attribute *attr, char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_STOP)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val = drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t addr_stop_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       u8 idx;
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_STOP)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_STOP;
-       drvdata->startstop_ctrl |= (1 << (idx + 16));
-       drvdata->enable_ctrl1 |= ETMTECR1_START_STOP;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_stop);
-
-static ssize_t addr_acctype_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       val = drvdata->addr_acctype[drvdata->addr_idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t addr_acctype_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->addr_acctype[drvdata->addr_idx] = val;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(addr_acctype);
-
-static ssize_t cntr_idx_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->cntr_idx;
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t cntr_idx_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       if (val >= drvdata->nr_cntr)
-               return -EINVAL;
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_idx = val;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_idx);
-
-static ssize_t cntr_rld_val_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       val = drvdata->cntr_rld_val[drvdata->cntr_idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t cntr_rld_val_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_rld_val[drvdata->cntr_idx] = val;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_rld_val);
-
-static ssize_t cntr_event_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       val = drvdata->cntr_event[drvdata->cntr_idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t cntr_event_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_event[drvdata->cntr_idx] = val & ETM_EVENT_MASK;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_event);
-
-static ssize_t cntr_rld_event_show(struct device *dev,
-                                  struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       val = drvdata->cntr_rld_event[drvdata->cntr_idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t cntr_rld_event_store(struct device *dev,
-                                   struct device_attribute *attr,
-                                   const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_rld_event[drvdata->cntr_idx] = val & ETM_EVENT_MASK;
-       spin_unlock(&drvdata->spinlock);
-
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_rld_event);
-
-static ssize_t cntr_val_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
-{
-       int i, ret = 0;
-       u32 val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (!drvdata->enable) {
-               spin_lock(&drvdata->spinlock);
-               for (i = 0; i < drvdata->nr_cntr; i++)
-                       ret += sprintf(buf, "counter %d: %x\n",
-                                      i, drvdata->cntr_val[i]);
-               spin_unlock(&drvdata->spinlock);
-               return ret;
-       }
-
-       for (i = 0; i < drvdata->nr_cntr; i++) {
-               val = etm_readl(drvdata, ETMCNTVRn(i));
-               ret += sprintf(buf, "counter %d: %x\n", i, val);
-       }
-
-       return ret;
-}
-
-static ssize_t cntr_val_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       config->seq_12_event = ETM_DEFAULT_EVENT_VAL;
+       config->seq_21_event = ETM_DEFAULT_EVENT_VAL;
+       config->seq_23_event = ETM_DEFAULT_EVENT_VAL;
+       config->seq_31_event = ETM_DEFAULT_EVENT_VAL;
+       config->seq_32_event = ETM_DEFAULT_EVENT_VAL;
+       config->seq_13_event = ETM_DEFAULT_EVENT_VAL;
+       config->timestamp_event = ETM_DEFAULT_EVENT_VAL;
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       for (i = 0; i < ETM_MAX_CNTR; i++) {
+               config->cntr_rld_val[i] = 0x0;
+               config->cntr_event[i] = ETM_DEFAULT_EVENT_VAL;
+               config->cntr_rld_event[i] = ETM_DEFAULT_EVENT_VAL;
+               config->cntr_val[i] = 0x0;
+       }
  
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_val[drvdata->cntr_idx] = val;
-       spin_unlock(&drvdata->spinlock);
+       config->seq_curr_state = 0x0;
+       config->ctxid_idx = 0x0;
+       for (i = 0; i < ETM_MAX_CTXID_CMP; i++) {
+               config->ctxid_pid[i] = 0x0;
+               config->ctxid_vpid[i] = 0x0;
+       }
  
-       return size;
+       config->ctxid_mask = 0x0;
  }
-static DEVICE_ATTR_RW(cntr_val);
  
-static ssize_t seq_12_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
+void etm_config_trace_mode(struct etm_config *config)
  {
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       u32 flags, mode;
  
-       val = drvdata->seq_12_event;
-       return sprintf(buf, "%#lx\n", val);
-}
+       mode = config->mode;
  
-static ssize_t seq_12_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       mode &= (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER);
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       /* excluding kernel AND user space doesn't make sense */
+       if (mode == (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER))
+               return;
  
-       drvdata->seq_12_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_12_event);
+       /* nothing to do if neither flag is set */
+       if (!(mode & ETM_MODE_EXCL_KERN) && !(mode & ETM_MODE_EXCL_USER))
+               return;
  
-static ssize_t seq_21_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       flags = (1 << 0 |       /* instruction execute */
+                3 << 3 |       /* ARM instruction */
+                0 << 5 |       /* No data value comparison */
+                0 << 7 |       /* No exact match */
+                0 << 8);       /* Ignore context ID */
  
-       val = drvdata->seq_21_event;
-       return sprintf(buf, "%#lx\n", val);
-}
+       /* No need to worry about single address comparators. */
+       config->enable_ctrl2 = 0x0;
  
-static ssize_t seq_21_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Bit 0 is address range comparator 1 */
+       config->enable_ctrl1 = ETMTECR1_ADDR_COMP_1;
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       /*
+        * On ETMv3.5:
+        * ETMACTRn[13,11] == Non-secure state comparison control
+        * ETMACTRn[12,10] == Secure state comparison control
+        *
+        * b00 == Match in all modes in this state
+        * b01 == Do not match in any mode in this state
+        * b10 == Match in all modes except user mode in this state
+        * b11 == Match only in user mode in this state
+        */
  
-       drvdata->seq_21_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_21_event);
+       /* Tracing in secure mode is not supported at this time */
+       flags |= (0 << 12 | 1 << 10);
  
-static ssize_t seq_23_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       if (mode & ETM_MODE_EXCL_USER) {
+               /* exclude user, match all modes except user mode */
+               flags |= (1 << 13 | 0 << 11);
+       } else {
+               /* exclude kernel, match only in user mode */
+               flags |= (1 << 13 | 1 << 11);
+       }
  
-       val = drvdata->seq_23_event;
-       return sprintf(buf, "%#lx\n", val);
+       /*
+        * The ETMEEVR register is already set to "hard wire A".  As such
+        * all there is to do is setup an address comparator that spans
+        * the entire address range and configure the state and mode bits.
+        */
+       config->addr_val[0] = (u32) 0x0;
+       config->addr_val[1] = (u32) ~0x0;
+       config->addr_acctype[0] = flags;
+       config->addr_acctype[1] = flags;
+       config->addr_type[0] = ETM_ADDR_TYPE_RANGE;
+       config->addr_type[1] = ETM_ADDR_TYPE_RANGE;
  }
  
-static ssize_t seq_23_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
+#define ETM3X_SUPPORTED_OPTIONS (ETMCR_CYC_ACC | ETMCR_TIMESTAMP_EN)
+
+static int etm_parse_event_config(struct etm_drvdata *drvdata,
+                                 struct perf_event_attr *attr)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_config *config = &drvdata->config;
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       if (!attr)
+               return -EINVAL;
  
-       drvdata->seq_23_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_23_event);
+       /* Clear configuration from previous run */
+       memset(config, 0, sizeof(struct etm_config));
  
-static ssize_t seq_31_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       if (attr->exclude_kernel)
+               config->mode = ETM_MODE_EXCL_KERN;
  
-       val = drvdata->seq_31_event;
-       return sprintf(buf, "%#lx\n", val);
-}
+       if (attr->exclude_user)
+               config->mode = ETM_MODE_EXCL_USER;
  
-static ssize_t seq_31_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Always start from the default config */
+       etm_set_default(config);
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       /*
+        * By default the tracers are configured to trace the whole address
+        * range.  Narrow the field only if requested by user space.
+        */
+       if (config->mode)
+               etm_config_trace_mode(config);
  
-       drvdata->seq_31_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_31_event);
+       /*
+        * At this time only cycle accurate and timestamp options are
+        * available.
+        */
+       if (attr->config & ~ETM3X_SUPPORTED_OPTIONS)
+               return -EINVAL;
  
-static ssize_t seq_32_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       config->ctrl = attr->config;
  
-       val = drvdata->seq_32_event;
-       return sprintf(buf, "%#lx\n", val);
+       return 0;
  }
  
-static ssize_t seq_32_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
+static void etm_enable_hw(void *info)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       int i;
+       u32 etmcr;
+       struct etm_drvdata *drvdata = info;
+       struct etm_config *config = &drvdata->config;
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       CS_UNLOCK(drvdata->base);
  
-       drvdata->seq_32_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_32_event);
+       /* Turn engine on */
+       etm_clr_pwrdwn(drvdata);
+       /* Apply power to trace registers */
+       etm_set_pwrup(drvdata);
+       /* Make sure all registers are accessible */
+       etm_os_unlock(drvdata);
  
-static ssize_t seq_13_event_show(struct device *dev,
-                                struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       etm_set_prog(drvdata);
+
+       etmcr = etm_readl(drvdata, ETMCR);
+       /* Clear setting from a previous run if need be */
+       etmcr &= ~ETM3X_SUPPORTED_OPTIONS;
+       etmcr |= drvdata->port_size;
+       etmcr |= ETMCR_ETM_EN;
+       etm_writel(drvdata, config->ctrl | etmcr, ETMCR);
+       etm_writel(drvdata, config->trigger_event, ETMTRIGGER);
+       etm_writel(drvdata, config->startstop_ctrl, ETMTSSCR);
+       etm_writel(drvdata, config->enable_event, ETMTEEVR);
+       etm_writel(drvdata, config->enable_ctrl1, ETMTECR1);
+       etm_writel(drvdata, config->fifofull_level, ETMFFLR);
+       for (i = 0; i < drvdata->nr_addr_cmp; i++) {
+               etm_writel(drvdata, config->addr_val[i], ETMACVRn(i));
+               etm_writel(drvdata, config->addr_acctype[i], ETMACTRn(i));
+       }
+       for (i = 0; i < drvdata->nr_cntr; i++) {
+               etm_writel(drvdata, config->cntr_rld_val[i], ETMCNTRLDVRn(i));
+               etm_writel(drvdata, config->cntr_event[i], ETMCNTENRn(i));
+               etm_writel(drvdata, config->cntr_rld_event[i],
+                          ETMCNTRLDEVRn(i));
+               etm_writel(drvdata, config->cntr_val[i], ETMCNTVRn(i));
+       }
+       etm_writel(drvdata, config->seq_12_event, ETMSQ12EVR);
+       etm_writel(drvdata, config->seq_21_event, ETMSQ21EVR);
+       etm_writel(drvdata, config->seq_23_event, ETMSQ23EVR);
+       etm_writel(drvdata, config->seq_31_event, ETMSQ31EVR);
+       etm_writel(drvdata, config->seq_32_event, ETMSQ32EVR);
+       etm_writel(drvdata, config->seq_13_event, ETMSQ13EVR);
+       etm_writel(drvdata, config->seq_curr_state, ETMSQR);
+       for (i = 0; i < drvdata->nr_ext_out; i++)
+               etm_writel(drvdata, ETM_DEFAULT_EVENT_VAL, ETMEXTOUTEVRn(i));
+       for (i = 0; i < drvdata->nr_ctxid_cmp; i++)
+               etm_writel(drvdata, config->ctxid_pid[i], ETMCIDCVRn(i));
+       etm_writel(drvdata, config->ctxid_mask, ETMCIDCMR);
+       etm_writel(drvdata, config->sync_freq, ETMSYNCFR);
+       /* No external input selected */
+       etm_writel(drvdata, 0x0, ETMEXTINSELR);
+       etm_writel(drvdata, config->timestamp_event, ETMTSEVR);
+       /* No auxiliary control selected */
+       etm_writel(drvdata, 0x0, ETMAUXCR);
+       etm_writel(drvdata, drvdata->traceid, ETMTRACEIDR);
+       /* No VMID comparator value selected */
+       etm_writel(drvdata, 0x0, ETMVMIDCVR);
+
+       etm_clr_prog(drvdata);
+       CS_LOCK(drvdata->base);
  
-       val = drvdata->seq_13_event;
-       return sprintf(buf, "%#lx\n", val);
+       dev_dbg(drvdata->dev, "cpu: %d enable smp call done\n", drvdata->cpu);
  }
  
-static ssize_t seq_13_event_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
+static int etm_cpu_id(struct coresight_device *csdev)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       drvdata->seq_13_event = val & ETM_EVENT_MASK;
-       return size;
+       return drvdata->cpu;
  }
-static DEVICE_ATTR_RW(seq_13_event);
  
-static ssize_t seq_curr_state_show(struct device *dev,
-                                  struct device_attribute *attr, char *buf)
+int etm_get_trace_id(struct etm_drvdata *drvdata)
  {
-       unsigned long val, flags;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long flags;
+       int trace_id = -1;
  
-       if (!drvdata->enable) {
-               val = drvdata->seq_curr_state;
+       if (!drvdata)
                 goto out;
-       }
+
+       if (!local_read(&drvdata->mode))
+               return drvdata->traceid;
  
         pm_runtime_get_sync(drvdata->dev);
+
         spin_lock_irqsave(&drvdata->spinlock, flags);
  
         CS_UNLOCK(drvdata->base);
-       val = (etm_readl(drvdata, ETMSQR) & ETM_SQR_MASK);
+       trace_id = (etm_readl(drvdata, ETMTRACEIDR) & ETM_TRACEID_MASK);
         CS_LOCK(drvdata->base);
  
         spin_unlock_irqrestore(&drvdata->spinlock, flags);
         pm_runtime_put(drvdata->dev);
-out:
-       return sprintf(buf, "%#lx\n", val);
-}
-
-static ssize_t seq_curr_state_store(struct device *dev,
-                                   struct device_attribute *attr,
-                                   const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       if (val > ETM_SEQ_STATE_MAX_VAL)
-               return -EINVAL;
  
-       drvdata->seq_curr_state = val;
+out:
+       return trace_id;
  
-       return size;
  }
-static DEVICE_ATTR_RW(seq_curr_state);
  
-static ssize_t ctxid_idx_show(struct device *dev,
-                             struct device_attribute *attr, char *buf)
+static int etm_trace_id(struct coresight_device *csdev)
  {
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       val = drvdata->ctxid_idx;
-       return sprintf(buf, "%#lx\n", val);
+       return etm_get_trace_id(drvdata);
  }
  
-static ssize_t ctxid_idx_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
+static int etm_enable_perf(struct coresight_device *csdev,
+                          struct perf_event_attr *attr)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       if (val >= drvdata->nr_ctxid_cmp)
+       if (WARN_ON_ONCE(drvdata->cpu != smp_processor_id()))
                 return -EINVAL;
  
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->ctxid_idx = val;
-       spin_unlock(&drvdata->spinlock);
+       /* Configure the tracer based on the session's specifics */
+       etm_parse_event_config(drvdata, attr);
+       /* And enable it */
+       etm_enable_hw(drvdata);
  
-       return size;
+       return 0;
  }
-static DEVICE_ATTR_RW(ctxid_idx);
  
-static ssize_t ctxid_pid_show(struct device *dev,
-                             struct device_attribute *attr, char *buf)
+static int etm_enable_sysfs(struct coresight_device *csdev)
  {
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       int ret;
  
         spin_lock(&drvdata->spinlock);
-       val = drvdata->ctxid_vpid[drvdata->ctxid_idx];
-       spin_unlock(&drvdata->spinlock);
-
-       return sprintf(buf, "%#lx\n", val);
-}
  
-static ssize_t ctxid_pid_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       int ret;
-       unsigned long vpid, pid;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /*
+        * Configure the ETM only if the CPU is online.  If it isn't online
+        * hw configuration will take place when 'CPU_STARTING' is received
+        * in @etm_cpu_callback.
+        */
+       if (cpu_online(drvdata->cpu)) {
+               ret = smp_call_function_single(drvdata->cpu,
+                                              etm_enable_hw, drvdata, 1);
+               if (ret)
+                       goto err;
+       }
  
-       ret = kstrtoul(buf, 16, &vpid);
-       if (ret)
-               return ret;
+       drvdata->sticky_enable = true;
+       spin_unlock(&drvdata->spinlock);
  
-       pid = coresight_vpid_to_pid(vpid);
+       dev_info(drvdata->dev, "ETM tracing enabled\n");
+       return 0;
  
-       spin_lock(&drvdata->spinlock);
-       drvdata->ctxid_pid[drvdata->ctxid_idx] = pid;
-       drvdata->ctxid_vpid[drvdata->ctxid_idx] = vpid;
+err:
         spin_unlock(&drvdata->spinlock);
-
-       return size;
+       return ret;
  }
-static DEVICE_ATTR_RW(ctxid_pid);
  
-static ssize_t ctxid_mask_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static int etm_enable(struct coresight_device *csdev,
+                     struct perf_event_attr *attr, u32 mode)
  {
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       int ret;
+       u32 val;
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       val = drvdata->ctxid_mask;
-       return sprintf(buf, "%#lx\n", val);
-}
+       val = local_cmpxchg(&drvdata->mode, CS_MODE_DISABLED, mode);
  
-static ssize_t ctxid_mask_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Someone is already using the tracer */
+       if (val)
+               return -EBUSY;
+
+       switch (mode) {
+       case CS_MODE_SYSFS:
+               ret = etm_enable_sysfs(csdev);
+               break;
+       case CS_MODE_PERF:
+               ret = etm_enable_perf(csdev, attr);
+               break;
+       default:
+               ret = -EINVAL;
+       }
  
-       ret = kstrtoul(buf, 16, &val);
+       /* The tracer didn't start */
         if (ret)
-               return ret;
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
  
-       drvdata->ctxid_mask = val;
-       return size;
+       return ret;
  }
-static DEVICE_ATTR_RW(ctxid_mask);
  
-static ssize_t sync_freq_show(struct device *dev,
-                             struct device_attribute *attr, char *buf)
+static void etm_disable_hw(void *info)
  {
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->sync_freq;
-       return sprintf(buf, "%#lx\n", val);
-}
+       int i;
+       struct etm_drvdata *drvdata = info;
+       struct etm_config *config = &drvdata->config;
  
-static ssize_t sync_freq_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       CS_UNLOCK(drvdata->base);
+       etm_set_prog(drvdata);
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       /* Read back sequencer and counters for post trace analysis */
+       config->seq_curr_state = (etm_readl(drvdata, ETMSQR) & ETM_SQR_MASK);
  
-       drvdata->sync_freq = val & ETM_SYNC_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(sync_freq);
+       for (i = 0; i < drvdata->nr_cntr; i++)
+               config->cntr_val[i] = etm_readl(drvdata, ETMCNTVRn(i));
  
-static ssize_t timestamp_event_show(struct device *dev,
-                                   struct device_attribute *attr, char *buf)
-{
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       etm_set_pwrdwn(drvdata);
+       CS_LOCK(drvdata->base);
  
-       val = drvdata->timestamp_event;
-       return sprintf(buf, "%#lx\n", val);
+       dev_dbg(drvdata->dev, "cpu: %d disable smp call done\n", drvdata->cpu);
  }
  
-static ssize_t timestamp_event_store(struct device *dev,
-                                    struct device_attribute *attr,
-                                    const char *buf, size_t size)
+static void etm_disable_perf(struct coresight_device *csdev)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
+       if (WARN_ON_ONCE(drvdata->cpu != smp_processor_id()))
+               return;
  
-       drvdata->timestamp_event = val & ETM_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(timestamp_event);
+       CS_UNLOCK(drvdata->base);
  
-static ssize_t cpu_show(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       int val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Setting the prog bit disables tracing immediately */
+       etm_set_prog(drvdata);
  
-       val = drvdata->cpu;
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       /*
+        * There is no way to know when the tracer will be used again so
+        * power down the tracer.
+        */
+       etm_set_pwrdwn(drvdata);
  
+       CS_LOCK(drvdata->base);
  }
-static DEVICE_ATTR_RO(cpu);
  
-static ssize_t traceid_show(struct device *dev,
-                           struct device_attribute *attr, char *buf)
+static void etm_disable_sysfs(struct coresight_device *csdev)
  {
-       unsigned long val, flags;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       if (!drvdata->enable) {
-               val = drvdata->traceid;
-               goto out;
-       }
+       /*
+        * Taking hotplug lock here protects from clocks getting disabled
+        * with tracing being left on (crash scenario) if user disable occurs
+        * after cpu online mask indicates the cpu is offline but before the
+        * DYING hotplug callback is serviced by the ETM driver.
+        */
+       get_online_cpus();
+       spin_lock(&drvdata->spinlock);
  
-       pm_runtime_get_sync(drvdata->dev);
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       CS_UNLOCK(drvdata->base);
+       /*
+        * Executing etm_disable_hw on the cpu whose ETM is being disabled
+        * ensures that register writes occur when cpu is powered.
+        */
+       smp_call_function_single(drvdata->cpu, etm_disable_hw, drvdata, 1);
  
-       val = (etm_readl(drvdata, ETMTRACEIDR) & ETM_TRACEID_MASK);
+       spin_unlock(&drvdata->spinlock);
+       put_online_cpus();
  
-       CS_LOCK(drvdata->base);
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-       pm_runtime_put(drvdata->dev);
-out:
-       return sprintf(buf, "%#lx\n", val);
+       dev_info(drvdata->dev, "ETM tracing disabled\n");
  }
  
-static ssize_t traceid_store(struct device *dev,
-                            struct device_attribute *attr,
-                            const char *buf, size_t size)
+static void etm_disable(struct coresight_device *csdev)
  {
-       int ret;
-       unsigned long val;
-       struct etm_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       ret = kstrtoul(buf, 16, &val);
-       if (ret)
-               return ret;
-
-       drvdata->traceid = val & ETM_TRACEID_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(traceid);
-
-static struct attribute *coresight_etm_attrs[] = {
-       &dev_attr_nr_addr_cmp.attr,
-       &dev_attr_nr_cntr.attr,
-       &dev_attr_nr_ctxid_cmp.attr,
-       &dev_attr_etmsr.attr,
-       &dev_attr_reset.attr,
-       &dev_attr_mode.attr,
-       &dev_attr_trigger_event.attr,
-       &dev_attr_enable_event.attr,
-       &dev_attr_fifofull_level.attr,
-       &dev_attr_addr_idx.attr,
-       &dev_attr_addr_single.attr,
-       &dev_attr_addr_range.attr,
-       &dev_attr_addr_start.attr,
-       &dev_attr_addr_stop.attr,
-       &dev_attr_addr_acctype.attr,
-       &dev_attr_cntr_idx.attr,
-       &dev_attr_cntr_rld_val.attr,
-       &dev_attr_cntr_event.attr,
-       &dev_attr_cntr_rld_event.attr,
-       &dev_attr_cntr_val.attr,
-       &dev_attr_seq_12_event.attr,
-       &dev_attr_seq_21_event.attr,
-       &dev_attr_seq_23_event.attr,
-       &dev_attr_seq_31_event.attr,
-       &dev_attr_seq_32_event.attr,
-       &dev_attr_seq_13_event.attr,
-       &dev_attr_seq_curr_state.attr,
-       &dev_attr_ctxid_idx.attr,
-       &dev_attr_ctxid_pid.attr,
-       &dev_attr_ctxid_mask.attr,
-       &dev_attr_sync_freq.attr,
-       &dev_attr_timestamp_event.attr,
-       &dev_attr_traceid.attr,
-       &dev_attr_cpu.attr,
-       NULL,
-};
+       u32 mode;
+       struct etm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-#define coresight_simple_func(name, offset)                             \
-static ssize_t name##_show(struct device *_dev,                         \
-                          struct device_attribute *attr, char *buf)    \
-{                                                                       \
-       struct etm_drvdata *drvdata = dev_get_drvdata(_dev->parent);    \
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n",                      \
-                        readl_relaxed(drvdata->base + offset));        \
-}                                                                       \
-DEVICE_ATTR_RO(name)
-
-coresight_simple_func(etmccr, ETMCCR);
-coresight_simple_func(etmccer, ETMCCER);
-coresight_simple_func(etmscr, ETMSCR);
-coresight_simple_func(etmidr, ETMIDR);
-coresight_simple_func(etmcr, ETMCR);
-coresight_simple_func(etmtraceidr, ETMTRACEIDR);
-coresight_simple_func(etmteevr, ETMTEEVR);
-coresight_simple_func(etmtssvr, ETMTSSCR);
-coresight_simple_func(etmtecr1, ETMTECR1);
-coresight_simple_func(etmtecr2, ETMTECR2);
-
-static struct attribute *coresight_etm_mgmt_attrs[] = {
-       &dev_attr_etmccr.attr,
-       &dev_attr_etmccer.attr,
-       &dev_attr_etmscr.attr,
-       &dev_attr_etmidr.attr,
-       &dev_attr_etmcr.attr,
-       &dev_attr_etmtraceidr.attr,
-       &dev_attr_etmteevr.attr,
-       &dev_attr_etmtssvr.attr,
-       &dev_attr_etmtecr1.attr,
-       &dev_attr_etmtecr2.attr,
-       NULL,
-};
+       /*
+        * For as long as the tracer isn't disabled another entity can't
+        * change its status.  As such we can read the status here without
+        * fearing it will change under us.
+        */
+       mode = local_read(&drvdata->mode);
  
-static const struct attribute_group coresight_etm_group = {
-       .attrs = coresight_etm_attrs,
-};
+       switch (mode) {
+       case CS_MODE_DISABLED:
+               break;
+       case CS_MODE_SYSFS:
+               etm_disable_sysfs(csdev);
+               break;
+       case CS_MODE_PERF:
+               etm_disable_perf(csdev);
+               break;
+       default:
+               WARN_ON_ONCE(mode);
+               return;
+       }
  
+       if (mode)
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+}
  
-static const struct attribute_group coresight_etm_mgmt_group = {
-       .attrs = coresight_etm_mgmt_attrs,
-       .name = "mgmt",
+static const struct coresight_ops_source etm_source_ops = {
+       .cpu_id         = etm_cpu_id,
+       .trace_id       = etm_trace_id,
+       .enable         = etm_enable,
+       .disable        = etm_disable,
  };
  
-static const struct attribute_group *coresight_etm_groups[] = {
-       &coresight_etm_group,
-       &coresight_etm_mgmt_group,
-       NULL,
+static const struct coresight_ops etm_cs_ops = {
+       .source_ops     = &etm_source_ops,
  };
  
  static int etm_cpu_callback(struct notifier_block *nfb, unsigned long action,
@@ -1658,7 +657,7 @@ static int etm_cpu_callback(struct notifier_block *nfb, unsigned long action,
                         etmdrvdata[cpu]->os_unlock = true;
                 }
  
-               if (etmdrvdata[cpu]->enable)
+               if (local_read(&etmdrvdata[cpu]->mode))
                         etm_enable_hw(etmdrvdata[cpu]);
                 spin_unlock(&etmdrvdata[cpu]->spinlock);
                 break;
@@ -1671,7 +670,7 @@ static int etm_cpu_callback(struct notifier_block *nfb, unsigned long action,
  
         case CPU_DYING:
                 spin_lock(&etmdrvdata[cpu]->spinlock);
-               if (etmdrvdata[cpu]->enable)
+               if (local_read(&etmdrvdata[cpu]->mode))
                         etm_disable_hw(etmdrvdata[cpu]);
                 spin_unlock(&etmdrvdata[cpu]->spinlock);
                 break;
@@ -1707,6 +706,9 @@ static void etm_init_arch_data(void *info)
         u32 etmccr;
         struct etm_drvdata *drvdata = info;
  
+       /* Make sure all registers are accessible */
+       etm_os_unlock(drvdata);
+
         CS_UNLOCK(drvdata->base);
  
         /* First dummy read */
@@ -1743,40 +745,9 @@ static void etm_init_arch_data(void *info)
         CS_LOCK(drvdata->base);
  }
  
-static void etm_init_default_data(struct etm_drvdata *drvdata)
+static void etm_init_trace_id(struct etm_drvdata *drvdata)
  {
-       /*
-        * A trace ID of value 0 is invalid, so let's start at some
-        * random value that fits in 7 bits and will be just as good.
-        */
-       static int etm3x_traceid = 0x10;
-
-       u32 flags = (1 << 0 | /* instruction execute*/
-                    3 << 3 | /* ARM instruction */
-                    0 << 5 | /* No data value comparison */
-                    0 << 7 | /* No exact mach */
-                    0 << 8 | /* Ignore context ID */
-                    0 << 10); /* Security ignored */
-
-       /*
-        * Initial configuration only - guarantees sources handled by
-        * this driver have a unique ID at startup time but not between
-        * all other types of sources.  For that we lean on the core
-        * framework.
-        */
-       drvdata->traceid = etm3x_traceid++;
-       drvdata->ctrl = (ETMCR_CYC_ACC | ETMCR_TIMESTAMP_EN);
-       drvdata->enable_ctrl1 = ETMTECR1_ADDR_COMP_1;
-       if (drvdata->nr_addr_cmp >= 2) {
-               drvdata->addr_val[0] = (u32) _stext;
-               drvdata->addr_val[1] = (u32) _etext;
-               drvdata->addr_acctype[0] = flags;
-               drvdata->addr_acctype[1] = flags;
-               drvdata->addr_type[0] = ETM_ADDR_TYPE_RANGE;
-               drvdata->addr_type[1] = ETM_ADDR_TYPE_RANGE;
-       }
-
-       etm_set_default(drvdata);
+       drvdata->traceid = coresight_get_trace_id(drvdata->cpu);
  }
  
  static int etm_probe(struct amba_device *adev, const struct amba_id *id)
@@ -1831,9 +802,6 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
         get_online_cpus();
         etmdrvdata[drvdata->cpu] = drvdata;
  
-       if (!smp_call_function_single(drvdata->cpu, etm_os_unlock, drvdata, 1))
-               drvdata->os_unlock = true;
-
         if (smp_call_function_single(drvdata->cpu,
                                      etm_init_arch_data,  drvdata, 1))
                 dev_err(dev, "ETM arch init failed\n");
@@ -1847,7 +815,9 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
                 ret = -EINVAL;
                 goto err_arch_supported;
         }
-       etm_init_default_data(drvdata);
+
+       etm_init_trace_id(drvdata);
+       etm_set_default(&drvdata->config);
  
         desc->type = CORESIGHT_DEV_TYPE_SOURCE;
         desc->subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_PROC;
@@ -1861,6 +831,12 @@ static int etm_probe(struct amba_device *adev, const struct amba_id *id)
                 goto err_arch_supported;
         }
  
+       ret = etm_perf_symlink(drvdata->csdev, true);
+       if (ret) {
+               coresight_unregister(drvdata->csdev);
+               goto err_arch_supported;
+       }
+
         pm_runtime_put(&adev->dev);
         dev_info(dev, "%s initialized\n", (char *)id->data);
  
@@ -1877,17 +853,6 @@ err_arch_supported:
         return ret;
  }
  
-static int etm_remove(struct amba_device *adev)
-{
-       struct etm_drvdata *drvdata = amba_get_drvdata(adev);
-
-       coresight_unregister(drvdata->csdev);
-       if (--etm_count == 0)
-               unregister_hotcpu_notifier(&etm_cpu_notifier);
-
-       return 0;
-}
-
  #ifdef CONFIG_PM
  static int etm_runtime_suspend(struct device *dev)
  {
@@ -1948,13 +913,9 @@ static struct amba_driver etm_driver = {
                 .name   = "coresight-etm3x",
                 .owner  = THIS_MODULE,
                 .pm     = &etm_dev_pm_ops,
+               .suppress_bind_attrs = true,
         },
         .probe          = etm_probe,
-       .remove         = etm_remove,
         .id_table       = etm_ids,
  };
-
-module_amba_driver(etm_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Program Flow Trace driver");
+builtin_amba_driver(etm_driver);
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c b/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c

new file mode 100644 (file)

index 0000000..7c84308
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c
@@ -0,0 +1,2126 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/pm_runtime.h>
+#include <linux/sysfs.h>
+#include "coresight-etm4x.h"
+
+/*
+ * Program the ViewInst include/exclude bits for the comparator pair selected
+ * by config->addr_idx.
+ *
+ * Nothing is changed (and 0 is returned) unless the selected comparator is
+ * configured for instruction address comparison.  In that case the index must
+ * be even and both comparators of the pair must be of range type, otherwise
+ * -EINVAL is returned.
+ */
+static int etm4_set_mode_exclude(struct etmv4_drvdata *drvdata, bool exclude)
+{
+       struct etmv4_config *config = &drvdata->config;
+       u8 idx = config->addr_idx;
+       u8 pair;
+
+       /*
+        * TRCACATRn.TYPE bit[1:0]: type of comparison
+        * the trace unit performs
+        */
+       if (BMVAL(config->addr_acc[idx], 0, 1) != ETM_INSTR_ADDR)
+               return 0;
+
+       if (idx % 2 != 0)
+               return -EINVAL;
+
+       /*
+        * Instruction address comparison: the include/exclude control only
+        * applies to a comparator pair set up as an address range.
+        */
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_RANGE ||
+           config->addr_type[idx + 1] != ETM_ADDR_TYPE_RANGE)
+               return -EINVAL;
+
+       pair = idx / 2;
+       if (exclude) {
+               /* Exclude bit on, include bit off for this pair */
+               config->viiectlr |= BIT(pair + 16);
+               config->viiectlr &= ~BIT(pair);
+       } else {
+               /* Include bit on, exclude bit off for this pair */
+               config->viiectlr |= BIT(pair);
+               config->viiectlr &= ~BIT(pair + 16);
+       }
+
+       return 0;
+}
+
+/*
+ * sysfs read handler: prints drvdata->nr_pe_cmp into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_pe_cmp_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_pe_cmp;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_pe_cmp);
+
+/*
+ * sysfs read handler: prints drvdata->nr_addr_cmp into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_addr_cmp_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_addr_cmp;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_addr_cmp);
+
+/*
+ * sysfs read handler: prints drvdata->nr_cntr into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_cntr_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_cntr;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_cntr);
+
+/*
+ * sysfs read handler: prints drvdata->nr_ext_inp into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_ext_inp_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_ext_inp;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_ext_inp);
+
+/*
+ * sysfs read handler: prints drvdata->numcidc into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t numcidc_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->numcidc;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(numcidc);
+
+/*
+ * sysfs read handler: prints drvdata->numvmidc into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t numvmidc_show(struct device *dev,
+                            struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->numvmidc;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(numvmidc);
+
+/*
+ * sysfs read handler: prints drvdata->nrseqstate into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nrseqstate_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nrseqstate;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nrseqstate);
+
+/*
+ * sysfs read handler: prints drvdata->nr_resource into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_resource_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_resource;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_resource);
+
+/*
+ * sysfs read handler: prints drvdata->nr_ss_cmp into @buf using "%#lx"
+ * and returns the number of bytes written by scnprintf().
+ */
+static ssize_t nr_ss_cmp_show(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long nr = drvdata->nr_ss_cmp;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", nr);
+}
+static DEVICE_ATTR_RO(nr_ss_cmp);
+
+/*
+ * sysfs "reset" write handler.
+ *
+ * Parses @buf as a hexadecimal number and, while holding drvdata->spinlock,
+ * puts the cached trace configuration in drvdata->config back to its reset
+ * values.  config->mode is cleared in full only when the value written is
+ * non-zero; every other field below is reset whatever the value.
+ *
+ * Returns @size on success, -EINVAL when @buf is not a valid hex number.
+ */
+static ssize_t reset_store(struct device *dev,
+                          struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       int i;
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       if (val)
+               config->mode = 0x0;
+
+       /* Disable data tracing: do not trace load and store data transfers */
+       config->mode &= ~(ETM_MODE_LOAD | ETM_MODE_STORE);
+       config->cfg &= ~(BIT(1) | BIT(2));
+
+       /* Disable data value and data address tracing */
+       config->mode &= ~(ETM_MODE_DATA_TRACE_ADDR |
+                          ETM_MODE_DATA_TRACE_VAL);
+       config->cfg &= ~(BIT(16) | BIT(17));
+
+       /* Disable all events tracing */
+       config->eventctrl0 = 0x0;
+       config->eventctrl1 = 0x0;
+
+       /* Disable timestamp event */
+       config->ts_ctrl = 0x0;
+
+       /* Disable stalling */
+       config->stall_ctrl = 0x0;
+
+       /* Reset trace synchronization period to 2^8 = 256 bytes */
+       if (drvdata->syncpr == false)
+               config->syncfreq = 0x8;
+
+       /*
+        * Enable ViewInst to trace everything with start-stop logic in
+        * started state. ARM recommends start-stop logic is set before
+        * each trace run.
+        */
+       config->vinst_ctrl |= BIT(0);
+       /*
+        * nr_addr_cmp is a count (it bounds the address comparator loop
+        * below), not a flag.  Comparing it against true only matched
+        * when it was exactly 1, so test for "non-zero" instead.
+        */
+       if (drvdata->nr_addr_cmp > 0) {
+               config->mode |= ETM_MODE_VIEWINST_STARTSTOP;
+               /* SSSTATUS, bit[9] */
+               config->vinst_ctrl |= BIT(9);
+       }
+
+       /* No address range filtering for ViewInst */
+       config->viiectlr = 0x0;
+
+       /* No start-stop filtering for ViewInst */
+       config->vissctlr = 0x0;
+
+       /* Disable seq events */
+       for (i = 0; i < drvdata->nrseqstate-1; i++)
+               config->seq_ctrl[i] = 0x0;
+       config->seq_rst = 0x0;
+       config->seq_state = 0x0;
+
+       /* Disable external input events */
+       config->ext_inp = 0x0;
+
+       /* Clear counter selection and every counter's settings */
+       config->cntr_idx = 0x0;
+       for (i = 0; i < drvdata->nr_cntr; i++) {
+               config->cntrldvr[i] = 0x0;
+               config->cntr_ctrl[i] = 0x0;
+               config->cntr_val[i] = 0x0;
+       }
+
+       /* Clear resource selection and every resource control value */
+       config->res_idx = 0x0;
+       for (i = 0; i < drvdata->nr_resource; i++)
+               config->res_ctrl[i] = 0x0;
+
+       /* Clear the single-shot comparator settings */
+       for (i = 0; i < drvdata->nr_ss_cmp; i++) {
+               config->ss_ctrl[i] = 0x0;
+               config->ss_pe_cmp[i] = 0x0;
+       }
+
+       /* Clear all 2 * nr_addr_cmp address comparator entries */
+       config->addr_idx = 0x0;
+       for (i = 0; i < drvdata->nr_addr_cmp * 2; i++) {
+               config->addr_val[i] = 0x0;
+               config->addr_acc[i] = 0x0;
+               config->addr_type[i] = ETM_ADDR_TYPE_NONE;
+       }
+
+       /* Clear the context ID comparator values and masks */
+       config->ctxid_idx = 0x0;
+       for (i = 0; i < drvdata->numcidc; i++) {
+               config->ctxid_pid[i] = 0x0;
+               config->ctxid_vpid[i] = 0x0;
+       }
+
+       config->ctxid_mask0 = 0x0;
+       config->ctxid_mask1 = 0x0;
+
+       /* Clear the VMID comparator values and masks */
+       config->vmid_idx = 0x0;
+       for (i = 0; i < drvdata->numvmidc; i++)
+               config->vmid_val[i] = 0x0;
+       config->vmid_mask0 = 0x0;
+       config->vmid_mask1 = 0x0;
+
+       drvdata->trcid = drvdata->cpu + 1;
+
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_WO(reset);
+
+/*
+ * sysfs read handler: prints the cached drvdata->config.mode into @buf
+ * using "%#lx" and returns the number of bytes written by scnprintf().
+ */
+static ssize_t mode_show(struct device *dev,
+                        struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long mode = drvdata->config.mode;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", mode);
+}
+
+/*
+ * sysfs "mode" write handler.
+ *
+ * Parses @buf as a hexadecimal number, stores it (masked with
+ * ETMv4_MODE_ALL) in config->mode and then, under drvdata->spinlock,
+ * translates the individual mode flags into bits of the cached config->cfg,
+ * config->eventctrl1, config->stall_ctrl and config->vinst_ctrl values.
+ * Several flags only take effect when the matching drvdata capability
+ * field is set; otherwise the corresponding bit is cleared.
+ *
+ * Returns @size on success, -EINVAL when @buf is not a valid hex number.
+ */
+static ssize_t mode_store(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf, size_t size)
+{
+       unsigned long val, mode;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       config->mode = val & ETMv4_MODE_ALL;
+
+       /*
+        * Include/exclude setting for the currently selected comparator
+        * pair.  The return value of etm4_set_mode_exclude() is not
+        * checked here, so a rejected request does not fail the write.
+        */
+       if (config->mode & ETM_MODE_EXCLUDE)
+               etm4_set_mode_exclude(drvdata, true);
+       else
+               etm4_set_mode_exclude(drvdata, false);
+
+       if (drvdata->instrp0 == true) {
+               /* start by clearing instruction P0 field */
+               config->cfg  &= ~(BIT(1) | BIT(2));
+               if (config->mode & ETM_MODE_LOAD)
+                       /* 0b01 Trace load instructions as P0 instructions */
+                       config->cfg  |= BIT(1);
+               if (config->mode & ETM_MODE_STORE)
+                       /* 0b10 Trace store instructions as P0 instructions */
+                       config->cfg  |= BIT(2);
+               if (config->mode & ETM_MODE_LOAD_STORE)
+                       /*
+                        * 0b11 Trace load and store instructions
+                        * as P0 instructions
+                        */
+                       config->cfg  |= BIT(1) | BIT(2);
+       }
+
+       /* bit[3], Branch broadcast mode */
+       if ((config->mode & ETM_MODE_BB) && (drvdata->trcbb == true))
+               config->cfg |= BIT(3);
+       else
+               config->cfg &= ~BIT(3);
+
+       /* bit[4], Cycle counting instruction trace bit */
+       if ((config->mode & ETMv4_MODE_CYCACC) &&
+               (drvdata->trccci == true))
+               config->cfg |= BIT(4);
+       else
+               config->cfg &= ~BIT(4);
+
+       /* bit[6], Context ID tracing bit */
+       if ((config->mode & ETMv4_MODE_CTXID) && (drvdata->ctxid_size))
+               config->cfg |= BIT(6);
+       else
+               config->cfg &= ~BIT(6);
+
+       /* bit[7], set only when ETM_MODE_VMID is requested and vmid_size != 0 */
+       if ((config->mode & ETM_MODE_VMID) && (drvdata->vmid_size))
+               config->cfg |= BIT(7);
+       else
+               config->cfg &= ~BIT(7);
+
+       /* bits[10:8], Conditional instruction tracing bit */
+       mode = ETM_MODE_COND(config->mode);
+       if (drvdata->trccond == true) {
+               config->cfg &= ~(BIT(8) | BIT(9) | BIT(10));
+               config->cfg |= mode << 8;
+       }
+
+       /* bit[11], Global timestamp tracing bit */
+       if ((config->mode & ETMv4_MODE_TIMESTAMP) && (drvdata->ts_size))
+               config->cfg |= BIT(11);
+       else
+               config->cfg &= ~BIT(11);
+
+       /* bit[12], Return stack enable bit */
+       if ((config->mode & ETM_MODE_RETURNSTACK) &&
+                                       (drvdata->retstack == true))
+               config->cfg |= BIT(12);
+       else
+               config->cfg &= ~BIT(12);
+
+       /* bits[14:13], Q element enable field */
+       mode = ETM_MODE_QELEM(config->mode);
+       /* start by clearing QE bits */
+       config->cfg &= ~(BIT(13) | BIT(14));
+       /* if supported, Q elements with instruction counts are enabled */
+       if ((mode & BIT(0)) && (drvdata->q_support & BIT(0)))
+               config->cfg |= BIT(13);
+       /*
+        * if supported, Q elements with and without instruction
+        * counts are enabled
+        */
+       if ((mode & BIT(1)) && (drvdata->q_support & BIT(1)))
+               config->cfg |= BIT(14);
+
+       /* bit[11], AMBA Trace Bus (ATB) trigger enable bit */
+       if ((config->mode & ETM_MODE_ATB_TRIGGER) &&
+           (drvdata->atbtrig == true))
+               config->eventctrl1 |= BIT(11);
+       else
+               config->eventctrl1 &= ~BIT(11);
+
+       /* bit[12], Low-power state behavior override bit */
+       if ((config->mode & ETM_MODE_LPOVERRIDE) &&
+           (drvdata->lpoverride == true))
+               config->eventctrl1 |= BIT(12);
+       else
+               config->eventctrl1 &= ~BIT(12);
+
+       /* bit[8], Instruction stall bit */
+       if (config->mode & ETM_MODE_ISTALL_EN)
+               config->stall_ctrl |= BIT(8);
+       else
+               config->stall_ctrl &= ~BIT(8);
+
+       /* bit[10], Prioritize instruction trace bit */
+       if (config->mode & ETM_MODE_INSTPRIO)
+               config->stall_ctrl |= BIT(10);
+       else
+               config->stall_ctrl &= ~BIT(10);
+
+       /* bit[13], Trace overflow prevention bit */
+       if ((config->mode & ETM_MODE_NOOVERFLOW) &&
+               (drvdata->nooverflow == true))
+               config->stall_ctrl |= BIT(13);
+       else
+               config->stall_ctrl &= ~BIT(13);
+
+       /* bit[9] Start/stop logic control bit */
+       if (config->mode & ETM_MODE_VIEWINST_STARTSTOP)
+               config->vinst_ctrl |= BIT(9);
+       else
+               config->vinst_ctrl &= ~BIT(9);
+
+       /* bit[10], Whether a trace unit must trace a Reset exception */
+       if (config->mode & ETM_MODE_TRACE_RESET)
+               config->vinst_ctrl |= BIT(10);
+       else
+               config->vinst_ctrl &= ~BIT(10);
+
+       /* bit[11], Whether a trace unit must trace a system error exception */
+       if ((config->mode & ETM_MODE_TRACE_ERR) &&
+               (drvdata->trc_error == true))
+               config->vinst_ctrl |= BIT(11);
+       else
+               config->vinst_ctrl &= ~BIT(11);
+
+       /* Only invoked when a kernel or user exclusion flag is requested */
+       if (config->mode & (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER))
+               etm4_config_trace_mode(config);
+
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(mode);
+
+/*
+ * sysfs read handler: prints the cached drvdata->config.pe_sel into @buf
+ * using "%#lx" and returns the number of bytes written by scnprintf().
+ */
+static ssize_t pe_show(struct device *dev,
+                      struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long sel = drvdata->config.pe_sel;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", sel);
+}
+
+/*
+ * sysfs "pe" write handler.
+ *
+ * Parses @buf as a hexadecimal number and, under drvdata->spinlock, stores
+ * it in config->pe_sel provided it does not exceed drvdata->nr_pe.
+ *
+ * Returns @size on success, -EINVAL for unparsable or out-of-range input.
+ */
+static ssize_t pe_store(struct device *dev,
+                       struct device_attribute *attr,
+                       const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long sel;
+       ssize_t ret;
+
+       if (kstrtoul(buf, 16, &sel))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       if (sel <= drvdata->nr_pe) {
+               config->pe_sel = sel;
+               ret = size;
+       } else {
+               ret = -EINVAL;
+       }
+       spin_unlock(&drvdata->spinlock);
+
+       return ret;
+}
+static DEVICE_ATTR_RW(pe);
+
+/*
+ * sysfs read handler: prints the cached drvdata->config.eventctrl0 into
+ * @buf using "%#lx" and returns the number of bytes written by scnprintf().
+ */
+static ssize_t event_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long ctrl = drvdata->config.eventctrl0;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", ctrl);
+}
+
+/*
+ * sysfs "event" write handler.
+ *
+ * Parses @buf as a hexadecimal number and, under drvdata->spinlock, stores
+ * it in config->eventctrl0 limited to the byte fields selected by
+ * drvdata->nr_event (0x0..0x3).  Any other nr_event value leaves
+ * eventctrl0 untouched.
+ *
+ * Returns @size on success, -EINVAL when @buf is not a valid hex number.
+ */
+static ssize_t event_store(struct device *dev,
+                          struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long val;
+       unsigned long keep;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       switch (drvdata->nr_event) {
+       case 0x0:
+               /* EVENT0, bits[7:0] */
+               keep = 0xFF;
+               break;
+       case 0x1:
+               /* EVENT1, bits[15:8] */
+               keep = 0xFFFF;
+               break;
+       case 0x2:
+               /* EVENT2, bits[23:16] */
+               keep = 0xFFFFFF;
+               break;
+       case 0x3:
+               /* EVENT3, bits[31:24]: the value is stored unmasked */
+               keep = ~0UL;
+               break;
+       default:
+               goto unlock;
+       }
+       config->eventctrl0 = val & keep;
+unlock:
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(event);
+
+/*
+ * sysfs read handler: prints bits[3:0] of the cached
+ * drvdata->config.eventctrl1 into @buf using "%#lx" and returns the number
+ * of bytes written by scnprintf().
+ */
+static ssize_t event_instren_show(struct device *dev,
+                                 struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long insten = BMVAL(drvdata->config.eventctrl1, 0, 3);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", insten);
+}
+
+/*
+ * sysfs "event_instren" write handler.
+ *
+ * Parses @buf as a hexadecimal number and, under drvdata->spinlock,
+ * rewrites the instruction event enable bits (bits[3:0]) of the cached
+ * config->eventctrl1.  All four bits are cleared first; the bits taken from
+ * the written value are then limited by drvdata->nr_event, where 0x0..0x3
+ * allow one to four enable bits starting at bit 0.  Any other nr_event
+ * value leaves the bits cleared.
+ *
+ * Returns @size on success, -EINVAL when @buf is not a valid hex number.
+ */
+static ssize_t event_instren_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t size)
+{
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       /* start by clearing all instruction event enable bits */
+       config->eventctrl1 &= ~(BIT(0) | BIT(1) | BIT(2) | BIT(3));
+       switch (drvdata->nr_event) {
+       case 0x0:
+               /*
+                * generate Event element for event 1: with a single event
+                * its enable bit is bit 0, matching the cases below (this
+                * used to mask with BIT(1), which selects event 2).
+                */
+               config->eventctrl1 |= val & BIT(0);
+               break;
+       case 0x1:
+               /* generate Event element for event 1 and 2 */
+               config->eventctrl1 |= val & (BIT(0) | BIT(1));
+               break;
+       case 0x2:
+               /* generate Event element for event 1, 2 and 3 */
+               config->eventctrl1 |= val & (BIT(0) | BIT(1) | BIT(2));
+               break;
+       case 0x3:
+               /* generate Event element for all 4 events */
+               config->eventctrl1 |= val & 0xF;
+               break;
+       default:
+               break;
+       }
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(event_instren);
+
+/* Print the cached ts_ctrl configuration value in hexadecimal. */
+static ssize_t event_ts_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long ts_ctrl = drvdata->config.ts_ctrl;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", ts_ctrl);
+}
+
+/*
+ * Parse a hexadecimal value and cache it, masked with ETMv4_EVENT_MASK, as
+ * ts_ctrl.  Fails with -EINVAL on unparsable input or when
+ * drvdata->ts_size is zero; returns @size otherwise.
+ */
+static ssize_t event_ts_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long input;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+       if (drvdata->ts_size == 0)
+               return -EINVAL;
+
+       drvdata->config.ts_ctrl = input & ETMv4_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(event_ts);
+
+/* Print the cached syncfreq configuration value in hexadecimal. */
+static ssize_t syncfreq_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long syncfreq = drvdata->config.syncfreq;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", syncfreq);
+}
+
+/*
+ * Parse a hexadecimal value and cache it, masked with ETMv4_SYNC_MASK, as
+ * syncfreq.  Fails with -EINVAL on unparsable input or when
+ * drvdata->syncpr equals true; returns @size otherwise.
+ */
+static ssize_t syncfreq_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long input;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+       if (drvdata->syncpr == true)
+               return -EINVAL;
+
+       drvdata->config.syncfreq = input & ETMv4_SYNC_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(syncfreq);
+
+/* Print the cached ccctlr configuration value in hexadecimal. */
+static ssize_t cyc_threshold_show(struct device *dev,
+                                 struct device_attribute *attr,
+                                 char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long threshold = drvdata->config.ccctlr;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", threshold);
+}
+
+/*
+ * Parse a hexadecimal cycle threshold and cache it as ccctlr.
+ *
+ * The input is reduced with ETM_CYC_THRESHOLD_MASK *before* it is compared
+ * with drvdata->ccitmin.  Checking first and masking afterwards would let
+ * an input above the mask pass the minimum check and then be truncated to
+ * a value below the minimum.
+ *
+ * Returns @size on success, -EINVAL on unparsable input or when the masked
+ * value is smaller than drvdata->ccitmin.
+ */
+static ssize_t cyc_threshold_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t size)
+{
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       /* mask off max threshold before checking min value */
+       val &= ETM_CYC_THRESHOLD_MASK;
+       if (val < drvdata->ccitmin)
+               return -EINVAL;
+
+       config->ccctlr = val;
+       return size;
+}
+static DEVICE_ATTR_RW(cyc_threshold);
+
+/* Print the cached bb_ctrl configuration value in hexadecimal. */
+static ssize_t bb_ctrl_show(struct device *dev,
+                           struct device_attribute *attr,
+                           char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long bb_ctrl = drvdata->config.bb_ctrl;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", bb_ctrl);
+}
+
+/*
+ * Set the cached branch broadcast control value from a hexadecimal string.
+ *
+ * The write is refused with -EINVAL when the input does not parse, when
+ * drvdata->trcbb is false, or when drvdata->nr_addr_cmp is zero.
+ *
+ * Per the ETMv4 TRCBBCTLR layout, bits[7:0] are a bitmap with one bit per
+ * address range comparator and bit[8] selects include (1) or exclude (0)
+ * mode.  The bitmap is therefore not compared against the comparator count
+ * as if it were a number; instead include mode with an empty bitmap is
+ * rejected and only bits[8:0] are stored.
+ *
+ * Returns @size on success.
+ */
+static ssize_t bb_ctrl_store(struct device *dev,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+       if (drvdata->trcbb == false)
+               return -EINVAL;
+       if (!drvdata->nr_addr_cmp)
+               return -EINVAL;
+
+       /*
+        * Bit[8] controls include(1) / exclude(0), bits[7:0] select
+        * individual range comparators.  If include then at least one
+        * range must be selected.
+        */
+       if ((val & BIT(8)) && (BMVAL(val, 0, 7) == 0))
+               return -EINVAL;
+
+       /* keep only the mode bit and the range bitmap */
+       config->bb_ctrl = BMVAL(val, 0, 8);
+       return size;
+}
+static DEVICE_ATTR_RW(bb_ctrl);
+
+/*
+ * Print the part of the cached vinst_ctrl value selected by
+ * ETMv4_EVENT_MASK, in hexadecimal.
+ */
+static ssize_t event_vinst_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long event = drvdata->config.vinst_ctrl & ETMv4_EVENT_MASK;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", event);
+}
+
+/*
+ * Replace the ETMv4_EVENT_MASK portion of the cached vinst_ctrl value with
+ * the parsed hexadecimal input; all other bits are left untouched.
+ * Returns @size, or -EINVAL when the input does not parse.
+ */
+static ssize_t event_vinst_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long event;
+
+       if (kstrtoul(buf, 16, &event) != 0)
+               return -EINVAL;
+
+       /* only the event field of the input is relevant */
+       event &= ETMv4_EVENT_MASK;
+
+       spin_lock(&drvdata->spinlock);
+       config->vinst_ctrl &= ~ETMv4_EVENT_MASK;
+       config->vinst_ctrl |= event;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(event_vinst);
+
+/* Print bits[19:16] of the cached vinst_ctrl value in hexadecimal. */
+static ssize_t s_exlevel_vinst_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long exlevel = BMVAL(drvdata->config.vinst_ctrl, 16, 19);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", exlevel);
+}
+
+/*
+ * Update the EXLEVEL_S bits of the cached vinst_ctrl value.
+ *
+ * Bits 16, 17 and 19 are cleared, then the parsed hexadecimal input,
+ * restricted by drvdata->s_ex_level, is OR-ed in at bit position 16.
+ * Returns @size, or -EINVAL when the input does not parse.
+ */
+static ssize_t s_exlevel_vinst_store(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long levels;
+
+       if (kstrtoul(buf, 16, &levels) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       /* bit[18] is never implemented, so it is not part of the clear */
+       config->vinst_ctrl &= ~(BIT(16) | BIT(17) | BIT(19));
+       /* keep only the levels allowed by s_ex_level, then enable them */
+       levels &= drvdata->s_ex_level;
+       config->vinst_ctrl |= levels << 16;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(s_exlevel_vinst);
+
+/*
+ * Print the EXLEVEL_NS field, bits[23:20] of the cached vinst_ctrl value,
+ * in hexadecimal.
+ */
+static ssize_t ns_exlevel_vinst_show(struct device *dev,
+                                    struct device_attribute *attr,
+                                    char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long exlevel = BMVAL(drvdata->config.vinst_ctrl, 20, 23);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", exlevel);
+}
+
+/*
+ * Update the EXLEVEL_NS bits of the cached vinst_ctrl value.
+ *
+ * Bits 20, 21 and 22 are cleared, then the parsed hexadecimal input,
+ * restricted by drvdata->ns_ex_level, is OR-ed in at bit position 20.
+ * Returns @size, or -EINVAL when the input does not parse.
+ */
+static ssize_t ns_exlevel_vinst_store(struct device *dev,
+                                     struct device_attribute *attr,
+                                     const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long levels;
+
+       if (kstrtoul(buf, 16, &levels) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       /* bit[23] is never implemented, so it is not part of the clear */
+       config->vinst_ctrl &= ~(BIT(20) | BIT(21) | BIT(22));
+       /* keep only the levels allowed by ns_ex_level, then enable them */
+       levels &= drvdata->ns_ex_level;
+       config->vinst_ctrl |= levels << 20;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(ns_exlevel_vinst);
+
+/* Print the currently selected address comparator index in hexadecimal. */
+static ssize_t addr_idx_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long idx = drvdata->config.addr_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", idx);
+}
+
+/*
+ * Select the address comparator that the other addr_* attributes act on.
+ * The parsed hexadecimal index must be below drvdata->nr_addr_cmp * 2;
+ * otherwise, or on unparsable input, -EINVAL is returned.  Returns @size
+ * on success.
+ */
+static ssize_t addr_idx_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long new_idx;
+
+       if (kstrtoul(buf, 16, &new_idx) != 0)
+               return -EINVAL;
+       if (new_idx >= drvdata->nr_addr_cmp * 2)
+               return -EINVAL;
+
+       /*
+        * Other handlers read the index and then use it several times
+        * while holding the spinlock, so the update is done under the
+        * same lock to keep the index stable for them.
+        */
+       spin_lock(&drvdata->spinlock);
+       drvdata->config.addr_idx = new_idx;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(addr_idx);
+
+/*
+ * Print the access type of the selected address comparator as a string:
+ * "instr", "data_load", "data_store" or "data_load_store", decoded from
+ * bits[1:0] of its addr_acc entry.
+ */
+static ssize_t addr_instdatatype_show(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       const char *name;
+       ssize_t len;
+       u8 idx, type;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* TYPE, bits[1:0] */
+       type = BMVAL(config->addr_acc[idx], 0, 1);
+       if (type == ETM_INSTR_ADDR)
+               name = "instr";
+       else if (type == ETM_DATA_LOAD_ADDR)
+               name = "data_load";
+       else if (type == ETM_DATA_STORE_ADDR)
+               name = "data_store";
+       else
+               name = "data_load_store";
+       len = scnprintf(buf, PAGE_SIZE, "%s\n", name);
+       spin_unlock(&drvdata->spinlock);
+       return len;
+}
+
+/*
+ * Set the access type of the selected address comparator from a string.
+ * Only "instr" is acted upon: it clears bits[1:0] of the addr_acc entry.
+ * Any other word is accepted and ignored.  Inputs of 20 or more characters,
+ * or inputs without a word, yield -EINVAL; otherwise @size is returned.
+ */
+static ssize_t addr_instdatatype_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       char word[20] = "";
+       u8 idx;
+
+       /* the length check keeps the unbounded %s below within word[] */
+       if (strlen(buf) >= sizeof(word))
+               return -EINVAL;
+       if (sscanf(buf, "%s", word) != 1)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (strcmp(word, "instr") == 0) {
+               /* TYPE, bits[1:0] */
+               config->addr_acc[idx] &= ~(BIT(0) | BIT(1));
+       }
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(addr_instdatatype);
+
+/*
+ * Print the address value of the selected comparator in hexadecimal.
+ *
+ * The comparator must be unused (ETM_ADDR_TYPE_NONE) or already configured
+ * as a single address comparator (ETM_ADDR_TYPE_SINGLE); otherwise -EPERM
+ * is returned.
+ *
+ * The comparator index is sampled while holding the spinlock, like the
+ * other addr_* handlers do, so that it cannot change between the read of
+ * the index and the array accesses that use it.
+ */
+static ssize_t addr_single_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       u8 idx;
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (!(config->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
+             config->addr_type[idx] == ETM_ADDR_TYPE_SINGLE)) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+       val = (unsigned long)config->addr_val[idx];
+       spin_unlock(&drvdata->spinlock);
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+/*
+ * Program the selected comparator as a single address comparator with the
+ * parsed hexadecimal address.
+ *
+ * Returns @size on success, -EINVAL when the input does not parse, and
+ * -EPERM when the comparator is already in use with a type other than
+ * ETM_ADDR_TYPE_SINGLE.
+ */
+static ssize_t addr_single_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long addr;
+       ssize_t ret = size;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &addr) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[idx] != ETM_ADDR_TYPE_SINGLE) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       config->addr_val[idx] = (u64)addr;
+       config->addr_type[idx] = ETM_ADDR_TYPE_SINGLE;
+
+out_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return ret;
+}
+static DEVICE_ATTR_RW(addr_single);
+
+/*
+ * Print the two address values of the comparator pair starting at the
+ * selected index, as "<low> <high>" in hexadecimal.
+ *
+ * Returns -EPERM when the selected index is odd, or when the two
+ * comparators are neither both ETM_ADDR_TYPE_NONE nor both
+ * ETM_ADDR_TYPE_RANGE.
+ */
+static ssize_t addr_range_show(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long low, high;
+       bool both_none, both_range;
+       u8 idx;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* a pair is addressed through its even-numbered comparator */
+       if (idx % 2 != 0)
+               goto err_unlock;
+
+       both_none = config->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
+                   config->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE;
+       both_range = config->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
+                    config->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE;
+       if (!both_none && !both_range)
+               goto err_unlock;
+
+       low = (unsigned long)config->addr_val[idx];
+       high = (unsigned long)config->addr_val[idx + 1];
+       spin_unlock(&drvdata->spinlock);
+       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", low, high);
+
+err_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return -EPERM;
+}
+
+/*
+ * Program the comparator pair starting at the selected index as an address
+ * range from two hexadecimal values "<low> <high>".
+ *
+ * Returns @size on success; -EINVAL when two values cannot be parsed or
+ * when low > high; -EPERM when the selected index is odd, or when the two
+ * comparators are neither both ETM_ADDR_TYPE_NONE nor both
+ * ETM_ADDR_TYPE_RANGE.
+ */
+static ssize_t addr_range_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long low, high;
+       bool both_none, both_range;
+       ssize_t ret = size;
+       u8 idx;
+
+       if (sscanf(buf, "%lx %lx", &low, &high) != 2)
+               return -EINVAL;
+       /* lower address comparator cannot have a higher address value */
+       if (low > high)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* a pair is addressed through its even-numbered comparator */
+       if (idx % 2 != 0) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       both_none = config->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
+                   config->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE;
+       both_range = config->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
+                    config->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE;
+       if (!both_none && !both_range) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       config->addr_val[idx] = (u64)low;
+       config->addr_type[idx] = ETM_ADDR_TYPE_RANGE;
+       config->addr_val[idx + 1] = (u64)high;
+       config->addr_type[idx + 1] = ETM_ADDR_TYPE_RANGE;
+       /*
+        * Re-apply the include/exclude setting that matches the current
+        * mode whenever comparators become ETM_ADDR_TYPE_RANGE.
+        */
+       etm4_set_mode_exclude(drvdata,
+                             (config->mode & ETM_MODE_EXCLUDE) ? true : false);
+
+out_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return ret;
+}
+static DEVICE_ATTR_RW(addr_range);
+
+/*
+ * Print the address value of the selected comparator in hexadecimal.
+ * Returns -EPERM unless the comparator is unused (ETM_ADDR_TYPE_NONE) or
+ * configured as a start address (ETM_ADDR_TYPE_START).
+ */
+static ssize_t addr_start_show(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long addr;
+       u8 idx;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[idx] != ETM_ADDR_TYPE_START) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+       addr = (unsigned long)config->addr_val[idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", addr);
+}
+
+/*
+ * Program the selected comparator as a start address with the parsed
+ * hexadecimal value.  On success the comparator's bit (bit[idx]) is set in
+ * the cached vissctlr value and bit[9] is set in the cached vinst_ctrl
+ * value.
+ *
+ * Returns @size on success; -EINVAL when the input does not parse or
+ * drvdata->nr_addr_cmp is zero; -EPERM when the comparator is in use with
+ * a type other than ETM_ADDR_TYPE_START.
+ */
+static ssize_t addr_start_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long addr;
+       ssize_t ret = size;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &addr) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (drvdata->nr_addr_cmp == 0) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[idx] != ETM_ADDR_TYPE_START) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       config->addr_val[idx] = (u64)addr;
+       config->addr_type[idx] = ETM_ADDR_TYPE_START;
+       config->vissctlr |= BIT(idx);
+       /* SSSTATUS, bit[9] - turn on start/stop logic */
+       config->vinst_ctrl |= BIT(9);
+
+out_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return ret;
+}
+static DEVICE_ATTR_RW(addr_start);
+
+/*
+ * Print the address value of the selected comparator in hexadecimal.
+ * Returns -EPERM unless the comparator is unused (ETM_ADDR_TYPE_NONE) or
+ * configured as a stop address (ETM_ADDR_TYPE_STOP).
+ */
+static ssize_t addr_stop_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long addr;
+       u8 idx;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[idx] != ETM_ADDR_TYPE_STOP) {
+               spin_unlock(&drvdata->spinlock);
+               return -EPERM;
+       }
+       addr = (unsigned long)config->addr_val[idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", addr);
+}
+
+/*
+ * Program the selected comparator as a stop address with the parsed
+ * hexadecimal value.  On success bit[idx + 16] is set in the cached
+ * vissctlr value and bit[9] is set in the cached vinst_ctrl value.
+ *
+ * Returns @size on success; -EINVAL when the input does not parse or
+ * drvdata->nr_addr_cmp is zero; -EPERM when the comparator is in use with
+ * a type other than ETM_ADDR_TYPE_STOP.
+ */
+static ssize_t addr_stop_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long addr;
+       ssize_t ret = size;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &addr) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (drvdata->nr_addr_cmp == 0) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+       if (config->addr_type[idx] != ETM_ADDR_TYPE_NONE &&
+           config->addr_type[idx] != ETM_ADDR_TYPE_STOP) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       config->addr_val[idx] = (u64)addr;
+       config->addr_type[idx] = ETM_ADDR_TYPE_STOP;
+       config->vissctlr |= BIT(idx + 16);
+       /* SSSTATUS, bit[9] - turn on start/stop logic */
+       config->vinst_ctrl |= BIT(9);
+
+out_unlock:
+       spin_unlock(&drvdata->spinlock);
+       return ret;
+}
+static DEVICE_ATTR_RW(addr_stop);
+
+/*
+ * Print the context type of the selected address comparator as a string:
+ * "none", "ctxid", "vmid" or "all", decoded from bits[3:2] of its addr_acc
+ * entry.
+ */
+static ssize_t addr_ctxtype_show(struct device *dev,
+                                struct device_attribute *attr,
+                                char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       const char *name;
+       ssize_t len;
+       u8 idx, ctxtype;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* CONTEXTTYPE, bits[3:2] */
+       ctxtype = BMVAL(config->addr_acc[idx], 2, 3);
+       if (ctxtype == ETM_CTX_NONE)
+               name = "none";
+       else if (ctxtype == ETM_CTX_CTXID)
+               name = "ctxid";
+       else if (ctxtype == ETM_CTX_VMID)
+               name = "vmid";
+       else
+               name = "all";
+       len = scnprintf(buf, PAGE_SIZE, "%s\n", name);
+       spin_unlock(&drvdata->spinlock);
+       return len;
+}
+
+/*
+ * Set the context type, bits[3:2] of the selected comparator's addr_acc
+ * entry, from one of the words "none", "ctxid", "vmid" or "all".
+ *
+ * "ctxid" and "vmid" only take effect when drvdata->numcidc respectively
+ * drvdata->numvmidc is non-zero; "all" sets each bit only if the matching
+ * count is non-zero.  Unknown words are accepted and ignored.
+ *
+ * Returns @size, or -EINVAL for inputs of 10 or more characters or inputs
+ * without a word.
+ */
+static ssize_t addr_ctxtype_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       char word[10] = "";
+       u8 idx;
+
+       /* the length check keeps the unbounded %s below within word[] */
+       if (strlen(buf) >= sizeof(word))
+               return -EINVAL;
+       if (sscanf(buf, "%s", word) != 1)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       if (strcmp(word, "none") == 0) {
+               /* 0b00: clear both context type bits */
+               config->addr_acc[idx] &= ~(BIT(2) | BIT(3));
+       } else if (strcmp(word, "ctxid") == 0) {
+               /* 0b01 The trace unit performs a Context ID */
+               if (drvdata->numcidc) {
+                       config->addr_acc[idx] &= ~BIT(3);
+                       config->addr_acc[idx] |= BIT(2);
+               }
+       } else if (strcmp(word, "vmid") == 0) {
+               /* 0b10 The trace unit performs a VMID */
+               if (drvdata->numvmidc) {
+                       config->addr_acc[idx] |= BIT(3);
+                       config->addr_acc[idx] &= ~BIT(2);
+               }
+       } else if (strcmp(word, "all") == 0) {
+               /*
+                * 0b11 The trace unit performs a Context ID
+                * comparison and a VMID
+                */
+               if (drvdata->numcidc)
+                       config->addr_acc[idx] |= BIT(2);
+               if (drvdata->numvmidc)
+                       config->addr_acc[idx] |= BIT(3);
+       }
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(addr_ctxtype);
+
+/*
+ * Print bits[6:4] of the selected comparator's addr_acc entry, the context
+ * ID comparator selection, in hexadecimal.
+ */
+static ssize_t addr_context_show(struct device *dev,
+                                struct device_attribute *attr,
+                                char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long context;
+       u8 idx;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* context ID comparator bits[6:4] */
+       context = BMVAL(config->addr_acc[idx], 4, 6);
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", context);
+}
+
+/*
+ * Store the parsed hexadecimal value in bits[6:4] of the selected
+ * comparator's addr_acc entry.
+ *
+ * Returns -EINVAL when the input does not parse, when both
+ * drvdata->numcidc and drvdata->numvmidc are at most 1, or when the value
+ * is not below the larger of the two counts.  Returns @size on success.
+ */
+static ssize_t addr_context_store(struct device *dev,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long context, limit;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &context) != 0)
+               return -EINVAL;
+       if (drvdata->numcidc <= 1 && drvdata->numvmidc <= 1)
+               return -EINVAL;
+
+       /* the selection is bounded by the larger comparator count */
+       limit = drvdata->numcidc >= drvdata->numvmidc ?
+               drvdata->numcidc : drvdata->numvmidc;
+       if (context >= limit)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->addr_idx;
+       /* clear context ID comparator bits[6:4] */
+       config->addr_acc[idx] &= ~(BIT(4) | BIT(5) | BIT(6));
+       config->addr_acc[idx] |= context << 4;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(addr_context);
+
+/* Print the currently selected sequencer index in hexadecimal. */
+static ssize_t seq_idx_show(struct device *dev,
+                           struct device_attribute *attr,
+                           char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long idx = drvdata->config.seq_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", idx);
+}
+
+/*
+ * Select the sequencer entry that seq_event acts on.
+ *
+ * The parsed hexadecimal index must be below drvdata->nrseqstate - 1.
+ * A zero nrseqstate is rejected explicitly: the subtraction would yield
+ * -1, which compares as the largest unsigned long, so every index would
+ * otherwise pass the range check and later be used to index seq_ctrl[].
+ *
+ * Returns @size on success, -EINVAL on unparsable or out-of-range input.
+ */
+static ssize_t seq_idx_store(struct device *dev,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+       /* no sequencer states: there is nothing to index */
+       if (!drvdata->nrseqstate)
+               return -EINVAL;
+       if (val >= drvdata->nrseqstate - 1)
+               return -EINVAL;
+
+       /*
+        * Use spinlock to ensure index doesn't change while it gets
+        * dereferenced multiple times within a spinlock block elsewhere.
+        */
+       spin_lock(&drvdata->spinlock);
+       config->seq_idx = val;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(seq_idx);
+
+/* Print the cached seq_state configuration value in hexadecimal. */
+static ssize_t seq_state_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long state = drvdata->config.seq_state;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", state);
+}
+
+/*
+ * Parse a hexadecimal value and cache it as seq_state.  The value must be
+ * below drvdata->nrseqstate; otherwise, or on unparsable input, -EINVAL is
+ * returned.  Returns @size on success.
+ */
+static ssize_t seq_state_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long state;
+
+       if (kstrtoul(buf, 16, &state) != 0)
+               return -EINVAL;
+       if (state >= drvdata->nrseqstate)
+               return -EINVAL;
+
+       drvdata->config.seq_state = state;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_state);
+
+/*
+ * Print the cached seq_ctrl entry for the currently selected sequencer
+ * index in hexadecimal.
+ */
+static ssize_t seq_event_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long event;
+       u8 idx;
+
+       /* index and entry are read together so they stay consistent */
+       spin_lock(&drvdata->spinlock);
+       idx = config->seq_idx;
+       event = config->seq_ctrl[idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", event);
+}
+
+/*
+ * Parse a hexadecimal value and cache it as the seq_ctrl entry for the
+ * currently selected sequencer index.
+ *
+ * Per the ETMv4 TRCSEQEVRn layout the register carries two event fields,
+ * F in bits[7:0] and B in bits[15:8], so bits[15:0] of the input are kept;
+ * masking with 0xFF would silently drop the B field.
+ *
+ * Returns @size on success or -EINVAL if @buf is not a valid hex number.
+ */
+static ssize_t seq_event_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       u8 idx;
+       unsigned long val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       if (kstrtoul(buf, 16, &val))
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->seq_idx;
+       /* Seq control has two masks B[15:8] F[7:0] */
+       config->seq_ctrl[idx] = val & 0xFFFF;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(seq_event);
+
+/* Print the cached seq_rst configuration value in hexadecimal. */
+static ssize_t seq_reset_event_show(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long reset_event = drvdata->config.seq_rst;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", reset_event);
+}
+
+/*
+ * Parse a hexadecimal value and cache it, masked with ETMv4_EVENT_MASK, as
+ * seq_rst.  Fails with -EINVAL on unparsable input or when
+ * drvdata->nrseqstate is zero; returns @size otherwise.
+ */
+static ssize_t seq_reset_event_store(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long input;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+       if (drvdata->nrseqstate == 0)
+               return -EINVAL;
+
+       drvdata->config.seq_rst = input & ETMv4_EVENT_MASK;
+       return size;
+}
+static DEVICE_ATTR_RW(seq_reset_event);
+
+/* Print the currently selected counter index in hexadecimal. */
+static ssize_t cntr_idx_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long idx = drvdata->config.cntr_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", idx);
+}
+
+/*
+ * Select the counter that the cntrldvr, cntr_val and cntr_ctrl attributes
+ * act on.  The parsed hexadecimal index must be below drvdata->nr_cntr;
+ * otherwise, or on unparsable input, -EINVAL is returned.  Returns @size
+ * on success.
+ */
+static ssize_t cntr_idx_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long new_idx;
+
+       if (kstrtoul(buf, 16, &new_idx) != 0)
+               return -EINVAL;
+       if (new_idx >= drvdata->nr_cntr)
+               return -EINVAL;
+
+       /*
+        * Other handlers read the index and then use it several times
+        * while holding the spinlock, so the update is done under the
+        * same lock to keep the index stable for them.
+        */
+       spin_lock(&drvdata->spinlock);
+       drvdata->config.cntr_idx = new_idx;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_idx);
+
+/*
+ * Print the cached cntrldvr entry of the currently selected counter in
+ * hexadecimal.
+ */
+static ssize_t cntrldvr_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long reload;
+       u8 idx;
+
+       /* index and entry are read together so they stay consistent */
+       spin_lock(&drvdata->spinlock);
+       idx = config->cntr_idx;
+       reload = config->cntrldvr[idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", reload);
+}
+
+/*
+ * Parse a hexadecimal value and cache it as the cntrldvr entry of the
+ * currently selected counter.  Values above ETM_CNTR_MAX_VAL and
+ * unparsable input yield -EINVAL; otherwise @size is returned.
+ */
+static ssize_t cntrldvr_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long reload;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &reload) != 0)
+               return -EINVAL;
+       if (reload > ETM_CNTR_MAX_VAL)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->cntr_idx;
+       config->cntrldvr[idx] = reload;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(cntrldvr);
+
+/*
+ * Print the cached cntr_val entry of the currently selected counter in
+ * hexadecimal.
+ */
+static ssize_t cntr_val_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long count;
+       u8 idx;
+
+       /* index and entry are read together so they stay consistent */
+       spin_lock(&drvdata->spinlock);
+       idx = config->cntr_idx;
+       count = config->cntr_val[idx];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", count);
+}
+
+/*
+ * Parse a hexadecimal value and cache it as the cntr_val entry of the
+ * currently selected counter.  Values above ETM_CNTR_MAX_VAL and
+ * unparsable input yield -EINVAL; otherwise @size is returned.
+ */
+static ssize_t cntr_val_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long count;
+       u8 idx;
+
+       if (kstrtoul(buf, 16, &count) != 0)
+               return -EINVAL;
+       if (count > ETM_CNTR_MAX_VAL)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       idx = config->cntr_idx;
+       config->cntr_val[idx] = count;
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_val);
+
+/*
+ * cntr_ctrl_show - sysfs read handler for the "cntr_ctrl" attribute.
+ *
+ * Prints, in "%#lx" form, the config->cntr_ctrl[] entry selected by
+ * config->cntr_idx.  Returns the number of characters written to @buf.
+ */
+static ssize_t cntr_ctrl_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long snapshot;
+       u8 slot;
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->cntr_idx;
+       snapshot = config->cntr_ctrl[slot];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", snapshot);
+}
+
+/*
+ * cntr_ctrl_store - sysfs write handler for the "cntr_ctrl" attribute.
+ *
+ * Parses @buf as hexadecimal and saves the result, without any range
+ * check, in config->cntr_ctrl[] at the slot named by config->cntr_idx.
+ * Returns @size on success, -EINVAL when the input does not parse.
+ */
+static ssize_t cntr_ctrl_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long input;
+       u8 slot;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->cntr_idx;
+       config->cntr_ctrl[slot] = input;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(cntr_ctrl);
+
+/*
+ * res_idx_show - sysfs read handler for the "res_idx" attribute.
+ *
+ * Prints config->res_idx in "%#lx" form; no lock is taken for this
+ * single read.  Returns the number of characters written to @buf.
+ */
+static ssize_t res_idx_show(struct device *dev,
+                           struct device_attribute *attr,
+                           char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long selected = drvdata->config.res_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", selected);
+}
+
+/*
+ * res_idx_store - sysfs write handler for the "res_idx" attribute.
+ *
+ * Parses @buf as hexadecimal and makes it the current resource selector
+ * index.  Zero and anything at or above drvdata->nr_resource are refused
+ * with -EINVAL, as is input that does not parse.  Returns @size on
+ * success.
+ */
+static ssize_t res_idx_store(struct device *dev,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long wanted;
+
+       if (kstrtoul(buf, 16, &wanted) != 0)
+               return -EINVAL;
+
+       /* Resource selector pair 0 is always implemented and reserved */
+       if (!wanted || wanted >= drvdata->nr_resource)
+               return -EINVAL;
+
+       /*
+        * The index is updated under the spinlock so that code which
+        * dereferences it several times inside a locked region elsewhere
+        * never sees it change part-way through.
+        */
+       spin_lock(&drvdata->spinlock);
+       config->res_idx = wanted;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(res_idx);
+
+/*
+ * res_ctrl_show - sysfs read handler for the "res_ctrl" attribute.
+ *
+ * Prints, in "%#lx" form, the config->res_ctrl[] entry selected by
+ * config->res_idx.  Returns the number of characters written to @buf.
+ */
+static ssize_t res_ctrl_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long snapshot;
+       u8 slot;
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->res_idx;
+       snapshot = config->res_ctrl[slot];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", snapshot);
+}
+
+/*
+ * res_ctrl_store - sysfs write handler for the "res_ctrl" attribute.
+ *
+ * Parses @buf as hexadecimal and saves it in config->res_ctrl[] at the
+ * slot named by config->res_idx.  When that index is odd, bit 21 of the
+ * value is cleared before it is stored.  Returns @size on success,
+ * -EINVAL when the input does not parse.
+ */
+static ssize_t res_ctrl_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long input;
+       u8 slot;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->res_idx;
+       /* The pair inversion bit, PAIRINV bit[21], is RES0 for odd indices */
+       if (slot & 1)
+               input &= ~BIT(21);
+       config->res_ctrl[slot] = input;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(res_ctrl);
+
+/*
+ * ctxid_idx_show - sysfs read handler for the "ctxid_idx" attribute.
+ *
+ * Prints config->ctxid_idx in "%#lx" form; no lock is taken for this
+ * single read.  Returns the number of characters written to @buf.
+ */
+static ssize_t ctxid_idx_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long selected = drvdata->config.ctxid_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", selected);
+}
+
+/*
+ * ctxid_idx_store - sysfs write handler for the "ctxid_idx" attribute.
+ *
+ * Parses @buf as hexadecimal and makes it the current context ID
+ * comparator index.  Values at or above drvdata->numcidc, and input that
+ * does not parse, are refused with -EINVAL.  Returns @size on success.
+ */
+static ssize_t ctxid_idx_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long wanted;
+
+       if (kstrtoul(buf, 16, &wanted) || wanted >= drvdata->numcidc)
+               return -EINVAL;
+
+       /*
+        * The index is updated under the spinlock so that code which
+        * dereferences it several times inside a locked region elsewhere
+        * never sees it change part-way through.
+        */
+       spin_lock(&drvdata->spinlock);
+       config->ctxid_idx = wanted;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_idx);
+
+/*
+ * ctxid_pid_show - sysfs read handler for the "ctxid_pid" attribute.
+ *
+ * Prints, in "%#lx" form, the config->ctxid_vpid[] entry selected by
+ * config->ctxid_idx, i.e. the value as it was written by the user rather
+ * than the translated one kept in ctxid_pid[].  Returns the number of
+ * characters written to @buf.
+ */
+static ssize_t ctxid_pid_show(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long snapshot;
+       u8 slot;
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->ctxid_idx;
+       snapshot = (unsigned long)config->ctxid_vpid[slot];
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", snapshot);
+}
+
+/*
+ * ctxid_pid_store - sysfs write handler for the "ctxid_pid" attribute.
+ *
+ * Parses @buf as a hexadecimal vpid, converts it with
+ * coresight_vpid_to_pid() and records both forms at the slot named by
+ * config->ctxid_idx: the converted value in ctxid_pid[] and the value as
+ * written in ctxid_vpid[].  Returns @size on success, -EINVAL when
+ * context ID tracing is unavailable or the input does not parse.
+ */
+static ssize_t ctxid_pid_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long vpid, pid;
+       u8 slot;
+
+       /*
+        * Context ID tracing needs at least one ctxid comparator and a
+        * ctxid that is more than 0 bits wide.
+        */
+       if (!(drvdata->ctxid_size && drvdata->numcidc))
+               return -EINVAL;
+
+       if (kstrtoul(buf, 16, &vpid) != 0)
+               return -EINVAL;
+
+       pid = coresight_vpid_to_pid(vpid);
+
+       spin_lock(&drvdata->spinlock);
+       slot = config->ctxid_idx;
+       config->ctxid_pid[slot] = (u64)pid;
+       config->ctxid_vpid[slot] = (u64)vpid;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_pid);
+
+/*
+ * ctxid_masks_show - sysfs read handler for the "ctxid_masks" attribute.
+ *
+ * Prints config->ctxid_mask0 and config->ctxid_mask1, in that order, as
+ * two "%#lx" words separated by a space.  Both are sampled under the
+ * spinlock.  Returns the number of characters written to @buf.
+ */
+static ssize_t ctxid_masks_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long lo_mask, hi_mask;
+
+       spin_lock(&drvdata->spinlock);
+       lo_mask = config->ctxid_mask0;
+       hi_mask = config->ctxid_mask1;
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", lo_mask, hi_mask);
+}
+
+/*
+ * ctxid_masks_store - sysfs write handler for the "ctxid_masks" attribute.
+ *
+ * @buf must hold two hexadecimal words, "mask0 mask1".  Only the mask
+ * bytes that belong to implemented comparators (drvdata->numcidc of them)
+ * are kept; afterwards every byte of a comparator value whose mask bit is
+ * set gets cleared in config->ctxid_pid[].
+ *
+ * Returns @size on success, -EINVAL when context ID tracing is not
+ * available or when two values cannot be parsed from @buf.
+ */
+static ssize_t ctxid_masks_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       u8 i, j, maskbyte;
+       unsigned long val1, val2, mask;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       /*
+        * only implemented when ctxid tracing is enabled, i.e. at least one
+        * ctxid comparator is implemented and ctxid is greater than 0 bits
+        * in length
+        */
+       if (!drvdata->ctxid_size || !drvdata->numcidc)
+               return -EINVAL;
+       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       /*
+        * each byte[0..3] controls mask value applied to ctxid
+        * comparator[0..3]
+        */
+       switch (drvdata->numcidc) {
+       case 0x1:
+               /* COMP0, bits[7:0] */
+               config->ctxid_mask0 = val1 & 0xFF;
+               break;
+       case 0x2:
+               /* COMP1, bits[15:8] */
+               config->ctxid_mask0 = val1 & 0xFFFF;
+               break;
+       case 0x3:
+               /* COMP2, bits[23:16] */
+               config->ctxid_mask0 = val1 & 0xFFFFFF;
+               break;
+       case 0x4:
+                /* COMP3, bits[31:24] */
+               config->ctxid_mask0 = val1;
+               break;
+       case 0x5:
+               /* COMP4, bits[7:0] */
+               config->ctxid_mask0 = val1;
+               config->ctxid_mask1 = val2 & 0xFF;
+               break;
+       case 0x6:
+               /* COMP5, bits[15:8] */
+               config->ctxid_mask0 = val1;
+               config->ctxid_mask1 = val2 & 0xFFFF;
+               break;
+       case 0x7:
+               /* COMP6, bits[23:16] */
+               config->ctxid_mask0 = val1;
+               config->ctxid_mask1 = val2 & 0xFFFFFF;
+               break;
+       case 0x8:
+               /* COMP7, bits[31:24] */
+               config->ctxid_mask0 = val1;
+               config->ctxid_mask1 = val2;
+               break;
+       default:
+               break;
+       }
+       /*
+        * If software sets a mask bit to 1, it must program relevant byte
+        * of ctxid comparator value 0x0, otherwise behavior is unpredictable.
+        * For example, if bit[3] of ctxid_mask0 is 1, we must clear bits[31:24]
+        * (byte 3) of the ctxid comparator0 value register.
+        */
+       mask = config->ctxid_mask0;
+       for (i = 0; i < drvdata->numcidc; i++) {
+               /* mask value of corresponding ctxid comparator */
+               maskbyte = mask & ETMv4_EVENT_MASK;
+               /*
+                * each bit corresponds to a byte of respective ctxid comparator
+                * value register
+                */
+               for (j = 0; j < 8; j++) {
+                       /*
+                        * The byte mask must be built as a 64-bit constant:
+                        * the comparator value is 64 bits wide and j reaches
+                        * 7.  A plain int 0xFF would be shifted by up to 56
+                        * bits (undefined), and for j == 3 the complement
+                        * would be zero-extended and wipe bits[63:32] too.
+                        */
+                       if (maskbyte & 1)
+                               config->ctxid_pid[i] &= ~(0xFFULL << (j * 8));
+                       maskbyte >>= 1;
+               }
+               /* Select the next ctxid comparator mask value */
+               if (i == 3)
+                       /* ctxid comparators[4-7] */
+                       mask = config->ctxid_mask1;
+               else
+                       mask >>= 0x8;
+       }
+
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(ctxid_masks);
+
+/*
+ * vmid_idx_show - sysfs read handler for the "vmid_idx" attribute.
+ *
+ * Prints config->vmid_idx in "%#lx" form; no lock is taken for this
+ * single read.  Returns the number of characters written to @buf.
+ */
+static ssize_t vmid_idx_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long selected = drvdata->config.vmid_idx;
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", selected);
+}
+
+/*
+ * vmid_idx_store - sysfs write handler for the "vmid_idx" attribute.
+ *
+ * Parses @buf as hexadecimal and makes it the current VMID comparator
+ * index.  Values at or above drvdata->numvmidc, and input that does not
+ * parse, are refused with -EINVAL.  Returns @size on success.
+ */
+static ssize_t vmid_idx_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long wanted;
+
+       if (kstrtoul(buf, 16, &wanted) || wanted >= drvdata->numvmidc)
+               return -EINVAL;
+
+       /*
+        * The index is updated under the spinlock so that code which
+        * dereferences it several times inside a locked region elsewhere
+        * never sees it change part-way through.
+        */
+       spin_lock(&drvdata->spinlock);
+       config->vmid_idx = wanted;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(vmid_idx);
+
+/*
+ * vmid_val_show - sysfs read handler for the "vmid_val" attribute.
+ *
+ * Prints, in "%#lx" form, the config->vmid_val[] entry selected by
+ * config->vmid_idx.  The read is done without taking the spinlock.
+ * Returns the number of characters written to @buf.
+ */
+static ssize_t vmid_val_show(struct device *dev,
+                            struct device_attribute *attr,
+                            char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long current_val;
+
+       current_val = (unsigned long)config->vmid_val[config->vmid_idx];
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", current_val);
+}
+
+/*
+ * vmid_val_store - sysfs write handler for the "vmid_val" attribute.
+ *
+ * Parses @buf as hexadecimal and saves it in config->vmid_val[] at the
+ * slot named by config->vmid_idx.  Returns @size on success, -EINVAL
+ * when VMID tracing is unavailable or the input does not parse.
+ */
+static ssize_t vmid_val_store(struct device *dev,
+                             struct device_attribute *attr,
+                             const char *buf, size_t size)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long input;
+
+       /*
+        * VMID tracing needs at least one vmid comparator and a vmid size
+        * of at least 8 bits.
+        */
+       if (!(drvdata->vmid_size && drvdata->numvmidc))
+               return -EINVAL;
+
+       if (kstrtoul(buf, 16, &input) != 0)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+       config->vmid_val[config->vmid_idx] = (u64)input;
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(vmid_val);
+
+/*
+ * vmid_masks_show - sysfs read handler for the "vmid_masks" attribute.
+ *
+ * Prints config->vmid_mask0 and config->vmid_mask1, in that order, as
+ * two "%#lx" words separated by a space.  Both are sampled under the
+ * spinlock.  Returns the number of characters written to @buf.
+ */
+static ssize_t vmid_masks_show(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+       unsigned long lo_mask, hi_mask;
+
+       spin_lock(&drvdata->spinlock);
+       lo_mask = config->vmid_mask0;
+       hi_mask = config->vmid_mask1;
+       spin_unlock(&drvdata->spinlock);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", lo_mask, hi_mask);
+}
+
+/*
+ * vmid_masks_store - sysfs write handler for the "vmid_masks" attribute.
+ *
+ * @buf must hold two hexadecimal words, "mask0 mask1".  Only the mask
+ * bytes that belong to implemented comparators (drvdata->numvmidc of
+ * them) are kept; afterwards every byte of a comparator value whose mask
+ * bit is set gets cleared in config->vmid_val[].
+ *
+ * Returns @size on success, -EINVAL when VMID tracing is not available
+ * or when two values cannot be parsed from @buf.
+ */
+static ssize_t vmid_masks_store(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       u8 i, j, maskbyte;
+       unsigned long val1, val2, mask;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
+
+       /*
+        * only implemented when vmid tracing is enabled, i.e. at least one
+        * vmid comparator is implemented and at least 8 bit vmid size
+        */
+       if (!drvdata->vmid_size || !drvdata->numvmidc)
+               return -EINVAL;
+       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
+               return -EINVAL;
+
+       spin_lock(&drvdata->spinlock);
+
+       /*
+        * each byte[0..3] controls mask value applied to vmid
+        * comparator[0..3]
+        */
+       switch (drvdata->numvmidc) {
+       case 0x1:
+               /* COMP0, bits[7:0] */
+               config->vmid_mask0 = val1 & 0xFF;
+               break;
+       case 0x2:
+               /* COMP1, bits[15:8] */
+               config->vmid_mask0 = val1 & 0xFFFF;
+               break;
+       case 0x3:
+               /* COMP2, bits[23:16] */
+               config->vmid_mask0 = val1 & 0xFFFFFF;
+               break;
+       case 0x4:
+               /* COMP3, bits[31:24] */
+               config->vmid_mask0 = val1;
+               break;
+       case 0x5:
+               /* COMP4, bits[7:0] */
+               config->vmid_mask0 = val1;
+               config->vmid_mask1 = val2 & 0xFF;
+               break;
+       case 0x6:
+               /* COMP5, bits[15:8] */
+               config->vmid_mask0 = val1;
+               config->vmid_mask1 = val2 & 0xFFFF;
+               break;
+       case 0x7:
+               /* COMP6, bits[23:16] */
+               config->vmid_mask0 = val1;
+               config->vmid_mask1 = val2 & 0xFFFFFF;
+               break;
+       case 0x8:
+               /* COMP7, bits[31:24] */
+               config->vmid_mask0 = val1;
+               config->vmid_mask1 = val2;
+               break;
+       default:
+               break;
+       }
+
+       /*
+        * If software sets a mask bit to 1, it must program relevant byte
+        * of vmid comparator value 0x0, otherwise behavior is unpredictable.
+        * For example, if bit[3] of vmid_mask0 is 1, we must clear bits[31:24]
+        * (byte 3) of the vmid comparator0 value register.
+        */
+       mask = config->vmid_mask0;
+       for (i = 0; i < drvdata->numvmidc; i++) {
+               /* mask value of corresponding vmid comparator */
+               maskbyte = mask & ETMv4_EVENT_MASK;
+               /*
+                * each bit corresponds to a byte of respective vmid comparator
+                * value register
+                */
+               for (j = 0; j < 8; j++) {
+                       /*
+                        * The byte mask must be built as a 64-bit constant:
+                        * the comparator value is 64 bits wide and j reaches
+                        * 7.  A plain int 0xFF would be shifted by up to 56
+                        * bits (undefined), and for j == 3 the complement
+                        * would be zero-extended and wipe bits[63:32] too.
+                        */
+                       if (maskbyte & 1)
+                               config->vmid_val[i] &= ~(0xFFULL << (j * 8));
+                       maskbyte >>= 1;
+               }
+               /* Select the next vmid comparator mask value */
+               if (i == 3)
+                       /* vmid comparators[4-7] */
+                       mask = config->vmid_mask1;
+               else
+                       mask >>= 0x8;
+       }
+       spin_unlock(&drvdata->spinlock);
+       return size;
+}
+static DEVICE_ATTR_RW(vmid_masks);
+
+/*
+ * cpu_show - sysfs read handler for the read-only "cpu" attribute.
+ *
+ * Prints drvdata->cpu as a signed decimal number followed by a newline.
+ * Returns the number of characters written to @buf.
+ */
+static ssize_t cpu_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       int cpu_nr = drvdata->cpu;
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", cpu_nr);
+}
+static DEVICE_ATTR_RO(cpu);
+
+/*
+ * NULL-terminated list of the configuration attributes defined above with
+ * DEVICE_ATTR_RW()/DEVICE_ATTR_RO().  It is the .attrs member of the
+ * unnamed coresight_etmv4_group.
+ */
+static struct attribute *coresight_etmv4_attrs[] = {
+       &dev_attr_nr_pe_cmp.attr,
+       &dev_attr_nr_addr_cmp.attr,
+       &dev_attr_nr_cntr.attr,
+       &dev_attr_nr_ext_inp.attr,
+       &dev_attr_numcidc.attr,
+       &dev_attr_numvmidc.attr,
+       &dev_attr_nrseqstate.attr,
+       &dev_attr_nr_resource.attr,
+       &dev_attr_nr_ss_cmp.attr,
+       &dev_attr_reset.attr,
+       &dev_attr_mode.attr,
+       &dev_attr_pe.attr,
+       &dev_attr_event.attr,
+       &dev_attr_event_instren.attr,
+       &dev_attr_event_ts.attr,
+       &dev_attr_syncfreq.attr,
+       &dev_attr_cyc_threshold.attr,
+       &dev_attr_bb_ctrl.attr,
+       &dev_attr_event_vinst.attr,
+       &dev_attr_s_exlevel_vinst.attr,
+       &dev_attr_ns_exlevel_vinst.attr,
+       &dev_attr_addr_idx.attr,
+       &dev_attr_addr_instdatatype.attr,
+       &dev_attr_addr_single.attr,
+       &dev_attr_addr_range.attr,
+       &dev_attr_addr_start.attr,
+       &dev_attr_addr_stop.attr,
+       &dev_attr_addr_ctxtype.attr,
+       &dev_attr_addr_context.attr,
+       &dev_attr_seq_idx.attr,
+       &dev_attr_seq_state.attr,
+       &dev_attr_seq_event.attr,
+       &dev_attr_seq_reset_event.attr,
+       &dev_attr_cntr_idx.attr,
+       &dev_attr_cntrldvr.attr,
+       &dev_attr_cntr_val.attr,
+       &dev_attr_cntr_ctrl.attr,
+       &dev_attr_res_idx.attr,
+       &dev_attr_res_ctrl.attr,
+       &dev_attr_ctxid_idx.attr,
+       &dev_attr_ctxid_pid.attr,
+       &dev_attr_ctxid_masks.attr,
+       &dev_attr_vmid_idx.attr,
+       &dev_attr_vmid_val.attr,
+       &dev_attr_vmid_masks.attr,
+       &dev_attr_cpu.attr,
+       NULL,
+};
+
+/*
+ * Shorthand that binds coresight_simple_func() to struct etmv4_drvdata.
+ * Each invocation must provide a dev_attr_<name> object, since those are
+ * referenced from the attribute tables below.
+ * NOTE(review): coresight_simple_func() is defined elsewhere; presumably it
+ * emits a read-only show routine that reads the register at @offset -
+ * confirm against its definition.
+ */
+#define coresight_etm4x_simple_func(name, offset)                      \
+       coresight_simple_func(struct etmv4_drvdata, name, offset)
+
+/* Attributes listed in the "mgmt" group */
+coresight_etm4x_simple_func(trcoslsr, TRCOSLSR);
+coresight_etm4x_simple_func(trcpdcr, TRCPDCR);
+coresight_etm4x_simple_func(trcpdsr, TRCPDSR);
+coresight_etm4x_simple_func(trclsr, TRCLSR);
+coresight_etm4x_simple_func(trcconfig, TRCCONFIGR);
+coresight_etm4x_simple_func(trctraceid, TRCTRACEIDR);
+coresight_etm4x_simple_func(trcauthstatus, TRCAUTHSTATUS);
+coresight_etm4x_simple_func(trcdevid, TRCDEVID);
+coresight_etm4x_simple_func(trcdevtype, TRCDEVTYPE);
+coresight_etm4x_simple_func(trcpidr0, TRCPIDR0);
+coresight_etm4x_simple_func(trcpidr1, TRCPIDR1);
+coresight_etm4x_simple_func(trcpidr2, TRCPIDR2);
+coresight_etm4x_simple_func(trcpidr3, TRCPIDR3);
+
+/*
+ * NULL-terminated list of the attributes that make up the "mgmt" group
+ * (see coresight_etmv4_mgmt_group).
+ */
+static struct attribute *coresight_etmv4_mgmt_attrs[] = {
+       &dev_attr_trcoslsr.attr,
+       &dev_attr_trcpdcr.attr,
+       &dev_attr_trcpdsr.attr,
+       &dev_attr_trclsr.attr,
+       &dev_attr_trcconfig.attr,
+       &dev_attr_trctraceid.attr,
+       &dev_attr_trcauthstatus.attr,
+       &dev_attr_trcdevid.attr,
+       &dev_attr_trcdevtype.attr,
+       &dev_attr_trcpidr0.attr,
+       &dev_attr_trcpidr1.attr,
+       &dev_attr_trcpidr2.attr,
+       &dev_attr_trcpidr3.attr,
+       NULL,
+};
+
+/* Attributes listed in the "trcidr" group, one per TRCIDRn offset */
+coresight_etm4x_simple_func(trcidr0, TRCIDR0);
+coresight_etm4x_simple_func(trcidr1, TRCIDR1);
+coresight_etm4x_simple_func(trcidr2, TRCIDR2);
+coresight_etm4x_simple_func(trcidr3, TRCIDR3);
+coresight_etm4x_simple_func(trcidr4, TRCIDR4);
+coresight_etm4x_simple_func(trcidr5, TRCIDR5);
+/* trcidr[6,7] are reserved */
+coresight_etm4x_simple_func(trcidr8, TRCIDR8);
+coresight_etm4x_simple_func(trcidr9, TRCIDR9);
+coresight_etm4x_simple_func(trcidr10, TRCIDR10);
+coresight_etm4x_simple_func(trcidr11, TRCIDR11);
+coresight_etm4x_simple_func(trcidr12, TRCIDR12);
+coresight_etm4x_simple_func(trcidr13, TRCIDR13);
+
+/*
+ * NULL-terminated list of the attributes that make up the "trcidr" group
+ * (see coresight_etmv4_trcidr_group).
+ */
+static struct attribute *coresight_etmv4_trcidr_attrs[] = {
+       &dev_attr_trcidr0.attr,
+       &dev_attr_trcidr1.attr,
+       &dev_attr_trcidr2.attr,
+       &dev_attr_trcidr3.attr,
+       &dev_attr_trcidr4.attr,
+       &dev_attr_trcidr5.attr,
+       /* trcidr[6,7] are reserved */
+       &dev_attr_trcidr8.attr,
+       &dev_attr_trcidr9.attr,
+       &dev_attr_trcidr10.attr,
+       &dev_attr_trcidr11.attr,
+       &dev_attr_trcidr12.attr,
+       &dev_attr_trcidr13.attr,
+       NULL,
+};
+
+/* Unnamed group: carries the configuration attributes */
+static const struct attribute_group coresight_etmv4_group = {
+       .attrs = coresight_etmv4_attrs,
+};
+
+/* Group named "mgmt" */
+static const struct attribute_group coresight_etmv4_mgmt_group = {
+       .attrs = coresight_etmv4_mgmt_attrs,
+       .name = "mgmt",
+};
+
+/* Group named "trcidr" */
+static const struct attribute_group coresight_etmv4_trcidr_group = {
+       .attrs = coresight_etmv4_trcidr_attrs,
+       .name = "trcidr",
+};
+
+/*
+ * NULL-terminated list of all attribute groups defined in this file.
+ * Unlike the tables above it is not static, so it is visible to other
+ * translation units.
+ */
+const struct attribute_group *coresight_etmv4_groups[] = {
+       &coresight_etmv4_group,
+       &coresight_etmv4_mgmt_group,
+       &coresight_etmv4_trcidr_group,
+       NULL,
+};
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c b/drivers/hwtracing/coresight/coresight-etm4x.c

index a6707642bb238a68db73aec536ec4e8de6d92e39..462f0dc1575751a01d3778acbbaa30518a1b1dc0 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -15,7 +15,6 @@
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
-#include <linux/module.h>
  #include <linux/io.h>
  #include <linux/err.h>
  #include <linux/fs.h>
@@ -27,14 +26,19 @@
  #include <linux/clk.h>
  #include <linux/cpu.h>
  #include <linux/coresight.h>
+#include <linux/coresight-pmu.h>
  #include <linux/pm_wakeup.h>
  #include <linux/amba/bus.h>
  #include <linux/seq_file.h>
  #include <linux/uaccess.h>
+#include <linux/perf_event.h>
  #include <linux/pm_runtime.h>
+#include <linux/perf_event.h>
  #include <asm/sections.h>
+#include <asm/local.h>
  
  #include "coresight-etm4x.h"
+#include "coresight-etm-perf.h"
  
  static int boot_enable;
  module_param_named(boot_enable, boot_enable, int, S_IRUGO);
@@ -42,13 +46,13 @@ module_param_named(boot_enable, boot_enable, int, S_IRUGO);
  /* The number of ETMv4 currently registered */
  static int etm4_count;
  static struct etmv4_drvdata *etmdrvdata[NR_CPUS];
+static void etm4_set_default(struct etmv4_config *config);
  
-static void etm4_os_unlock(void *info)
+static void etm4_os_unlock(struct etmv4_drvdata *drvdata)
  {
-       struct etmv4_drvdata *drvdata = (struct etmv4_drvdata *)info;
-
         /* Writing any value to ETMOSLAR unlocks the trace registers */
         writel_relaxed(0x0, drvdata->base + TRCOSLAR);
+       drvdata->os_unlock = true;
         isb();
  }
  
@@ -63,16 +67,22 @@ static bool etm4_arch_supported(u8 arch)
         return true;
  }
  
+/*
+ * etm4_cpu_id - return the cpu field of the driver data attached to the
+ * parent device of @csdev.
+ */
+static int etm4_cpu_id(struct coresight_device *csdev)
+{
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       return drvdata->cpu;
+}
+
  static int etm4_trace_id(struct coresight_device *csdev)
  {
         struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
         unsigned long flags;
         int trace_id = -1;
  
-       if (!drvdata->enable)
+       if (!local_read(&drvdata->mode))
                 return drvdata->trcid;
  
-       pm_runtime_get_sync(drvdata->dev);
         spin_lock_irqsave(&drvdata->spinlock, flags);
  
         CS_UNLOCK(drvdata->base);
@@ -81,7 +91,6 @@ static int etm4_trace_id(struct coresight_device *csdev)
         CS_LOCK(drvdata->base);
  
         spin_unlock_irqrestore(&drvdata->spinlock, flags);
-       pm_runtime_put(drvdata->dev);
  
         return trace_id;
  }
@@ -90,6 +99,7 @@ static void etm4_enable_hw(void *info)
  {
         int i;
         struct etmv4_drvdata *drvdata = info;
+       struct etmv4_config *config = &drvdata->config;
  
         CS_UNLOCK(drvdata->base);
  
@@ -104,2200 +114,288 @@ static void etm4_enable_hw(void *info)
                         "timeout observed when probing at offset %#x\n",
                         TRCSTATR);
  
-       writel_relaxed(drvdata->pe_sel, drvdata->base + TRCPROCSELR);
-       writel_relaxed(drvdata->cfg, drvdata->base + TRCCONFIGR);
+       writel_relaxed(config->pe_sel, drvdata->base + TRCPROCSELR);
+       writel_relaxed(config->cfg, drvdata->base + TRCCONFIGR);
         /* nothing specific implemented */
         writel_relaxed(0x0, drvdata->base + TRCAUXCTLR);
-       writel_relaxed(drvdata->eventctrl0, drvdata->base + TRCEVENTCTL0R);
-       writel_relaxed(drvdata->eventctrl1, drvdata->base + TRCEVENTCTL1R);
-       writel_relaxed(drvdata->stall_ctrl, drvdata->base + TRCSTALLCTLR);
-       writel_relaxed(drvdata->ts_ctrl, drvdata->base + TRCTSCTLR);
-       writel_relaxed(drvdata->syncfreq, drvdata->base + TRCSYNCPR);
-       writel_relaxed(drvdata->ccctlr, drvdata->base + TRCCCCTLR);
-       writel_relaxed(drvdata->bb_ctrl, drvdata->base + TRCBBCTLR);
+       writel_relaxed(config->eventctrl0, drvdata->base + TRCEVENTCTL0R);
+       writel_relaxed(config->eventctrl1, drvdata->base + TRCEVENTCTL1R);
+       writel_relaxed(config->stall_ctrl, drvdata->base + TRCSTALLCTLR);
+       writel_relaxed(config->ts_ctrl, drvdata->base + TRCTSCTLR);
+       writel_relaxed(config->syncfreq, drvdata->base + TRCSYNCPR);
+       writel_relaxed(config->ccctlr, drvdata->base + TRCCCCTLR);
+       writel_relaxed(config->bb_ctrl, drvdata->base + TRCBBCTLR);
         writel_relaxed(drvdata->trcid, drvdata->base + TRCTRACEIDR);
-       writel_relaxed(drvdata->vinst_ctrl, drvdata->base + TRCVICTLR);
-       writel_relaxed(drvdata->viiectlr, drvdata->base + TRCVIIECTLR);
-       writel_relaxed(drvdata->vissctlr,
+       writel_relaxed(config->vinst_ctrl, drvdata->base + TRCVICTLR);
+       writel_relaxed(config->viiectlr, drvdata->base + TRCVIIECTLR);
+       writel_relaxed(config->vissctlr,
                        drvdata->base + TRCVISSCTLR);
-       writel_relaxed(drvdata->vipcssctlr,
+       writel_relaxed(config->vipcssctlr,
                        drvdata->base + TRCVIPCSSCTLR);
         for (i = 0; i < drvdata->nrseqstate - 1; i++)
-               writel_relaxed(drvdata->seq_ctrl[i],
+               writel_relaxed(config->seq_ctrl[i],
                                drvdata->base + TRCSEQEVRn(i));
-       writel_relaxed(drvdata->seq_rst, drvdata->base + TRCSEQRSTEVR);
-       writel_relaxed(drvdata->seq_state, drvdata->base + TRCSEQSTR);
-       writel_relaxed(drvdata->ext_inp, drvdata->base + TRCEXTINSELR);
+       writel_relaxed(config->seq_rst, drvdata->base + TRCSEQRSTEVR);
+       writel_relaxed(config->seq_state, drvdata->base + TRCSEQSTR);
+       writel_relaxed(config->ext_inp, drvdata->base + TRCEXTINSELR);
         for (i = 0; i < drvdata->nr_cntr; i++) {
-               writel_relaxed(drvdata->cntrldvr[i],
+               writel_relaxed(config->cntrldvr[i],
                                drvdata->base + TRCCNTRLDVRn(i));
-               writel_relaxed(drvdata->cntr_ctrl[i],
+               writel_relaxed(config->cntr_ctrl[i],
                                drvdata->base + TRCCNTCTLRn(i));
-               writel_relaxed(drvdata->cntr_val[i],
+               writel_relaxed(config->cntr_val[i],
                                drvdata->base + TRCCNTVRn(i));
         }
  
         /* Resource selector pair 0 is always implemented and reserved */
-       for (i = 2; i < drvdata->nr_resource * 2; i++)
-               writel_relaxed(drvdata->res_ctrl[i],
-                              drvdata->base + TRCRSCTLRn(i));
-
-       for (i = 0; i < drvdata->nr_ss_cmp; i++) {
-               writel_relaxed(drvdata->ss_ctrl[i],
-                              drvdata->base + TRCSSCCRn(i));
-               writel_relaxed(drvdata->ss_status[i],
-                              drvdata->base + TRCSSCSRn(i));
-               writel_relaxed(drvdata->ss_pe_cmp[i],
-                              drvdata->base + TRCSSPCICRn(i));
-       }
-       for (i = 0; i < drvdata->nr_addr_cmp; i++) {
-               writeq_relaxed(drvdata->addr_val[i],
-                              drvdata->base + TRCACVRn(i));
-               writeq_relaxed(drvdata->addr_acc[i],
-                              drvdata->base + TRCACATRn(i));
-       }
-       for (i = 0; i < drvdata->numcidc; i++)
-               writeq_relaxed(drvdata->ctxid_pid[i],
-                              drvdata->base + TRCCIDCVRn(i));
-       writel_relaxed(drvdata->ctxid_mask0, drvdata->base + TRCCIDCCTLR0);
-       writel_relaxed(drvdata->ctxid_mask1, drvdata->base + TRCCIDCCTLR1);
-
-       for (i = 0; i < drvdata->numvmidc; i++)
-               writeq_relaxed(drvdata->vmid_val[i],
-                              drvdata->base + TRCVMIDCVRn(i));
-       writel_relaxed(drvdata->vmid_mask0, drvdata->base + TRCVMIDCCTLR0);
-       writel_relaxed(drvdata->vmid_mask1, drvdata->base + TRCVMIDCCTLR1);
-
-       /* Enable the trace unit */
-       writel_relaxed(1, drvdata->base + TRCPRGCTLR);
-
-       /* wait for TRCSTATR.IDLE to go back down to '0' */
-       if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
-               dev_err(drvdata->dev,
-                       "timeout observed when probing at offset %#x\n",
-                       TRCSTATR);
-
-       CS_LOCK(drvdata->base);
-
-       dev_dbg(drvdata->dev, "cpu: %d enable smp call done\n", drvdata->cpu);
-}
-
-static int etm4_enable(struct coresight_device *csdev)
-{
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-       int ret;
-
-       pm_runtime_get_sync(drvdata->dev);
-       spin_lock(&drvdata->spinlock);
-
-       /*
-        * Executing etm4_enable_hw on the cpu whose ETM is being enabled
-        * ensures that register writes occur when cpu is powered.
-        */
-       ret = smp_call_function_single(drvdata->cpu,
-                                      etm4_enable_hw, drvdata, 1);
-       if (ret)
-               goto err;
-       drvdata->enable = true;
-       drvdata->sticky_enable = true;
-
-       spin_unlock(&drvdata->spinlock);
-
-       dev_info(drvdata->dev, "ETM tracing enabled\n");
-       return 0;
-err:
-       spin_unlock(&drvdata->spinlock);
-       pm_runtime_put(drvdata->dev);
-       return ret;
-}
-
-static void etm4_disable_hw(void *info)
-{
-       u32 control;
-       struct etmv4_drvdata *drvdata = info;
-
-       CS_UNLOCK(drvdata->base);
-
-       control = readl_relaxed(drvdata->base + TRCPRGCTLR);
-
-       /* EN, bit[0] Trace unit enable bit */
-       control &= ~0x1;
-
-       /* make sure everything completes before disabling */
-       mb();
-       isb();
-       writel_relaxed(control, drvdata->base + TRCPRGCTLR);
-
-       CS_LOCK(drvdata->base);
-
-       dev_dbg(drvdata->dev, "cpu: %d disable smp call done\n", drvdata->cpu);
-}
-
-static void etm4_disable(struct coresight_device *csdev)
-{
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       /*
-        * Taking hotplug lock here protects from clocks getting disabled
-        * with tracing being left on (crash scenario) if user disable occurs
-        * after cpu online mask indicates the cpu is offline but before the
-        * DYING hotplug callback is serviced by the ETM driver.
-        */
-       get_online_cpus();
-       spin_lock(&drvdata->spinlock);
-
-       /*
-        * Executing etm4_disable_hw on the cpu whose ETM is being disabled
-        * ensures that register writes occur when cpu is powered.
-        */
-       smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1);
-       drvdata->enable = false;
-
-       spin_unlock(&drvdata->spinlock);
-       put_online_cpus();
-
-       pm_runtime_put(drvdata->dev);
-
-       dev_info(drvdata->dev, "ETM tracing disabled\n");
-}
-
-static const struct coresight_ops_source etm4_source_ops = {
-       .trace_id       = etm4_trace_id,
-       .enable         = etm4_enable,
-       .disable        = etm4_disable,
-};
-
-static const struct coresight_ops etm4_cs_ops = {
-       .source_ops     = &etm4_source_ops,
-};
-
-static int etm4_set_mode_exclude(struct etmv4_drvdata *drvdata, bool exclude)
-{
-       u8 idx = drvdata->addr_idx;
-
-       /*
-        * TRCACATRn.TYPE bit[1:0]: type of comparison
-        * the trace unit performs
-        */
-       if (BMVAL(drvdata->addr_acc[idx], 0, 1) == ETM_INSTR_ADDR) {
-               if (idx % 2 != 0)
-                       return -EINVAL;
-
-               /*
-                * We are performing instruction address comparison. Set the
-                * relevant bit of ViewInst Include/Exclude Control register
-                * for corresponding address comparator pair.
-                */
-               if (drvdata->addr_type[idx] != ETM_ADDR_TYPE_RANGE ||
-                   drvdata->addr_type[idx + 1] != ETM_ADDR_TYPE_RANGE)
-                       return -EINVAL;
-
-               if (exclude == true) {
-                       /*
-                        * Set exclude bit and unset the include bit
-                        * corresponding to comparator pair
-                        */
-                       drvdata->viiectlr |= BIT(idx / 2 + 16);
-                       drvdata->viiectlr &= ~BIT(idx / 2);
-               } else {
-                       /*
-                        * Set include bit and unset exclude bit
-                        * corresponding to comparator pair
-                        */
-                       drvdata->viiectlr |= BIT(idx / 2);
-                       drvdata->viiectlr &= ~BIT(idx / 2 + 16);
-               }
-       }
-       return 0;
-}
-
-static ssize_t nr_pe_cmp_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_pe_cmp;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_pe_cmp);
-
-static ssize_t nr_addr_cmp_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_addr_cmp;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_addr_cmp);
-
-static ssize_t nr_cntr_show(struct device *dev,
-                           struct device_attribute *attr,
-                           char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_cntr;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_cntr);
-
-static ssize_t nr_ext_inp_show(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_ext_inp;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_ext_inp);
-
-static ssize_t numcidc_show(struct device *dev,
-                           struct device_attribute *attr,
-                           char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->numcidc;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(numcidc);
-
-static ssize_t numvmidc_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->numvmidc;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(numvmidc);
-
-static ssize_t nrseqstate_show(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nrseqstate;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nrseqstate);
-
-static ssize_t nr_resource_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_resource;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_resource);
-
-static ssize_t nr_ss_cmp_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->nr_ss_cmp;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-static DEVICE_ATTR_RO(nr_ss_cmp);
-
-static ssize_t reset_store(struct device *dev,
-                          struct device_attribute *attr,
-                          const char *buf, size_t size)
-{
-       int i;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       if (val)
-               drvdata->mode = 0x0;
-
-       /* Disable data tracing: do not trace load and store data transfers */
-       drvdata->mode &= ~(ETM_MODE_LOAD | ETM_MODE_STORE);
-       drvdata->cfg &= ~(BIT(1) | BIT(2));
-
-       /* Disable data value and data address tracing */
-       drvdata->mode &= ~(ETM_MODE_DATA_TRACE_ADDR |
-                          ETM_MODE_DATA_TRACE_VAL);
-       drvdata->cfg &= ~(BIT(16) | BIT(17));
-
-       /* Disable all events tracing */
-       drvdata->eventctrl0 = 0x0;
-       drvdata->eventctrl1 = 0x0;
-
-       /* Disable timestamp event */
-       drvdata->ts_ctrl = 0x0;
-
-       /* Disable stalling */
-       drvdata->stall_ctrl = 0x0;
-
-       /* Reset trace synchronization period  to 2^8 = 256 bytes*/
-       if (drvdata->syncpr == false)
-               drvdata->syncfreq = 0x8;
-
-       /*
-        * Enable ViewInst to trace everything with start-stop logic in
-        * started state. ARM recommends start-stop logic is set before
-        * each trace run.
-        */
-       drvdata->vinst_ctrl |= BIT(0);
-       if (drvdata->nr_addr_cmp == true) {
-               drvdata->mode |= ETM_MODE_VIEWINST_STARTSTOP;
-               /* SSSTATUS, bit[9] */
-               drvdata->vinst_ctrl |= BIT(9);
-       }
-
-       /* No address range filtering for ViewInst */
-       drvdata->viiectlr = 0x0;
-
-       /* No start-stop filtering for ViewInst */
-       drvdata->vissctlr = 0x0;
-
-       /* Disable seq events */
-       for (i = 0; i < drvdata->nrseqstate-1; i++)
-               drvdata->seq_ctrl[i] = 0x0;
-       drvdata->seq_rst = 0x0;
-       drvdata->seq_state = 0x0;
-
-       /* Disable external input events */
-       drvdata->ext_inp = 0x0;
-
-       drvdata->cntr_idx = 0x0;
-       for (i = 0; i < drvdata->nr_cntr; i++) {
-               drvdata->cntrldvr[i] = 0x0;
-               drvdata->cntr_ctrl[i] = 0x0;
-               drvdata->cntr_val[i] = 0x0;
-       }
-
-       /* Resource selector pair 0 is always implemented and reserved */
-       drvdata->res_idx = 0x2;
-       for (i = 2; i < drvdata->nr_resource * 2; i++)
-               drvdata->res_ctrl[i] = 0x0;
-
-       for (i = 0; i < drvdata->nr_ss_cmp; i++) {
-               drvdata->ss_ctrl[i] = 0x0;
-               drvdata->ss_pe_cmp[i] = 0x0;
-       }
-
-       drvdata->addr_idx = 0x0;
-       for (i = 0; i < drvdata->nr_addr_cmp * 2; i++) {
-               drvdata->addr_val[i] = 0x0;
-               drvdata->addr_acc[i] = 0x0;
-               drvdata->addr_type[i] = ETM_ADDR_TYPE_NONE;
-       }
-
-       drvdata->ctxid_idx = 0x0;
-       for (i = 0; i < drvdata->numcidc; i++) {
-               drvdata->ctxid_pid[i] = 0x0;
-               drvdata->ctxid_vpid[i] = 0x0;
-       }
-
-       drvdata->ctxid_mask0 = 0x0;
-       drvdata->ctxid_mask1 = 0x0;
-
-       drvdata->vmid_idx = 0x0;
-       for (i = 0; i < drvdata->numvmidc; i++)
-               drvdata->vmid_val[i] = 0x0;
-       drvdata->vmid_mask0 = 0x0;
-       drvdata->vmid_mask1 = 0x0;
-
-       drvdata->trcid = drvdata->cpu + 1;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_WO(reset);
-
-static ssize_t mode_show(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->mode;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t mode_store(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf, size_t size)
-{
-       unsigned long val, mode;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       drvdata->mode = val & ETMv4_MODE_ALL;
-
-       if (drvdata->mode & ETM_MODE_EXCLUDE)
-               etm4_set_mode_exclude(drvdata, true);
-       else
-               etm4_set_mode_exclude(drvdata, false);
-
-       if (drvdata->instrp0 == true) {
-               /* start by clearing instruction P0 field */
-               drvdata->cfg  &= ~(BIT(1) | BIT(2));
-               if (drvdata->mode & ETM_MODE_LOAD)
-                       /* 0b01 Trace load instructions as P0 instructions */
-                       drvdata->cfg  |= BIT(1);
-               if (drvdata->mode & ETM_MODE_STORE)
-                       /* 0b10 Trace store instructions as P0 instructions */
-                       drvdata->cfg  |= BIT(2);
-               if (drvdata->mode & ETM_MODE_LOAD_STORE)
-                       /*
-                        * 0b11 Trace load and store instructions
-                        * as P0 instructions
-                        */
-                       drvdata->cfg  |= BIT(1) | BIT(2);
-       }
-
-       /* bit[3], Branch broadcast mode */
-       if ((drvdata->mode & ETM_MODE_BB) && (drvdata->trcbb == true))
-               drvdata->cfg |= BIT(3);
-       else
-               drvdata->cfg &= ~BIT(3);
-
-       /* bit[4], Cycle counting instruction trace bit */
-       if ((drvdata->mode & ETMv4_MODE_CYCACC) &&
-               (drvdata->trccci == true))
-               drvdata->cfg |= BIT(4);
-       else
-               drvdata->cfg &= ~BIT(4);
-
-       /* bit[6], Context ID tracing bit */
-       if ((drvdata->mode & ETMv4_MODE_CTXID) && (drvdata->ctxid_size))
-               drvdata->cfg |= BIT(6);
-       else
-               drvdata->cfg &= ~BIT(6);
-
-       if ((drvdata->mode & ETM_MODE_VMID) && (drvdata->vmid_size))
-               drvdata->cfg |= BIT(7);
-       else
-               drvdata->cfg &= ~BIT(7);
-
-       /* bits[10:8], Conditional instruction tracing bit */
-       mode = ETM_MODE_COND(drvdata->mode);
-       if (drvdata->trccond == true) {
-               drvdata->cfg &= ~(BIT(8) | BIT(9) | BIT(10));
-               drvdata->cfg |= mode << 8;
-       }
-
-       /* bit[11], Global timestamp tracing bit */
-       if ((drvdata->mode & ETMv4_MODE_TIMESTAMP) && (drvdata->ts_size))
-               drvdata->cfg |= BIT(11);
-       else
-               drvdata->cfg &= ~BIT(11);
-
-       /* bit[12], Return stack enable bit */
-       if ((drvdata->mode & ETM_MODE_RETURNSTACK) &&
-               (drvdata->retstack == true))
-               drvdata->cfg |= BIT(12);
-       else
-               drvdata->cfg &= ~BIT(12);
-
-       /* bits[14:13], Q element enable field */
-       mode = ETM_MODE_QELEM(drvdata->mode);
-       /* start by clearing QE bits */
-       drvdata->cfg &= ~(BIT(13) | BIT(14));
-       /* if supported, Q elements with instruction counts are enabled */
-       if ((mode & BIT(0)) && (drvdata->q_support & BIT(0)))
-               drvdata->cfg |= BIT(13);
-       /*
-        * if supported, Q elements with and without instruction
-        * counts are enabled
-        */
-       if ((mode & BIT(1)) && (drvdata->q_support & BIT(1)))
-               drvdata->cfg |= BIT(14);
-
-       /* bit[11], AMBA Trace Bus (ATB) trigger enable bit */
-       if ((drvdata->mode & ETM_MODE_ATB_TRIGGER) &&
-           (drvdata->atbtrig == true))
-               drvdata->eventctrl1 |= BIT(11);
-       else
-               drvdata->eventctrl1 &= ~BIT(11);
-
-       /* bit[12], Low-power state behavior override bit */
-       if ((drvdata->mode & ETM_MODE_LPOVERRIDE) &&
-           (drvdata->lpoverride == true))
-               drvdata->eventctrl1 |= BIT(12);
-       else
-               drvdata->eventctrl1 &= ~BIT(12);
-
-       /* bit[8], Instruction stall bit */
-       if (drvdata->mode & ETM_MODE_ISTALL_EN)
-               drvdata->stall_ctrl |= BIT(8);
-       else
-               drvdata->stall_ctrl &= ~BIT(8);
-
-       /* bit[10], Prioritize instruction trace bit */
-       if (drvdata->mode & ETM_MODE_INSTPRIO)
-               drvdata->stall_ctrl |= BIT(10);
-       else
-               drvdata->stall_ctrl &= ~BIT(10);
-
-       /* bit[13], Trace overflow prevention bit */
-       if ((drvdata->mode & ETM_MODE_NOOVERFLOW) &&
-               (drvdata->nooverflow == true))
-               drvdata->stall_ctrl |= BIT(13);
-       else
-               drvdata->stall_ctrl &= ~BIT(13);
-
-       /* bit[9] Start/stop logic control bit */
-       if (drvdata->mode & ETM_MODE_VIEWINST_STARTSTOP)
-               drvdata->vinst_ctrl |= BIT(9);
-       else
-               drvdata->vinst_ctrl &= ~BIT(9);
-
-       /* bit[10], Whether a trace unit must trace a Reset exception */
-       if (drvdata->mode & ETM_MODE_TRACE_RESET)
-               drvdata->vinst_ctrl |= BIT(10);
-       else
-               drvdata->vinst_ctrl &= ~BIT(10);
-
-       /* bit[11], Whether a trace unit must trace a system error exception */
-       if ((drvdata->mode & ETM_MODE_TRACE_ERR) &&
-               (drvdata->trc_error == true))
-               drvdata->vinst_ctrl |= BIT(11);
-       else
-               drvdata->vinst_ctrl &= ~BIT(11);
-
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(mode);
-
-static ssize_t pe_show(struct device *dev,
-                      struct device_attribute *attr,
-                      char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->pe_sel;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t pe_store(struct device *dev,
-                       struct device_attribute *attr,
-                       const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       if (val > drvdata->nr_pe) {
-               spin_unlock(&drvdata->spinlock);
-               return -EINVAL;
-       }
-
-       drvdata->pe_sel = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(pe);
-
-static ssize_t event_show(struct device *dev,
-                         struct device_attribute *attr,
-                         char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->eventctrl0;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t event_store(struct device *dev,
-                          struct device_attribute *attr,
-                          const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       switch (drvdata->nr_event) {
-       case 0x0:
-               /* EVENT0, bits[7:0] */
-               drvdata->eventctrl0 = val & 0xFF;
-               break;
-       case 0x1:
-                /* EVENT1, bits[15:8] */
-               drvdata->eventctrl0 = val & 0xFFFF;
-               break;
-       case 0x2:
-               /* EVENT2, bits[23:16] */
-               drvdata->eventctrl0 = val & 0xFFFFFF;
-               break;
-       case 0x3:
-               /* EVENT3, bits[31:24] */
-               drvdata->eventctrl0 = val;
-               break;
-       default:
-               break;
-       }
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(event);
-
-static ssize_t event_instren_show(struct device *dev,
-                                 struct device_attribute *attr,
-                                 char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = BMVAL(drvdata->eventctrl1, 0, 3);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t event_instren_store(struct device *dev,
-                                  struct device_attribute *attr,
-                                  const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       /* start by clearing all instruction event enable bits */
-       drvdata->eventctrl1 &= ~(BIT(0) | BIT(1) | BIT(2) | BIT(3));
-       switch (drvdata->nr_event) {
-       case 0x0:
-               /* generate Event element for event 1 */
-               drvdata->eventctrl1 |= val & BIT(1);
-               break;
-       case 0x1:
-               /* generate Event element for event 1 and 2 */
-               drvdata->eventctrl1 |= val & (BIT(0) | BIT(1));
-               break;
-       case 0x2:
-               /* generate Event element for event 1, 2 and 3 */
-               drvdata->eventctrl1 |= val & (BIT(0) | BIT(1) | BIT(2));
-               break;
-       case 0x3:
-               /* generate Event element for all 4 events */
-               drvdata->eventctrl1 |= val & 0xF;
-               break;
-       default:
-               break;
-       }
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(event_instren);
-
-static ssize_t event_ts_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->ts_ctrl;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t event_ts_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (!drvdata->ts_size)
-               return -EINVAL;
-
-       drvdata->ts_ctrl = val & ETMv4_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(event_ts);
-
-static ssize_t syncfreq_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->syncfreq;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t syncfreq_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (drvdata->syncpr == true)
-               return -EINVAL;
-
-       drvdata->syncfreq = val & ETMv4_SYNC_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(syncfreq);
-
-static ssize_t cyc_threshold_show(struct device *dev,
-                                 struct device_attribute *attr,
-                                 char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->ccctlr;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t cyc_threshold_store(struct device *dev,
-                                  struct device_attribute *attr,
-                                  const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val < drvdata->ccitmin)
-               return -EINVAL;
-
-       drvdata->ccctlr = val & ETM_CYC_THRESHOLD_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(cyc_threshold);
-
-static ssize_t bb_ctrl_show(struct device *dev,
-                           struct device_attribute *attr,
-                           char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->bb_ctrl;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t bb_ctrl_store(struct device *dev,
-                            struct device_attribute *attr,
-                            const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (drvdata->trcbb == false)
-               return -EINVAL;
-       if (!drvdata->nr_addr_cmp)
-               return -EINVAL;
-       /*
-        * Bit[7:0] selects which address range comparator is used for
-        * branch broadcast control.
-        */
-       if (BMVAL(val, 0, 7) > drvdata->nr_addr_cmp)
-               return -EINVAL;
-
-       drvdata->bb_ctrl = val;
-       return size;
-}
-static DEVICE_ATTR_RW(bb_ctrl);
-
-static ssize_t event_vinst_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->vinst_ctrl & ETMv4_EVENT_MASK;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t event_vinst_store(struct device *dev,
-                                struct device_attribute *attr,
-                                const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       val &= ETMv4_EVENT_MASK;
-       drvdata->vinst_ctrl &= ~ETMv4_EVENT_MASK;
-       drvdata->vinst_ctrl |= val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(event_vinst);
-
-static ssize_t s_exlevel_vinst_show(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = BMVAL(drvdata->vinst_ctrl, 16, 19);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t s_exlevel_vinst_store(struct device *dev,
-                                    struct device_attribute *attr,
-                                    const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       /* clear all EXLEVEL_S bits (bit[18] is never implemented) */
-       drvdata->vinst_ctrl &= ~(BIT(16) | BIT(17) | BIT(19));
-       /* enable instruction tracing for corresponding exception level */
-       val &= drvdata->s_ex_level;
-       drvdata->vinst_ctrl |= (val << 16);
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(s_exlevel_vinst);
-
-static ssize_t ns_exlevel_vinst_show(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       /* EXLEVEL_NS, bits[23:20] */
-       val = BMVAL(drvdata->vinst_ctrl, 20, 23);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t ns_exlevel_vinst_store(struct device *dev,
-                                     struct device_attribute *attr,
-                                     const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       /* clear EXLEVEL_NS bits (bit[23] is never implemented */
-       drvdata->vinst_ctrl &= ~(BIT(20) | BIT(21) | BIT(22));
-       /* enable instruction tracing for corresponding exception level */
-       val &= drvdata->ns_ex_level;
-       drvdata->vinst_ctrl |= (val << 20);
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(ns_exlevel_vinst);
-
-static ssize_t addr_idx_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->addr_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t addr_idx_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->nr_addr_cmp * 2)
-               return -EINVAL;
-
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->addr_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_idx);
-
-static ssize_t addr_instdatatype_show(struct device *dev,
-                                     struct device_attribute *attr,
-                                     char *buf)
-{
-       ssize_t len;
-       u8 val, idx;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       val = BMVAL(drvdata->addr_acc[idx], 0, 1);
-       len = scnprintf(buf, PAGE_SIZE, "%s\n",
-                       val == ETM_INSTR_ADDR ? "instr" :
-                       (val == ETM_DATA_LOAD_ADDR ? "data_load" :
-                       (val == ETM_DATA_STORE_ADDR ? "data_store" :
-                       "data_load_store")));
-       spin_unlock(&drvdata->spinlock);
-       return len;
-}
-
-static ssize_t addr_instdatatype_store(struct device *dev,
-                                      struct device_attribute *attr,
-                                      const char *buf, size_t size)
-{
-       u8 idx;
-       char str[20] = "";
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (strlen(buf) >= 20)
-               return -EINVAL;
-       if (sscanf(buf, "%s", str) != 1)
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!strcmp(str, "instr"))
-               /* TYPE, bits[1:0] */
-               drvdata->addr_acc[idx] &= ~(BIT(0) | BIT(1));
-
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_instdatatype);
-
-static ssize_t addr_single_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       idx = drvdata->addr_idx;
-       spin_lock(&drvdata->spinlock);
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_SINGLE)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-       val = (unsigned long)drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t addr_single_store(struct device *dev,
-                                struct device_attribute *attr,
-                                const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_SINGLE)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = (u64)val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_SINGLE;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_single);
-
-static ssize_t addr_range_show(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       u8 idx;
-       unsigned long val1, val2;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (idx % 2 != 0) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-       if (!((drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE) ||
-             (drvdata->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE))) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val1 = (unsigned long)drvdata->addr_val[idx];
-       val2 = (unsigned long)drvdata->addr_val[idx + 1];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", val1, val2);
-}
-
-static ssize_t addr_range_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val1, val2;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
-               return -EINVAL;
-       /* lower address comparator cannot have a higher address value */
-       if (val1 > val2)
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (idx % 2 != 0) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       if (!((drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_NONE) ||
-             (drvdata->addr_type[idx] == ETM_ADDR_TYPE_RANGE &&
-              drvdata->addr_type[idx + 1] == ETM_ADDR_TYPE_RANGE))) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = (u64)val1;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_RANGE;
-       drvdata->addr_val[idx + 1] = (u64)val2;
-       drvdata->addr_type[idx + 1] = ETM_ADDR_TYPE_RANGE;
-       /*
-        * Program include or exclude control bits for vinst or vdata
-        * whenever we change addr comparators to ETM_ADDR_TYPE_RANGE
-        */
-       if (drvdata->mode & ETM_MODE_EXCLUDE)
-               etm4_set_mode_exclude(drvdata, true);
-       else
-               etm4_set_mode_exclude(drvdata, false);
-
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_range);
-
-static ssize_t addr_start_show(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_START)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val = (unsigned long)drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t addr_start_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!drvdata->nr_addr_cmp) {
-               spin_unlock(&drvdata->spinlock);
-               return -EINVAL;
-       }
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_START)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = (u64)val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_START;
-       drvdata->vissctlr |= BIT(idx);
-       /* SSSTATUS, bit[9] - turn on start/stop logic */
-       drvdata->vinst_ctrl |= BIT(9);
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_start);
-
-static ssize_t addr_stop_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-             drvdata->addr_type[idx] == ETM_ADDR_TYPE_STOP)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       val = (unsigned long)drvdata->addr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t addr_stop_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!drvdata->nr_addr_cmp) {
-               spin_unlock(&drvdata->spinlock);
-               return -EINVAL;
-       }
-       if (!(drvdata->addr_type[idx] == ETM_ADDR_TYPE_NONE ||
-              drvdata->addr_type[idx] == ETM_ADDR_TYPE_STOP)) {
-               spin_unlock(&drvdata->spinlock);
-               return -EPERM;
-       }
-
-       drvdata->addr_val[idx] = (u64)val;
-       drvdata->addr_type[idx] = ETM_ADDR_TYPE_STOP;
-       drvdata->vissctlr |= BIT(idx + 16);
-       /* SSSTATUS, bit[9] - turn on start/stop logic */
-       drvdata->vinst_ctrl |= BIT(9);
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_stop);
-
-static ssize_t addr_ctxtype_show(struct device *dev,
-                                struct device_attribute *attr,
-                                char *buf)
-{
-       ssize_t len;
-       u8 idx, val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       /* CONTEXTTYPE, bits[3:2] */
-       val = BMVAL(drvdata->addr_acc[idx], 2, 3);
-       len = scnprintf(buf, PAGE_SIZE, "%s\n", val == ETM_CTX_NONE ? "none" :
-                       (val == ETM_CTX_CTXID ? "ctxid" :
-                       (val == ETM_CTX_VMID ? "vmid" : "all")));
-       spin_unlock(&drvdata->spinlock);
-       return len;
-}
-
-static ssize_t addr_ctxtype_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       u8 idx;
-       char str[10] = "";
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (strlen(buf) >= 10)
-               return -EINVAL;
-       if (sscanf(buf, "%s", str) != 1)
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       if (!strcmp(str, "none"))
-               /* start by clearing context type bits */
-               drvdata->addr_acc[idx] &= ~(BIT(2) | BIT(3));
-       else if (!strcmp(str, "ctxid")) {
-               /* 0b01 The trace unit performs a Context ID */
-               if (drvdata->numcidc) {
-                       drvdata->addr_acc[idx] |= BIT(2);
-                       drvdata->addr_acc[idx] &= ~BIT(3);
-               }
-       } else if (!strcmp(str, "vmid")) {
-               /* 0b10 The trace unit performs a VMID */
-               if (drvdata->numvmidc) {
-                       drvdata->addr_acc[idx] &= ~BIT(2);
-                       drvdata->addr_acc[idx] |= BIT(3);
-               }
-       } else if (!strcmp(str, "all")) {
-               /*
-                * 0b11 The trace unit performs a Context ID
-                * comparison and a VMID
-                */
-               if (drvdata->numcidc)
-                       drvdata->addr_acc[idx] |= BIT(2);
-               if (drvdata->numvmidc)
-                       drvdata->addr_acc[idx] |= BIT(3);
-       }
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_ctxtype);
-
-static ssize_t addr_context_show(struct device *dev,
-                                struct device_attribute *attr,
-                                char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       /* context ID comparator bits[6:4] */
-       val = BMVAL(drvdata->addr_acc[idx], 4, 6);
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t addr_context_store(struct device *dev,
-                                 struct device_attribute *attr,
-                                 const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if ((drvdata->numcidc <= 1) && (drvdata->numvmidc <= 1))
-               return -EINVAL;
-       if (val >=  (drvdata->numcidc >= drvdata->numvmidc ?
-                    drvdata->numcidc : drvdata->numvmidc))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->addr_idx;
-       /* clear context ID comparator bits[6:4] */
-       drvdata->addr_acc[idx] &= ~(BIT(4) | BIT(5) | BIT(6));
-       drvdata->addr_acc[idx] |= (val << 4);
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(addr_context);
-
-static ssize_t seq_idx_show(struct device *dev,
-                           struct device_attribute *attr,
-                           char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->seq_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t seq_idx_store(struct device *dev,
-                            struct device_attribute *attr,
-                            const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->nrseqstate - 1)
-               return -EINVAL;
-
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->seq_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(seq_idx);
-
-static ssize_t seq_state_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->seq_state;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t seq_state_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->nrseqstate)
-               return -EINVAL;
-
-       drvdata->seq_state = val;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_state);
-
-static ssize_t seq_event_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->seq_idx;
-       val = drvdata->seq_ctrl[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t seq_event_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->seq_idx;
-       /* RST, bits[7:0] */
-       drvdata->seq_ctrl[idx] = val & 0xFF;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(seq_event);
-
-static ssize_t seq_reset_event_show(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->seq_rst;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t seq_reset_event_store(struct device *dev,
-                                    struct device_attribute *attr,
-                                    const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (!(drvdata->nrseqstate))
-               return -EINVAL;
-
-       drvdata->seq_rst = val & ETMv4_EVENT_MASK;
-       return size;
-}
-static DEVICE_ATTR_RW(seq_reset_event);
-
-static ssize_t cntr_idx_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->cntr_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t cntr_idx_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->nr_cntr)
-               return -EINVAL;
-
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->cntr_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_idx);
-
-static ssize_t cntrldvr_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       val = drvdata->cntrldvr[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t cntrldvr_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val > ETM_CNTR_MAX_VAL)
-               return -EINVAL;
-
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       drvdata->cntrldvr[idx] = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(cntrldvr);
-
-static ssize_t cntr_val_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       for (i = 0; i < drvdata->nr_resource * 2; i++)
+               writel_relaxed(config->res_ctrl[i],
+                              drvdata->base + TRCRSCTLRn(i));
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       val = drvdata->cntr_val[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
+       for (i = 0; i < drvdata->nr_ss_cmp; i++) {
+               writel_relaxed(config->ss_ctrl[i],
+                              drvdata->base + TRCSSCCRn(i));
+               writel_relaxed(config->ss_status[i],
+                              drvdata->base + TRCSSCSRn(i));
+               writel_relaxed(config->ss_pe_cmp[i],
+                              drvdata->base + TRCSSPCICRn(i));
+       }
+       for (i = 0; i < drvdata->nr_addr_cmp; i++) {
+               writeq_relaxed(config->addr_val[i],
+                              drvdata->base + TRCACVRn(i));
+               writeq_relaxed(config->addr_acc[i],
+                              drvdata->base + TRCACATRn(i));
+       }
+       for (i = 0; i < drvdata->numcidc; i++)
+               writeq_relaxed(config->ctxid_pid[i],
+                              drvdata->base + TRCCIDCVRn(i));
+       writel_relaxed(config->ctxid_mask0, drvdata->base + TRCCIDCCTLR0);
+       writel_relaxed(config->ctxid_mask1, drvdata->base + TRCCIDCCTLR1);
  
-static ssize_t cntr_val_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       for (i = 0; i < drvdata->numvmidc; i++)
+               writeq_relaxed(config->vmid_val[i],
+                              drvdata->base + TRCVMIDCVRn(i));
+       writel_relaxed(config->vmid_mask0, drvdata->base + TRCVMIDCCTLR0);
+       writel_relaxed(config->vmid_mask1, drvdata->base + TRCVMIDCCTLR1);
  
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val > ETM_CNTR_MAX_VAL)
-               return -EINVAL;
+       /* Enable the trace unit */
+       writel_relaxed(1, drvdata->base + TRCPRGCTLR);
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       drvdata->cntr_val[idx] = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_val);
+       /* wait for TRCSTATR.IDLE to go back down to '0' */
+       if (coresight_timeout(drvdata->base, TRCSTATR, TRCSTATR_IDLE_BIT, 0))
+               dev_err(drvdata->dev,
+                       "timeout observed when probing at offset %#x\n",
+                       TRCSTATR);
  
-static ssize_t cntr_ctrl_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       CS_LOCK(drvdata->base);
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       val = drvdata->cntr_ctrl[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+       dev_dbg(drvdata->dev, "cpu: %d enable smp call done\n", drvdata->cpu);
  }
  
-static ssize_t cntr_ctrl_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
+static int etm4_parse_event_config(struct etmv4_drvdata *drvdata,
+                                  struct perf_event_attr *attr)
  {
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_config *config = &drvdata->config;
  
-       if (kstrtoul(buf, 16, &val))
+       if (!attr)
                 return -EINVAL;
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->cntr_idx;
-       drvdata->cntr_ctrl[idx] = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(cntr_ctrl);
-
-static ssize_t res_idx_show(struct device *dev,
-                           struct device_attribute *attr,
-                           char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Clear configuration from previous run */
+       memset(config, 0, sizeof(struct etmv4_config));
  
-       val = drvdata->res_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
+       if (attr->exclude_kernel)
+               config->mode = ETM_MODE_EXCL_KERN;
  
-static ssize_t res_idx_store(struct device *dev,
-                            struct device_attribute *attr,
-                            const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       if (attr->exclude_user)
+               config->mode = ETM_MODE_EXCL_USER;
  
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       /* Resource selector pair 0 is always implemented and reserved */
-       if (val < 2 || val >= drvdata->nr_resource * 2)
-               return -EINVAL;
+       /* Always start from the default config */
+       etm4_set_default(config);
  
         /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
+        * By default the tracers are configured to trace the whole address
+        * range.  Narrow the field only if requested by user space.
          */
-       spin_lock(&drvdata->spinlock);
-       drvdata->res_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(res_idx);
+       if (config->mode)
+               etm4_config_trace_mode(config);
  
-static ssize_t res_ctrl_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       /* Go from generic option to ETMv4 specifics */
+       if (attr->config & BIT(ETM_OPT_CYCACC))
+               config->cfg |= ETMv4_MODE_CYCACC;
+       if (attr->config & BIT(ETM_OPT_TS))
+               config->cfg |= ETMv4_MODE_TIMESTAMP;
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->res_idx;
-       val = drvdata->res_ctrl[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+       return 0;
  }
  
-static ssize_t res_ctrl_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
+static int etm4_enable_perf(struct coresight_device *csdev,
+                           struct perf_event_attr *attr)
  {
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       if (kstrtoul(buf, 16, &val))
+       if (WARN_ON_ONCE(drvdata->cpu != smp_processor_id()))
                 return -EINVAL;
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->res_idx;
-       /* For odd idx pair inversal bit is RES0 */
-       if (idx % 2 != 0)
-               /* PAIRINV, bit[21] */
-               val &= ~BIT(21);
-       drvdata->res_ctrl[idx] = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(res_ctrl);
-
-static ssize_t ctxid_idx_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->ctxid_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t ctxid_idx_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->numcidc)
-               return -EINVAL;
+       /* Configure the tracer based on the session's specifics */
+       etm4_parse_event_config(drvdata, attr);
+       /* And enable it */
+       etm4_enable_hw(drvdata);
  
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->ctxid_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
+       return 0;
  }
-static DEVICE_ATTR_RW(ctxid_idx);
  
-static ssize_t ctxid_pid_show(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
+static int etm4_enable_sysfs(struct coresight_device *csdev)
  {
-       u8 idx;
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+       int ret;
  
         spin_lock(&drvdata->spinlock);
-       idx = drvdata->ctxid_idx;
-       val = (unsigned long)drvdata->ctxid_vpid[idx];
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
-
-static ssize_t ctxid_pid_store(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf, size_t size)
-{
-       u8 idx;
-       unsigned long vpid, pid;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
  
         /*
-        * only implemented when ctxid tracing is enabled, i.e. at least one
-        * ctxid comparator is implemented and ctxid is greater than 0 bits
-        * in length
+        * Executing etm4_enable_hw on the cpu whose ETM is being enabled
+        * ensures that register writes occur when cpu is powered.
          */
-       if (!drvdata->ctxid_size || !drvdata->numcidc)
-               return -EINVAL;
-       if (kstrtoul(buf, 16, &vpid))
-               return -EINVAL;
-
-       pid = coresight_vpid_to_pid(vpid);
+       ret = smp_call_function_single(drvdata->cpu,
+                                      etm4_enable_hw, drvdata, 1);
+       if (ret)
+               goto err;
  
-       spin_lock(&drvdata->spinlock);
-       idx = drvdata->ctxid_idx;
-       drvdata->ctxid_pid[idx] = (u64)pid;
-       drvdata->ctxid_vpid[idx] = (u64)vpid;
+       drvdata->sticky_enable = true;
         spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(ctxid_pid);
  
-static ssize_t ctxid_masks_show(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       unsigned long val1, val2;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       dev_info(drvdata->dev, "ETM tracing enabled\n");
+       return 0;
  
-       spin_lock(&drvdata->spinlock);
-       val1 = drvdata->ctxid_mask0;
-       val2 = drvdata->ctxid_mask1;
+err:
         spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", val1, val2);
+       return ret;
  }
  
-static ssize_t ctxid_masks_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
+static int etm4_enable(struct coresight_device *csdev,
+                      struct perf_event_attr *attr, u32 mode)
  {
-       u8 i, j, maskbyte;
-       unsigned long val1, val2, mask;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       int ret;
+       u32 val;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       /*
-        * only implemented when ctxid tracing is enabled, i.e. at least one
-        * ctxid comparator is implemented and ctxid is greater than 0 bits
-        * in length
-        */
-       if (!drvdata->ctxid_size || !drvdata->numcidc)
-               return -EINVAL;
-       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
-               return -EINVAL;
+       val = local_cmpxchg(&drvdata->mode, CS_MODE_DISABLED, mode);
  
-       spin_lock(&drvdata->spinlock);
-       /*
-        * each byte[0..3] controls mask value applied to ctxid
-        * comparator[0..3]
-        */
-       switch (drvdata->numcidc) {
-       case 0x1:
-               /* COMP0, bits[7:0] */
-               drvdata->ctxid_mask0 = val1 & 0xFF;
-               break;
-       case 0x2:
-               /* COMP1, bits[15:8] */
-               drvdata->ctxid_mask0 = val1 & 0xFFFF;
-               break;
-       case 0x3:
-               /* COMP2, bits[23:16] */
-               drvdata->ctxid_mask0 = val1 & 0xFFFFFF;
-               break;
-       case 0x4:
-                /* COMP3, bits[31:24] */
-               drvdata->ctxid_mask0 = val1;
-               break;
-       case 0x5:
-               /* COMP4, bits[7:0] */
-               drvdata->ctxid_mask0 = val1;
-               drvdata->ctxid_mask1 = val2 & 0xFF;
-               break;
-       case 0x6:
-               /* COMP5, bits[15:8] */
-               drvdata->ctxid_mask0 = val1;
-               drvdata->ctxid_mask1 = val2 & 0xFFFF;
-               break;
-       case 0x7:
-               /* COMP6, bits[23:16] */
-               drvdata->ctxid_mask0 = val1;
-               drvdata->ctxid_mask1 = val2 & 0xFFFFFF;
+       /* Someone is already using the tracer */
+       if (val)
+               return -EBUSY;
+
+       switch (mode) {
+       case CS_MODE_SYSFS:
+               ret = etm4_enable_sysfs(csdev);
                 break;
-       case 0x8:
-               /* COMP7, bits[31:24] */
-               drvdata->ctxid_mask0 = val1;
-               drvdata->ctxid_mask1 = val2;
+       case CS_MODE_PERF:
+               ret = etm4_enable_perf(csdev, attr);
                 break;
         default:
-               break;
-       }
-       /*
-        * If software sets a mask bit to 1, it must program relevant byte
-        * of ctxid comparator value 0x0, otherwise behavior is unpredictable.
-        * For example, if bit[3] of ctxid_mask0 is 1, we must clear bits[31:24]
-        * of ctxid comparator0 value (corresponding to byte 0) register.
-        */
-       mask = drvdata->ctxid_mask0;
-       for (i = 0; i < drvdata->numcidc; i++) {
-               /* mask value of corresponding ctxid comparator */
-               maskbyte = mask & ETMv4_EVENT_MASK;
-               /*
-                * each bit corresponds to a byte of respective ctxid comparator
-                * value register
-                */
-               for (j = 0; j < 8; j++) {
-                       if (maskbyte & 1)
-                               drvdata->ctxid_pid[i] &= ~(0xFF << (j * 8));
-                       maskbyte >>= 1;
-               }
-               /* Select the next ctxid comparator mask value */
-               if (i == 3)
-                       /* ctxid comparators[4-7] */
-                       mask = drvdata->ctxid_mask1;
-               else
-                       mask >>= 0x8;
+               ret = -EINVAL;
         }
  
-       spin_unlock(&drvdata->spinlock);
-       return size;
+       /* The tracer didn't start */
+       if (ret)
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+
+       return ret;
  }
-static DEVICE_ATTR_RW(ctxid_masks);
  
-static ssize_t vmid_idx_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
+static void etm4_disable_hw(void *info)
  {
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       u32 control;
+       struct etmv4_drvdata *drvdata = info;
  
-       val = drvdata->vmid_idx;
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
-}
+       CS_UNLOCK(drvdata->base);
  
-static ssize_t vmid_idx_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       control = readl_relaxed(drvdata->base + TRCPRGCTLR);
  
-       if (kstrtoul(buf, 16, &val))
-               return -EINVAL;
-       if (val >= drvdata->numvmidc)
-               return -EINVAL;
+       /* EN, bit[0] Trace unit enable bit */
+       control &= ~0x1;
  
-       /*
-        * Use spinlock to ensure index doesn't change while it gets
-        * dereferenced multiple times within a spinlock block elsewhere.
-        */
-       spin_lock(&drvdata->spinlock);
-       drvdata->vmid_idx = val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(vmid_idx);
+       /* make sure everything completes before disabling */
+       mb();
+       isb();
+       writel_relaxed(control, drvdata->base + TRCPRGCTLR);
  
-static ssize_t vmid_val_show(struct device *dev,
-                            struct device_attribute *attr,
-                            char *buf)
-{
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       CS_LOCK(drvdata->base);
  
-       val = (unsigned long)drvdata->vmid_val[drvdata->vmid_idx];
-       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+       dev_dbg(drvdata->dev, "cpu: %d disable smp call done\n", drvdata->cpu);
  }
  
-static ssize_t vmid_val_store(struct device *dev,
-                             struct device_attribute *attr,
-                             const char *buf, size_t size)
+static int etm4_disable_perf(struct coresight_device *csdev)
  {
-       unsigned long val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       /*
-        * only implemented when vmid tracing is enabled, i.e. at least one
-        * vmid comparator is implemented and at least 8 bit vmid size
-        */
-       if (!drvdata->vmid_size || !drvdata->numvmidc)
-               return -EINVAL;
-       if (kstrtoul(buf, 16, &val))
+       if (WARN_ON_ONCE(drvdata->cpu != smp_processor_id()))
                 return -EINVAL;
  
-       spin_lock(&drvdata->spinlock);
-       drvdata->vmid_val[drvdata->vmid_idx] = (u64)val;
-       spin_unlock(&drvdata->spinlock);
-       return size;
+       etm4_disable_hw(drvdata);
+       return 0;
  }
-static DEVICE_ATTR_RW(vmid_val);
  
-static ssize_t vmid_masks_show(struct device *dev,
-                              struct device_attribute *attr, char *buf)
+static void etm4_disable_sysfs(struct coresight_device *csdev)
  {
-       unsigned long val1, val2;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       spin_lock(&drvdata->spinlock);
-       val1 = drvdata->vmid_mask0;
-       val2 = drvdata->vmid_mask1;
-       spin_unlock(&drvdata->spinlock);
-       return scnprintf(buf, PAGE_SIZE, "%#lx %#lx\n", val1, val2);
-}
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-static ssize_t vmid_masks_store(struct device *dev,
-                               struct device_attribute *attr,
-                               const char *buf, size_t size)
-{
-       u8 i, j, maskbyte;
-       unsigned long val1, val2, mask;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
         /*
-        * only implemented when vmid tracing is enabled, i.e. at least one
-        * vmid comparator is implemented and at least 8 bit vmid size
+        * Taking hotplug lock here protects from clocks getting disabled
+        * with tracing being left on (crash scenario) if user disable occurs
+        * after cpu online mask indicates the cpu is offline but before the
+        * DYING hotplug callback is serviced by the ETM driver.
          */
-       if (!drvdata->vmid_size || !drvdata->numvmidc)
-               return -EINVAL;
-       if (sscanf(buf, "%lx %lx", &val1, &val2) != 2)
-               return -EINVAL;
-
+       get_online_cpus();
         spin_lock(&drvdata->spinlock);
  
         /*
-        * each byte[0..3] controls mask value applied to vmid
-        * comparator[0..3]
+        * Executing etm4_disable_hw on the cpu whose ETM is being disabled
+        * ensures that register writes occur when cpu is powered.
          */
-       switch (drvdata->numvmidc) {
-       case 0x1:
-               /* COMP0, bits[7:0] */
-               drvdata->vmid_mask0 = val1 & 0xFF;
-               break;
-       case 0x2:
-               /* COMP1, bits[15:8] */
-               drvdata->vmid_mask0 = val1 & 0xFFFF;
-               break;
-       case 0x3:
-               /* COMP2, bits[23:16] */
-               drvdata->vmid_mask0 = val1 & 0xFFFFFF;
-               break;
-       case 0x4:
-               /* COMP3, bits[31:24] */
-               drvdata->vmid_mask0 = val1;
-               break;
-       case 0x5:
-               /* COMP4, bits[7:0] */
-               drvdata->vmid_mask0 = val1;
-               drvdata->vmid_mask1 = val2 & 0xFF;
-               break;
-       case 0x6:
-               /* COMP5, bits[15:8] */
-               drvdata->vmid_mask0 = val1;
-               drvdata->vmid_mask1 = val2 & 0xFFFF;
-               break;
-       case 0x7:
-               /* COMP6, bits[23:16] */
-               drvdata->vmid_mask0 = val1;
-               drvdata->vmid_mask1 = val2 & 0xFFFFFF;
-               break;
-       case 0x8:
-               /* COMP7, bits[31:24] */
-               drvdata->vmid_mask0 = val1;
-               drvdata->vmid_mask1 = val2;
-               break;
-       default:
-               break;
-       }
+       smp_call_function_single(drvdata->cpu, etm4_disable_hw, drvdata, 1);
  
-       /*
-        * If software sets a mask bit to 1, it must program relevant byte
-        * of vmid comparator value 0x0, otherwise behavior is unpredictable.
-        * For example, if bit[3] of vmid_mask0 is 1, we must clear bits[31:24]
-        * of vmid comparator0 value (corresponding to byte 0) register.
-        */
-       mask = drvdata->vmid_mask0;
-       for (i = 0; i < drvdata->numvmidc; i++) {
-               /* mask value of corresponding vmid comparator */
-               maskbyte = mask & ETMv4_EVENT_MASK;
-               /*
-                * each bit corresponds to a byte of respective vmid comparator
-                * value register
-                */
-               for (j = 0; j < 8; j++) {
-                       if (maskbyte & 1)
-                               drvdata->vmid_val[i] &= ~(0xFF << (j * 8));
-                       maskbyte >>= 1;
-               }
-               /* Select the next vmid comparator mask value */
-               if (i == 3)
-                       /* vmid comparators[4-7] */
-                       mask = drvdata->vmid_mask1;
-               else
-                       mask >>= 0x8;
-       }
         spin_unlock(&drvdata->spinlock);
-       return size;
-}
-static DEVICE_ATTR_RW(vmid_masks);
-
-static ssize_t cpu_show(struct device *dev,
-                       struct device_attribute *attr, char *buf)
-{
-       int val;
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(dev->parent);
-
-       val = drvdata->cpu;
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       put_online_cpus();
  
+       dev_info(drvdata->dev, "ETM tracing disabled\n");
  }
-static DEVICE_ATTR_RO(cpu);
-
-static struct attribute *coresight_etmv4_attrs[] = {
-       &dev_attr_nr_pe_cmp.attr,
-       &dev_attr_nr_addr_cmp.attr,
-       &dev_attr_nr_cntr.attr,
-       &dev_attr_nr_ext_inp.attr,
-       &dev_attr_numcidc.attr,
-       &dev_attr_numvmidc.attr,
-       &dev_attr_nrseqstate.attr,
-       &dev_attr_nr_resource.attr,
-       &dev_attr_nr_ss_cmp.attr,
-       &dev_attr_reset.attr,
-       &dev_attr_mode.attr,
-       &dev_attr_pe.attr,
-       &dev_attr_event.attr,
-       &dev_attr_event_instren.attr,
-       &dev_attr_event_ts.attr,
-       &dev_attr_syncfreq.attr,
-       &dev_attr_cyc_threshold.attr,
-       &dev_attr_bb_ctrl.attr,
-       &dev_attr_event_vinst.attr,
-       &dev_attr_s_exlevel_vinst.attr,
-       &dev_attr_ns_exlevel_vinst.attr,
-       &dev_attr_addr_idx.attr,
-       &dev_attr_addr_instdatatype.attr,
-       &dev_attr_addr_single.attr,
-       &dev_attr_addr_range.attr,
-       &dev_attr_addr_start.attr,
-       &dev_attr_addr_stop.attr,
-       &dev_attr_addr_ctxtype.attr,
-       &dev_attr_addr_context.attr,
-       &dev_attr_seq_idx.attr,
-       &dev_attr_seq_state.attr,
-       &dev_attr_seq_event.attr,
-       &dev_attr_seq_reset_event.attr,
-       &dev_attr_cntr_idx.attr,
-       &dev_attr_cntrldvr.attr,
-       &dev_attr_cntr_val.attr,
-       &dev_attr_cntr_ctrl.attr,
-       &dev_attr_res_idx.attr,
-       &dev_attr_res_ctrl.attr,
-       &dev_attr_ctxid_idx.attr,
-       &dev_attr_ctxid_pid.attr,
-       &dev_attr_ctxid_masks.attr,
-       &dev_attr_vmid_idx.attr,
-       &dev_attr_vmid_val.attr,
-       &dev_attr_vmid_masks.attr,
-       &dev_attr_cpu.attr,
-       NULL,
-};
  
-#define coresight_simple_func(name, offset)                            \
-static ssize_t name##_show(struct device *_dev,                                \
-                          struct device_attribute *attr, char *buf)    \
-{                                                                      \
-       struct etmv4_drvdata *drvdata = dev_get_drvdata(_dev->parent);  \
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n",                      \
-                        readl_relaxed(drvdata->base + offset));        \
-}                                                                      \
-DEVICE_ATTR_RO(name)
-
-coresight_simple_func(trcoslsr, TRCOSLSR);
-coresight_simple_func(trcpdcr, TRCPDCR);
-coresight_simple_func(trcpdsr, TRCPDSR);
-coresight_simple_func(trclsr, TRCLSR);
-coresight_simple_func(trcauthstatus, TRCAUTHSTATUS);
-coresight_simple_func(trcdevid, TRCDEVID);
-coresight_simple_func(trcdevtype, TRCDEVTYPE);
-coresight_simple_func(trcpidr0, TRCPIDR0);
-coresight_simple_func(trcpidr1, TRCPIDR1);
-coresight_simple_func(trcpidr2, TRCPIDR2);
-coresight_simple_func(trcpidr3, TRCPIDR3);
-
-static struct attribute *coresight_etmv4_mgmt_attrs[] = {
-       &dev_attr_trcoslsr.attr,
-       &dev_attr_trcpdcr.attr,
-       &dev_attr_trcpdsr.attr,
-       &dev_attr_trclsr.attr,
-       &dev_attr_trcauthstatus.attr,
-       &dev_attr_trcdevid.attr,
-       &dev_attr_trcdevtype.attr,
-       &dev_attr_trcpidr0.attr,
-       &dev_attr_trcpidr1.attr,
-       &dev_attr_trcpidr2.attr,
-       &dev_attr_trcpidr3.attr,
-       NULL,
-};
+static void etm4_disable(struct coresight_device *csdev)
+{
+       u32 mode;
+       struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-coresight_simple_func(trcidr0, TRCIDR0);
-coresight_simple_func(trcidr1, TRCIDR1);
-coresight_simple_func(trcidr2, TRCIDR2);
-coresight_simple_func(trcidr3, TRCIDR3);
-coresight_simple_func(trcidr4, TRCIDR4);
-coresight_simple_func(trcidr5, TRCIDR5);
-/* trcidr[6,7] are reserved */
-coresight_simple_func(trcidr8, TRCIDR8);
-coresight_simple_func(trcidr9, TRCIDR9);
-coresight_simple_func(trcidr10, TRCIDR10);
-coresight_simple_func(trcidr11, TRCIDR11);
-coresight_simple_func(trcidr12, TRCIDR12);
-coresight_simple_func(trcidr13, TRCIDR13);
-
-static struct attribute *coresight_etmv4_trcidr_attrs[] = {
-       &dev_attr_trcidr0.attr,
-       &dev_attr_trcidr1.attr,
-       &dev_attr_trcidr2.attr,
-       &dev_attr_trcidr3.attr,
-       &dev_attr_trcidr4.attr,
-       &dev_attr_trcidr5.attr,
-       /* trcidr[6,7] are reserved */
-       &dev_attr_trcidr8.attr,
-       &dev_attr_trcidr9.attr,
-       &dev_attr_trcidr10.attr,
-       &dev_attr_trcidr11.attr,
-       &dev_attr_trcidr12.attr,
-       &dev_attr_trcidr13.attr,
-       NULL,
-};
+       /*
+        * For as long as the tracer isn't disabled another entity can't
+        * change its status.  As such we can read the status here without
+        * fearing it will change under us.
+        */
+       mode = local_read(&drvdata->mode);
  
-static const struct attribute_group coresight_etmv4_group = {
-       .attrs = coresight_etmv4_attrs,
-};
+       switch (mode) {
+       case CS_MODE_DISABLED:
+               break;
+       case CS_MODE_SYSFS:
+               etm4_disable_sysfs(csdev);
+               break;
+       case CS_MODE_PERF:
+               etm4_disable_perf(csdev);
+               break;
+       }
  
-static const struct attribute_group coresight_etmv4_mgmt_group = {
-       .attrs = coresight_etmv4_mgmt_attrs,
-       .name = "mgmt",
-};
+       if (mode)
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+}
  
-static const struct attribute_group coresight_etmv4_trcidr_group = {
-       .attrs = coresight_etmv4_trcidr_attrs,
-       .name = "trcidr",
+static const struct coresight_ops_source etm4_source_ops = {
+       .cpu_id         = etm4_cpu_id,
+       .trace_id       = etm4_trace_id,
+       .enable         = etm4_enable,
+       .disable        = etm4_disable,
  };
  
-static const struct attribute_group *coresight_etmv4_groups[] = {
-       &coresight_etmv4_group,
-       &coresight_etmv4_mgmt_group,
-       &coresight_etmv4_trcidr_group,
-       NULL,
+static const struct coresight_ops etm4_cs_ops = {
+       .source_ops     = &etm4_source_ops,
  };
  
  static void etm4_init_arch_data(void *info)
@@ -2310,6 +408,9 @@ static void etm4_init_arch_data(void *info)
         u32 etmidr5;
         struct etmv4_drvdata *drvdata = info;
  
+       /* Make sure all registers are accessible */
+       etm4_os_unlock(drvdata);
+
         CS_UNLOCK(drvdata->base);
  
         /* find all capabilities of the tracing unit */
@@ -2461,93 +562,115 @@ static void etm4_init_arch_data(void *info)
         CS_LOCK(drvdata->base);
  }
  
-static void etm4_init_default_data(struct etmv4_drvdata *drvdata)
+static void etm4_set_default(struct etmv4_config *config)
  {
-       int i;
+       if (WARN_ON_ONCE(!config))
+               return;
  
-       drvdata->pe_sel = 0x0;
-       drvdata->cfg = (ETMv4_MODE_CTXID | ETM_MODE_VMID |
-                       ETMv4_MODE_TIMESTAMP | ETM_MODE_RETURNSTACK);
+       /*
+        * Make default initialisation trace everything
+        *
+        * Select the "always true" resource selector on the
+        * "Enabling Event" line and configure address range comparator
+        * '0' to trace all the possible address range.  From there
+        * configure the "include/exclude" engine to include address
+        * range comparator '0'.
+        */
  
         /* disable all events tracing */
-       drvdata->eventctrl0 = 0x0;
-       drvdata->eventctrl1 = 0x0;
+       config->eventctrl0 = 0x0;
+       config->eventctrl1 = 0x0;
  
         /* disable stalling */
-       drvdata->stall_ctrl = 0x0;
+       config->stall_ctrl = 0x0;
+
+       /* enable trace synchronization every 4096 bytes, if available */
+       config->syncfreq = 0xC;
  
         /* disable timestamp event */
-       drvdata->ts_ctrl = 0x0;
+       config->ts_ctrl = 0x0;
  
-       /* enable trace synchronization every 4096 bytes for trace */
-       if (drvdata->syncpr == false)
-               drvdata->syncfreq = 0xC;
+       /* TRCVICTLR::EVENT = 0x01, select the always on logic */
+       config->vinst_ctrl |= BIT(0);
  
         /*
-        *  enable viewInst to trace everything with start-stop logic in
-        *  started state
+        * TRCVICTLR::SSSTATUS == 1, the start-stop logic is
+        * in the started state
          */
-       drvdata->vinst_ctrl |= BIT(0);
-       /* set initial state of start-stop logic */
-       if (drvdata->nr_addr_cmp)
-               drvdata->vinst_ctrl |= BIT(9);
+       config->vinst_ctrl |= BIT(9);
  
-       /* no address range filtering for ViewInst */
-       drvdata->viiectlr = 0x0;
-       /* no start-stop filtering for ViewInst */
-       drvdata->vissctlr = 0x0;
+       /*
+        * Configure address range comparator '0' to encompass all
+        * possible addresses.
+        */
  
-       /* disable seq events */
-       for (i = 0; i < drvdata->nrseqstate-1; i++)
-               drvdata->seq_ctrl[i] = 0x0;
-       drvdata->seq_rst = 0x0;
-       drvdata->seq_state = 0x0;
+       /* First half of default address comparator: start at address 0 */
+       config->addr_val[ETM_DEFAULT_ADDR_COMP] = 0x0;
+       /* trace instruction addresses */
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP] &= ~(BIT(0) | BIT(1));
+       /* EXLEVEL_NS, bits[15:12], only trace application and kernel space */
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP] |= ETM_EXLEVEL_NS_HYP;
+       /* EXLEVEL_S, bits[11:8], don't trace anything in secure state */
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP] |= (ETM_EXLEVEL_S_APP |
+                                                   ETM_EXLEVEL_S_OS |
+                                                   ETM_EXLEVEL_S_HYP);
+       config->addr_type[ETM_DEFAULT_ADDR_COMP] = ETM_ADDR_TYPE_RANGE;
  
-       /* disable external input events */
-       drvdata->ext_inp = 0x0;
+       /*
+        * Second half of default address comparator: go all
+        * the way to the top.
+        */
+       config->addr_val[ETM_DEFAULT_ADDR_COMP + 1] = ~0x0;
+       /* trace instruction addresses */
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP + 1] &= ~(BIT(0) | BIT(1));
+       /* Address comparator type must be equal for both halves */
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP + 1] =
+                                       config->addr_acc[ETM_DEFAULT_ADDR_COMP];
+       config->addr_type[ETM_DEFAULT_ADDR_COMP + 1] = ETM_ADDR_TYPE_RANGE;
  
-       for (i = 0; i < drvdata->nr_cntr; i++) {
-               drvdata->cntrldvr[i] = 0x0;
-               drvdata->cntr_ctrl[i] = 0x0;
-               drvdata->cntr_val[i] = 0x0;
-       }
+       /*
+        * Configure the ViewInst function to filter on address range
+        * comparator '0'.
+        */
+       config->viiectlr = BIT(0);
  
-       /* Resource selector pair 0 is always implemented and reserved */
-       drvdata->res_idx = 0x2;
-       for (i = 2; i < drvdata->nr_resource * 2; i++)
-               drvdata->res_ctrl[i] = 0x0;
+       /* no start-stop filtering for ViewInst */
+       config->vissctlr = 0x0;
+}
  
-       for (i = 0; i < drvdata->nr_ss_cmp; i++) {
-               drvdata->ss_ctrl[i] = 0x0;
-               drvdata->ss_pe_cmp[i] = 0x0;
-       }
+void etm4_config_trace_mode(struct etmv4_config *config)
+{
+       u32 addr_acc, mode;
  
-       if (drvdata->nr_addr_cmp >= 1) {
-               drvdata->addr_val[0] = (unsigned long)_stext;
-               drvdata->addr_val[1] = (unsigned long)_etext;
-               drvdata->addr_type[0] = ETM_ADDR_TYPE_RANGE;
-               drvdata->addr_type[1] = ETM_ADDR_TYPE_RANGE;
-       }
+       mode = config->mode;
+       mode &= (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER);
  
-       for (i = 0; i < drvdata->numcidc; i++) {
-               drvdata->ctxid_pid[i] = 0x0;
-               drvdata->ctxid_vpid[i] = 0x0;
-       }
+       /* excluding kernel AND user space doesn't make sense */
+       WARN_ON_ONCE(mode == (ETM_MODE_EXCL_KERN | ETM_MODE_EXCL_USER));
  
-       drvdata->ctxid_mask0 = 0x0;
-       drvdata->ctxid_mask1 = 0x0;
+       /* nothing to do if neither flag is set */
+       if (!(mode & ETM_MODE_EXCL_KERN) && !(mode & ETM_MODE_EXCL_USER))
+               return;
  
-       for (i = 0; i < drvdata->numvmidc; i++)
-               drvdata->vmid_val[i] = 0x0;
-       drvdata->vmid_mask0 = 0x0;
-       drvdata->vmid_mask1 = 0x0;
+       addr_acc = config->addr_acc[ETM_DEFAULT_ADDR_COMP];
+       /* clear default config */
+       addr_acc &= ~(ETM_EXLEVEL_NS_APP | ETM_EXLEVEL_NS_OS);
  
         /*
-        * A trace ID value of 0 is invalid, so let's start at some
-        * random value that fits in 7 bits.  ETMv3.x has 0x10 so let's
-        * start at 0x20.
+        * EXLEVEL_NS, bits[15:12]
+        * The Exception levels are:
+        *   Bit[12] Exception level 0 - Application
+        *   Bit[13] Exception level 1 - OS
+        *   Bit[14] Exception level 2 - Hypervisor
+        *   Bit[15] Never implemented
          */
-       drvdata->trcid = 0x20 + drvdata->cpu;
+       if (mode & ETM_MODE_EXCL_KERN)
+               addr_acc |= ETM_EXLEVEL_NS_OS;
+       else
+               addr_acc |= ETM_EXLEVEL_NS_APP;
+
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP] = addr_acc;
+       config->addr_acc[ETM_DEFAULT_ADDR_COMP + 1] = addr_acc;
  }
  
  static int etm4_cpu_callback(struct notifier_block *nfb, unsigned long action,
@@ -2566,7 +689,7 @@ static int etm4_cpu_callback(struct notifier_block *nfb, unsigned long action,
                         etmdrvdata[cpu]->os_unlock = true;
                 }
  
-               if (etmdrvdata[cpu]->enable)
+               if (local_read(&etmdrvdata[cpu]->mode))
                         etm4_enable_hw(etmdrvdata[cpu]);
                 spin_unlock(&etmdrvdata[cpu]->spinlock);
                 break;
@@ -2579,7 +702,7 @@ static int etm4_cpu_callback(struct notifier_block *nfb, unsigned long action,
  
         case CPU_DYING:
                 spin_lock(&etmdrvdata[cpu]->spinlock);
-               if (etmdrvdata[cpu]->enable)
+               if (local_read(&etmdrvdata[cpu]->mode))
                         etm4_disable_hw(etmdrvdata[cpu]);
                 spin_unlock(&etmdrvdata[cpu]->spinlock);
                 break;
@@ -2592,6 +715,11 @@ static struct notifier_block etm4_cpu_notifier = {
         .notifier_call = etm4_cpu_callback,
  };
  
+static void etm4_init_trace_id(struct etmv4_drvdata *drvdata)
+{
+       drvdata->trcid = coresight_get_trace_id(drvdata->cpu);
+}
+
  static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
  {
         int ret;
@@ -2635,9 +763,6 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
         get_online_cpus();
         etmdrvdata[drvdata->cpu] = drvdata;
  
-       if (!smp_call_function_single(drvdata->cpu, etm4_os_unlock, drvdata, 1))
-               drvdata->os_unlock = true;
-
         if (smp_call_function_single(drvdata->cpu,
                                 etm4_init_arch_data,  drvdata, 1))
                 dev_err(dev, "ETM arch init failed\n");
@@ -2651,9 +776,9 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
                 ret = -EINVAL;
                 goto err_arch_supported;
         }
-       etm4_init_default_data(drvdata);
  
-       pm_runtime_put(&adev->dev);
+       etm4_init_trace_id(drvdata);
+       etm4_set_default(&drvdata->config);
  
         desc->type = CORESIGHT_DEV_TYPE_SOURCE;
         desc->subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_PROC;
@@ -2664,9 +789,16 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
         drvdata->csdev = coresight_register(desc);
         if (IS_ERR(drvdata->csdev)) {
                 ret = PTR_ERR(drvdata->csdev);
-               goto err_coresight_register;
+               goto err_arch_supported;
+       }
+
+       ret = etm_perf_symlink(drvdata->csdev, true);
+       if (ret) {
+               coresight_unregister(drvdata->csdev);
+               goto err_arch_supported;
         }
  
+       pm_runtime_put(&adev->dev);
         dev_info(dev, "%s initialized\n", (char *)id->data);
  
         if (boot_enable) {
@@ -2677,24 +809,11 @@ static int etm4_probe(struct amba_device *adev, const struct amba_id *id)
         return 0;
  
  err_arch_supported:
-       pm_runtime_put(&adev->dev);
-err_coresight_register:
         if (--etm4_count == 0)
                 unregister_hotcpu_notifier(&etm4_cpu_notifier);
         return ret;
  }
  
-static int etm4_remove(struct amba_device *adev)
-{
-       struct etmv4_drvdata *drvdata = amba_get_drvdata(adev);
-
-       coresight_unregister(drvdata->csdev);
-       if (--etm4_count == 0)
-               unregister_hotcpu_notifier(&etm4_cpu_notifier);
-
-       return 0;
-}
-
  static struct amba_id etm4_ids[] = {
         {       /* ETM 4.0 - Qualcomm */
                 .id     = 0x0003b95d,
@@ -2706,16 +825,20 @@ static struct amba_id etm4_ids[] = {
                 .mask   = 0x000fffff,
                 .data   = "ETM 4.0",
         },
+       {       /* ETM 4.0 - A72, Maia, HiSilicon */
+               .id = 0x000bb95a,
+               .mask = 0x000fffff,
+               .data = "ETM 4.0",
+       },
         { 0, 0},
  };
  
  static struct amba_driver etm4x_driver = {
         .drv = {
                 .name   = "coresight-etm4x",
+               .suppress_bind_attrs = true,
         },
         .probe          = etm4_probe,
-       .remove         = etm4_remove,
         .id_table       = etm4_ids,
  };
-
-module_amba_driver(etm4x_driver);
+builtin_amba_driver(etm4x_driver);
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h

index c34100205ca948681c62cd45411623f8c5cba4a6..5359c5197c1d6e4ccbbabe666785448bd47bdeef 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-etm4x.h
+++ b/drivers/hwtracing/coresight/coresight-etm4x.h
@@ -13,6 +13,7 @@
  #ifndef _CORESIGHT_CORESIGHT_ETM_H
  #define _CORESIGHT_CORESIGHT_ETM_H
  
+#include <asm/local.h>
  #include <linux/spinlock.h>
  #include "coresight-priv.h"
  
@@ -175,71 +176,38 @@
  #define ETM_MODE_TRACE_RESET           BIT(25)
  #define ETM_MODE_TRACE_ERR             BIT(26)
  #define ETM_MODE_VIEWINST_STARTSTOP    BIT(27)
-#define ETMv4_MODE_ALL                 0xFFFFFFF
+#define ETMv4_MODE_ALL                 (GENMASK(27, 0) | \
+                                        ETM_MODE_EXCL_KERN | \
+                                        ETM_MODE_EXCL_USER)
  
  #define TRCSTATR_IDLE_BIT              0
+#define ETM_DEFAULT_ADDR_COMP          0
+
+/* secure state access levels */
+#define ETM_EXLEVEL_S_APP              BIT(8)
+#define ETM_EXLEVEL_S_OS               BIT(9)
+#define ETM_EXLEVEL_S_NA               BIT(10)
+#define ETM_EXLEVEL_S_HYP              BIT(11)
+/* non-secure state access levels */
+#define ETM_EXLEVEL_NS_APP             BIT(12)
+#define ETM_EXLEVEL_NS_OS              BIT(13)
+#define ETM_EXLEVEL_NS_HYP             BIT(14)
+#define ETM_EXLEVEL_NS_NA              BIT(15)
  
  /**
- * struct etm4_drvdata - specifics associated to an ETM component
- * @base:       Memory mapped base address for this component.
- * @dev:        The device entity associated to this component.
- * @csdev:      Component vitals needed by the framework.
- * @spinlock:   Only one at a time pls.
- * @cpu:        The cpu this component is affined to.
- * @arch:       ETM version number.
- * @enable:    Is this ETM currently tracing.
- * @sticky_enable: true if ETM base configuration has been done.
- * @boot_enable:True if we should start tracing at boot time.
- * @os_unlock:  True if access to management registers is allowed.
- * @nr_pe:     The number of processing entity available for tracing.
- * @nr_pe_cmp: The number of processing entity comparator inputs that are
- *             available for tracing.
- * @nr_addr_cmp:Number of pairs of address comparators available
- *             as found in ETMIDR4 0-3.
- * @nr_cntr:    Number of counters as found in ETMIDR5 bit 28-30.
- * @nr_ext_inp: Number of external input.
- * @numcidc:   Number of contextID comparators.
- * @numvmidc:  Number of VMID comparators.
- * @nrseqstate: The number of sequencer states that are implemented.
- * @nr_event:  Indicates how many events the trace unit support.
- * @nr_resource:The number of resource selection pairs available for tracing.
- * @nr_ss_cmp: Number of single-shot comparator controls that are available.
+ * struct etmv4_config - configuration information related to an ETMv4
   * @mode:      Controls various modes supported by this ETM.
- * @trcid:     value of the current ID for this component.
- * @trcid_size: Indicates the trace ID width.
- * @instrp0:   Tracing of load and store instructions
- *             as P0 elements is supported.
- * @trccond:   If the trace unit supports conditional
- *             instruction tracing.
- * @retstack:  Indicates if the implementation supports a return stack.
- * @trc_error: Whether a trace unit can trace a system
- *             error exception.
- * @atbtrig:   If the implementation can support ATB triggers
- * @lpoverride:        If the implementation can support low-power state over.
   * @pe_sel:    Controls which PE to trace.
   * @cfg:       Controls the tracing options.
   * @eventctrl0: Controls the tracing of arbitrary events.
   * @eventctrl1: Controls the behavior of the events that @event_ctrl0 selects.
   * @stallctl:  If functionality that prevents trace unit buffer overflows
   *             is available.
- * @sysstall:  Does the system support stall control of the PE?
- * @nooverflow:        Indicate if overflow prevention is supported.
- * @stall_ctrl:        Enables trace unit functionality that prevents trace
- *             unit buffer overflows.
- * @ts_size:   Global timestamp size field.
   * @ts_ctrl:   Controls the insertion of global timestamps in the
   *             trace streams.
- * @syncpr:    Indicates if an implementation has a fixed
- *             synchronization period.
   * @syncfreq:  Controls how often trace synchronization requests occur.
- * @trccci:    Indicates if the trace unit supports cycle counting
- *             for instruction.
- * @ccsize:    Indicates the size of the cycle counter in bits.
- * @ccitmin:   minimum value that can be programmed in
   *             the TRCCCCTLR register.
   * @ccctlr:    Sets the threshold value for cycle counting.
- * @trcbb:     Indicates if the trace unit supports branch broadcast tracing.
- * @q_support: Q element support characteristics.
   * @vinst_ctrl:        Controls instruction trace filtering.
   * @viiectlr:  Set or read, the address range comparators.
   * @vissctlr:  Set, or read, the single address comparators that control the
@@ -264,73 +232,28 @@
   * @addr_acc:  Address comparator access type.
   * @addr_type: Current status of the comparator register.
   * @ctxid_idx: Context ID index selector.
- * @ctxid_size:        Size of the context ID field to consider.
   * @ctxid_pid: Value of the context ID comparator.
   * @ctxid_vpid:        Virtual PID seen by users if PID namespace is enabled, otherwise
   *             the same value of ctxid_pid.
   * @ctxid_mask0:Context ID comparator mask for comparator 0-3.
   * @ctxid_mask1:Context ID comparator mask for comparator 4-7.
   * @vmid_idx:  VM ID index selector.
- * @vmid_size: Size of the VM ID comparator to consider.
   * @vmid_val:  Value of the VM ID comparator.
   * @vmid_mask0:        VM ID comparator mask for comparator 0-3.
   * @vmid_mask1:        VM ID comparator mask for comparator 4-7.
- * @s_ex_level:        In secure state, indicates whether instruction tracing is
- *             supported for the corresponding Exception level.
- * @ns_ex_level:In non-secure state, indicates whether instruction tracing is
- *             supported for the corresponding Exception level.
   * @ext_inp:   External input selection.
   */
-struct etmv4_drvdata {
-       void __iomem                    *base;
-       struct device                   *dev;
-       struct coresight_device         *csdev;
-       spinlock_t                      spinlock;
-       int                             cpu;
-       u8                              arch;
-       bool                            enable;
-       bool                            sticky_enable;
-       bool                            boot_enable;
-       bool                            os_unlock;
-       u8                              nr_pe;
-       u8                              nr_pe_cmp;
-       u8                              nr_addr_cmp;
-       u8                              nr_cntr;
-       u8                              nr_ext_inp;
-       u8                              numcidc;
-       u8                              numvmidc;
-       u8                              nrseqstate;
-       u8                              nr_event;
-       u8                              nr_resource;
-       u8                              nr_ss_cmp;
+struct etmv4_config {
         u32                             mode;
-       u8                              trcid;
-       u8                              trcid_size;
-       bool                            instrp0;
-       bool                            trccond;
-       bool                            retstack;
-       bool                            trc_error;
-       bool                            atbtrig;
-       bool                            lpoverride;
         u32                             pe_sel;
         u32                             cfg;
         u32                             eventctrl0;
         u32                             eventctrl1;
-       bool                            stallctl;
-       bool                            sysstall;
-       bool                            nooverflow;
         u32                             stall_ctrl;
-       u8                              ts_size;
         u32                             ts_ctrl;
-       bool                            syncpr;
         u32                             syncfreq;
-       bool                            trccci;
-       u8                              ccsize;
-       u8                              ccitmin;
         u32                             ccctlr;
-       bool                            trcbb;
         u32                             bb_ctrl;
-       bool                            q_support;
         u32                             vinst_ctrl;
         u32                             viiectlr;
         u32                             vissctlr;
@@ -353,19 +276,119 @@ struct etmv4_drvdata {
         u64                             addr_acc[ETM_MAX_SINGLE_ADDR_CMP];
         u8                              addr_type[ETM_MAX_SINGLE_ADDR_CMP];
         u8                              ctxid_idx;
-       u8                              ctxid_size;
         u64                             ctxid_pid[ETMv4_MAX_CTXID_CMP];
         u64                             ctxid_vpid[ETMv4_MAX_CTXID_CMP];
         u32                             ctxid_mask0;
         u32                             ctxid_mask1;
         u8                              vmid_idx;
-       u8                              vmid_size;
         u64                             vmid_val[ETM_MAX_VMID_CMP];
         u32                             vmid_mask0;
         u32                             vmid_mask1;
+       u32                             ext_inp;
+};
+
+/**
+ * struct etmv4_drvdata - specifics associated to an ETM component
+ * @base:       Memory mapped base address for this component.
+ * @dev:        The device entity associated to this component.
+ * @csdev:      Component vitals needed by the framework.
+ * @spinlock:   Only one at a time pls.
+ * @mode:      This tracer's mode, i.e. sysFS, Perf or disabled.
+ * @cpu:        The cpu this component is affined to.
+ * @arch:       ETM version number.
+ * @nr_pe:     The number of processing entity available for tracing.
+ * @nr_pe_cmp: The number of processing entity comparator inputs that are
+ *             available for tracing.
+ * @nr_addr_cmp:Number of pairs of address comparators available
+ *             as found in ETMIDR4 0-3.
+ * @nr_cntr:    Number of counters as found in ETMIDR5 bit 28-30.
+ * @nr_ext_inp: Number of external input.
+ * @numcidc:   Number of contextID comparators.
+ * @numvmidc:  Number of VMID comparators.
+ * @nrseqstate: The number of sequencer states that are implemented.
+ * @nr_event:  Indicates how many events the trace unit support.
+ * @nr_resource:The number of resource selection pairs available for tracing.
+ * @nr_ss_cmp: Number of single-shot comparator controls that are available.
+ * @trcid:     value of the current ID for this component.
+ * @trcid_size: Indicates the trace ID width.
+ * @ts_size:   Global timestamp size field.
+ * @ctxid_size:        Size of the context ID field to consider.
+ * @vmid_size: Size of the VM ID comparator to consider.
+ * @ccsize:    Indicates the size of the cycle counter in bits.
+ * @ccitmin:   minimum value programmable in the TRCCCCTLR register.
+ * @s_ex_level:        In secure state, indicates whether instruction tracing is
+ *             supported for the corresponding Exception level.
+ * @ns_ex_level:In non-secure state, indicates whether instruction tracing is
+ *             supported for the corresponding Exception level.
+ * @sticky_enable: true if ETM base configuration has been done.
+ * @boot_enable:True if we should start tracing at boot time.
+ * @os_unlock:  True if access to management registers is allowed.
+ * @instrp0:   Tracing of load and store instructions
+ *             as P0 elements is supported.
+ * @trcbb:     Indicates if the trace unit supports branch broadcast tracing.
+ * @trccond:   If the trace unit supports conditional
+ *             instruction tracing.
+ * @retstack:  Indicates if the implementation supports a return stack.
+ * @trccci:    Indicates if the trace unit supports cycle counting
+ *             for instruction.
+ * @q_support: Q element support characteristics.
+ * @trc_error: Whether a trace unit can trace a system
+ *             error exception.
+ * @syncpr:    Indicates if an implementation has a fixed
+ *             synchronization period.
+ * @stallctl:  If functionality that prevents trace unit buffer overflows
+ *             is available.
+ * @sysstall:  Does the system support stall control of the PE?
+ * @nooverflow:        Indicate if overflow prevention is supported.
+ * @atbtrig:   If the implementation can support ATB triggers
+ * @lpoverride:        If the implementation can support low-power state override.
+ * @config:    structure holding configuration parameters.
+ */
+struct etmv4_drvdata {
+       void __iomem                    *base;
+       struct device                   *dev;
+       struct coresight_device         *csdev;
+       spinlock_t                      spinlock;
+       local_t                         mode;
+       int                             cpu;
+       u8                              arch;
+       u8                              nr_pe;
+       u8                              nr_pe_cmp;
+       u8                              nr_addr_cmp;
+       u8                              nr_cntr;
+       u8                              nr_ext_inp;
+       u8                              numcidc;
+       u8                              numvmidc;
+       u8                              nrseqstate;
+       u8                              nr_event;
+       u8                              nr_resource;
+       u8                              nr_ss_cmp;
+       u8                              trcid;
+       u8                              trcid_size;
+       u8                              ts_size;
+       u8                              ctxid_size;
+       u8                              vmid_size;
+       u8                              ccsize;
+       u8                              ccitmin;
         u8                              s_ex_level;
         u8                              ns_ex_level;
-       u32                             ext_inp;
+       u8                              q_support;
+       bool                            sticky_enable;
+       bool                            boot_enable;
+       bool                            os_unlock;
+       bool                            instrp0;
+       bool                            trcbb;
+       bool                            trccond;
+       bool                            retstack;
+       bool                            trccci;
+       bool                            trc_error;
+       bool                            syncpr;
+       bool                            stallctl;
+       bool                            sysstall;
+       bool                            nooverflow;
+       bool                            atbtrig;
+       bool                            lpoverride;
+       struct etmv4_config             config;
  };
  
  /* Address comparator access types */
@@ -391,4 +414,7 @@ enum etm_addr_type {
         ETM_ADDR_TYPE_START,
         ETM_ADDR_TYPE_STOP,
  };
+
+extern const struct attribute_group *coresight_etmv4_groups[];
+void etm4_config_trace_mode(struct etmv4_config *config);
  #endif
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c

index 2e36bde7fcb41bbfe3972502be5d1d11aa4c49a5..05df789056ccfc823a103d3bc070bef1eaf46d85 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Funnel driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -11,7 +13,6 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
@@ -69,7 +70,6 @@ static int funnel_enable(struct coresight_device *csdev, int inport,
  {
         struct funnel_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_get_sync(drvdata->dev);
         funnel_enable_hw(drvdata, inport);
  
         dev_info(drvdata->dev, "FUNNEL inport %d enabled\n", inport);
@@ -95,7 +95,6 @@ static void funnel_disable(struct coresight_device *csdev, int inport,
         struct funnel_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
         funnel_disable_hw(drvdata, inport);
-       pm_runtime_put(drvdata->dev);
  
         dev_info(drvdata->dev, "FUNNEL inport %d disabled\n", inport);
  }
@@ -222,15 +221,6 @@ static int funnel_probe(struct amba_device *adev, const struct amba_id *id)
         if (IS_ERR(drvdata->csdev))
                 return PTR_ERR(drvdata->csdev);
  
-       dev_info(dev, "FUNNEL initialized\n");
-       return 0;
-}
-
-static int funnel_remove(struct amba_device *adev)
-{
-       struct funnel_drvdata *drvdata = amba_get_drvdata(adev);
-
-       coresight_unregister(drvdata->csdev);
         return 0;
  }
  
@@ -273,13 +263,9 @@ static struct amba_driver funnel_driver = {
                 .name   = "coresight-funnel",
                 .owner  = THIS_MODULE,
                 .pm     = &funnel_dev_pm_ops,
+               .suppress_bind_attrs = true,
         },
         .probe          = funnel_probe,
-       .remove         = funnel_remove,
         .id_table       = funnel_ids,
  };
-
-module_amba_driver(funnel_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Funnel driver");
+builtin_amba_driver(funnel_driver);
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h

index 62fcd98cc7cfc76798316c3a37a547063ed57955..3cb574b3cdd95f4c309da34f17453b3032cbc646 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -34,6 +34,45 @@
  #define TIMEOUT_US             100
  #define BMVAL(val, lsb, msb)   ((val & GENMASK(msb, lsb)) >> lsb)
  
+#define ETM_MODE_EXCL_KERN     BIT(30)
+#define ETM_MODE_EXCL_USER     BIT(31)
+
+#define coresight_simple_func(type, name, offset)                      \
+static ssize_t name##_show(struct device *_dev,                                \
+                          struct device_attribute *attr, char *buf)    \
+{                                                                      \
+       type *drvdata = dev_get_drvdata(_dev->parent);                  \
+       return scnprintf(buf, PAGE_SIZE, "0x%x\n",                      \
+                        readl_relaxed(drvdata->base + offset));        \
+}                                                                      \
+static DEVICE_ATTR_RO(name)
+
+enum cs_mode {
+       CS_MODE_DISABLED,
+       CS_MODE_SYSFS,
+       CS_MODE_PERF,
+};
+
+/**
+ * struct cs_buffers - keep track of a recording session's specifics
+ * @cur:       index of the current buffer
+ * @nr_pages:  max number of pages granted to us
+ * @offset:    offset within the current buffer
+ * @data_size: how much we collected in this run
+ * @lost:      other than zero if we had a HW buffer wrap around
+ * @snapshot:  is this run in snapshot mode
+ * @data_pages:        a handle to the ring buffer
+ */
+struct cs_buffers {
+       unsigned int            cur;
+       unsigned int            nr_pages;
+       unsigned long           offset;
+       local_t                 data_size;
+       local_t                 lost;
+       bool                    snapshot;
+       void                    **data_pages;
+};
+
  static inline void CS_LOCK(void __iomem *addr)
  {
         do {
@@ -52,6 +91,13 @@ static inline void CS_UNLOCK(void __iomem *addr)
         } while (0);
  }
  
+void coresight_disable_path(struct list_head *path);
+int coresight_enable_path(struct list_head *path, u32 mode);
+struct coresight_device *coresight_get_sink(struct list_head *path);
+struct list_head *coresight_build_path(struct coresight_device *csdev,
+                                      const char *sink);
+void coresight_release_path(struct list_head *path);
+
  #ifdef CONFIG_CORESIGHT_SOURCE_ETM3X
  extern int etm_readl_cp14(u32 off, unsigned int *val);
  extern int etm_writel_cp14(u32 off, u32 val);
diff --git a/drivers/hwtracing/coresight/coresight-replicator-qcom.c b/drivers/hwtracing/coresight/coresight-replicator-qcom.c

index 584059e9e8660f228f785cb87b9200e3b315d675..700f710e4bfa6cc4449d04b2842eaf377f3b9368 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-replicator-qcom.c
+++ b/drivers/hwtracing/coresight/coresight-replicator-qcom.c
@@ -15,7 +15,6 @@
  #include <linux/clk.h>
  #include <linux/coresight.h>
  #include <linux/device.h>
-#include <linux/module.h>
  #include <linux/err.h>
  #include <linux/init.h>
  #include <linux/io.h>
@@ -48,8 +47,6 @@ static int replicator_enable(struct coresight_device *csdev, int inport,
  {
         struct replicator_state *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_get_sync(drvdata->dev);
-
         CS_UNLOCK(drvdata->base);
  
         /*
@@ -86,8 +83,6 @@ static void replicator_disable(struct coresight_device *csdev, int inport,
  
         CS_LOCK(drvdata->base);
  
-       pm_runtime_put(drvdata->dev);
-
         dev_info(drvdata->dev, "REPLICATOR disabled\n");
  }
  
@@ -156,15 +151,6 @@ static int replicator_probe(struct amba_device *adev, const struct amba_id *id)
         return 0;
  }
  
-static int replicator_remove(struct amba_device *adev)
-{
-       struct replicator_state *drvdata = amba_get_drvdata(adev);
-
-       pm_runtime_disable(&adev->dev);
-       coresight_unregister(drvdata->csdev);
-       return 0;
-}
-
  #ifdef CONFIG_PM
  static int replicator_runtime_suspend(struct device *dev)
  {
@@ -206,10 +192,9 @@ static struct amba_driver replicator_driver = {
         .drv = {
                 .name   = "coresight-replicator-qcom",
                 .pm     = &replicator_dev_pm_ops,
+               .suppress_bind_attrs = true,
         },
         .probe          = replicator_probe,
-       .remove         = replicator_remove,
         .id_table       = replicator_ids,
  };
-
-module_amba_driver(replicator_driver);
+builtin_amba_driver(replicator_driver);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c

index 963ac197c2535caf202960af34490e6abd02d4cb..c6982e312e156892eda2126ad061d23a73522939 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Replicator driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -11,7 +13,6 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/device.h>
  #include <linux/platform_device.h>
  #include <linux/io.h>
@@ -41,7 +42,6 @@ static int replicator_enable(struct coresight_device *csdev, int inport,
  {
         struct replicator_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_get_sync(drvdata->dev);
         dev_info(drvdata->dev, "REPLICATOR enabled\n");
         return 0;
  }
@@ -51,7 +51,6 @@ static void replicator_disable(struct coresight_device *csdev, int inport,
  {
         struct replicator_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_put(drvdata->dev);
         dev_info(drvdata->dev, "REPLICATOR disabled\n");
  }
  
@@ -115,7 +114,6 @@ static int replicator_probe(struct platform_device *pdev)
  
         pm_runtime_put(&pdev->dev);
  
-       dev_info(dev, "REPLICATOR initialized\n");
         return 0;
  
  out_disable_pm:
@@ -127,20 +125,6 @@ out_disable_pm:
         return ret;
  }
  
-static int replicator_remove(struct platform_device *pdev)
-{
-       struct replicator_drvdata *drvdata = platform_get_drvdata(pdev);
-
-       coresight_unregister(drvdata->csdev);
-       pm_runtime_get_sync(&pdev->dev);
-       if (!IS_ERR(drvdata->atclk))
-               clk_disable_unprepare(drvdata->atclk);
-       pm_runtime_put_noidle(&pdev->dev);
-       pm_runtime_disable(&pdev->dev);
-
-       return 0;
-}
-
  #ifdef CONFIG_PM
  static int replicator_runtime_suspend(struct device *dev)
  {
@@ -175,15 +159,11 @@ static const struct of_device_id replicator_match[] = {
  
  static struct platform_driver replicator_driver = {
         .probe          = replicator_probe,
-       .remove         = replicator_remove,
         .driver         = {
                 .name   = "coresight-replicator",
                 .of_match_table = replicator_match,
                 .pm     = &replicator_dev_pm_ops,
+               .suppress_bind_attrs = true,
         },
  };
-
  builtin_platform_driver(replicator_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Replicator driver");
diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c

new file mode 100644 (file)

index 0000000..73be58a
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-stm.c
@@ -0,0 +1,920 @@
+/* Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight System Trace Macrocell driver
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Initial implementation by Pratik Patel
+ * (C) 2014-2015 Pratik Patel <pratikp@codeaurora.org>
+ *
+ * Serious refactoring, code cleanup and upgrading to the Coresight upstream
+ * framework by Mathieu Poirier
+ * (C) 2015-2016 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * Guaranteed timing and support for various packet type coming from the
+ * generic STM API by Chunyan Zhang
+ * (C) 2015-2016 Chunyan Zhang <zhang.chunyan@linaro.org>
+ */
+#include <asm/local.h>
+#include <linux/amba/bus.h>
+#include <linux/bitmap.h>
+#include <linux/clk.h>
+#include <linux/coresight.h>
+#include <linux/coresight-stm.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/of_address.h>
+#include <linux/perf_event.h>
+#include <linux/pm_runtime.h>
+#include <linux/stm.h>
+
+#include "coresight-priv.h"
+
+#define STMDMASTARTR                   0xc04
+#define STMDMASTOPR                    0xc08
+#define STMDMASTATR                    0xc0c
+#define STMDMACTLR                     0xc10
+#define STMDMAIDR                      0xcfc
+#define STMHEER                                0xd00
+#define STMHETER                       0xd20
+#define STMHEBSR                       0xd60
+#define STMHEMCR                       0xd64
+#define STMHEMASTR                     0xdf4
+#define STMHEFEAT1R                    0xdf8
+#define STMHEIDR                       0xdfc
+#define STMSPER                                0xe00
+#define STMSPTER                       0xe20
+#define STMPRIVMASKR                   0xe40
+#define STMSPSCR                       0xe60
+#define STMSPMSCR                      0xe64
+#define STMSPOVERRIDER                 0xe68
+#define STMSPMOVERRIDER                        0xe6c
+#define STMSPTRIGCSR                   0xe70
+#define STMTCSR                                0xe80
+#define STMTSSTIMR                     0xe84
+#define STMTSFREQR                     0xe8c
+#define STMSYNCR                       0xe90
+#define STMAUXCR                       0xe94
+#define STMSPFEAT1R                    0xea0
+#define STMSPFEAT2R                    0xea4
+#define STMSPFEAT3R                    0xea8
+#define STMITTRIGGER                   0xee8
+#define STMITATBDATA0                  0xeec
+#define STMITATBCTR2                   0xef0
+#define STMITATBID                     0xef4
+#define STMITATBCTR0                   0xef8
+
+#define STM_32_CHANNEL                 32
+#define BYTES_PER_CHANNEL              256
+#define STM_TRACE_BUF_SIZE             4096
+#define STM_SW_MASTER_END              127
+
+/* Register bit definition */
+#define STMTCSR_BUSY_BIT               23
+/* Reserve the first 10 channels for kernel usage */
+#define STM_CHANNEL_OFFSET             0
+
+enum stm_pkt_type {
+       STM_PKT_TYPE_DATA       = 0x98,
+       STM_PKT_TYPE_FLAG       = 0xE8,
+       STM_PKT_TYPE_TRIG       = 0xF8,
+};
+
+/*
+ * stm_channel_addr() - base address of stimulus channel @ch.
+ * stm_channel_off()  - offset of packet @type inside a channel with the
+ *                      bits in @opts cleared.
+ *
+ * Every macro argument is parenthesised so that expression arguments
+ * (e.g. "a + b" or "x | y") expand with the intended precedence.
+ */
+#define stm_channel_addr(drvdata, ch)  ((drvdata)->chs.base +  \
+                                       ((ch) * BYTES_PER_CHANNEL))
+#define stm_channel_off(type, opts)    ((type) & ~(opts))
+
+static int boot_nr_channel;
+
+/*
+ * Not really modular but using module_param is the easiest way to
+ * remain consistent with existing use cases for now.
+ */
+module_param_named(
+       boot_nr_channel, boot_nr_channel, int, S_IRUGO
+);
+
+/**
+ * struct channel_space - central management entity for extended ports
+ * @base:              memory mapped base address where channels start.
+ * @guaranteed:        bitmap, one bit per channel, set when delivery on
+ *                     that channel is guaranteed.
+ */
+struct channel_space {
+       void __iomem            *base;
+       unsigned long           *guaranteed;
+};
+
+/**
+ * struct stm_drvdata - specifics associated to an STM component
+ * @base:              memory mapped base address for this component.
+ * @dev:               the device entity associated to this component.
+ * @atclk:             optional clock for the core parts of the STM.
+ * @csdev:             component vitals needed by the framework.
+ * @spinlock:          only one at a time pls.
+ * @chs:               the channels associated to this STM.
+ * @stm:               structure associated to the generic STM interface.
+ * @mode:              this tracer's mode, i.e sysFS, or disabled.
+ * @traceid:           value of the current ID for this component.
+ * @write_bytes:       Maximum bytes this STM can write at a time.
+ * @stmsper:           settings for register STMSPER.
+ * @stmspscr:          settings for register STMSPSCR.
+ * @numsp:             the total number of stimulus ports supported by
+ *                     this STM.
+ * @stmheer:           settings for register STMHEER.
+ * @stmheter:          settings for register STMHETER.
+ * @stmhebsr:          settings for register STMHEBSR.
+ */
+struct stm_drvdata {
+       void __iomem            *base;
+       struct device           *dev;
+       struct clk              *atclk;
+       struct coresight_device *csdev;
+       spinlock_t              spinlock;
+       struct channel_space    chs;
+       struct stm_data         stm;
+       local_t                 mode;
+       u8                      traceid;
+       u32                     write_bytes;
+       u32                     stmsper;
+       u32                     stmspscr;
+       u32                     numsp;
+       u32                     stmheer;
+       u32                     stmheter;
+       u32                     stmhebsr;
+};
+
+/*
+ * Program the hardware event registers (STMHEBSR, STMHETER, STMHEER) from
+ * the values cached in @drvdata, then switch hardware event tracing on via
+ * STMHEMCR.  All accesses are bracketed by CS_UNLOCK()/CS_LOCK().
+ */
+static void stm_hwevent_enable_hw(struct stm_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       writel_relaxed(drvdata->stmhebsr, drvdata->base + STMHEBSR);
+       writel_relaxed(drvdata->stmheter, drvdata->base + STMHETER);
+       writel_relaxed(drvdata->stmheer, drvdata->base + STMHEER);
+       writel_relaxed(0x01 |   /* Enable HW event tracing */
+                      0x04,    /* Error detection on event tracing */
+                      drvdata->base + STMHEMCR);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Program the stimulus port registers: trigger control (STMSPTRIGCSR),
+ * then port selection (STMSPSCR) and port enable (STMSPER) from the values
+ * cached in @drvdata.
+ */
+static void stm_port_enable_hw(struct stm_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+       /* ATB trigger enable on direct writes to TRIG locations */
+       writel_relaxed(0x10,
+                      drvdata->base + STMSPTRIGCSR);
+       writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR);
+       writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Bring the STM up: hardware events first (only when STMHEER has been
+ * configured, i.e. drvdata->stmheer is non-zero), then the stimulus ports,
+ * and finally the synchronisation period and the global enable in STMTCSR.
+ */
+static void stm_enable_hw(struct stm_drvdata *drvdata)
+{
+       if (drvdata->stmheer)
+               stm_hwevent_enable_hw(drvdata);
+
+       stm_port_enable_hw(drvdata);
+
+       CS_UNLOCK(drvdata->base);
+
+       /* 4096 byte between synchronisation packets */
+       writel_relaxed(0xFFF, drvdata->base + STMSYNCR);
+       writel_relaxed((drvdata->traceid << 16 | /* trace id */
+                       0x02 |                   /* timestamp enable */
+                       0x01),                   /* global STM enable */
+                       drvdata->base + STMTCSR);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * stm_enable - coresight source "enable" operation
+ * @csdev:     the coresight device backing this STM.
+ * @attr:      perf event attributes, not used by this driver.
+ * @mode:      requested mode; only CS_MODE_SYSFS is accepted.
+ *
+ * Returns 0 on success, -EINVAL for any mode other than CS_MODE_SYSFS and
+ * -EBUSY when the tracer has already been claimed.
+ */
+static int stm_enable(struct coresight_device *csdev,
+                     struct perf_event_attr *attr, u32 mode)
+{
+       u32 val;
+       struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       if (mode != CS_MODE_SYSFS)
+               return -EINVAL;
+
+       /* Claim the tracer: only succeeds if it is currently disabled */
+       val = local_cmpxchg(&drvdata->mode, CS_MODE_DISABLED, mode);
+
+       /* Someone is already using the tracer */
+       if (val)
+               return -EBUSY;
+
+       /* Balanced by the pm_runtime_put() in stm_disable() */
+       pm_runtime_get_sync(drvdata->dev);
+
+       spin_lock(&drvdata->spinlock);
+       stm_enable_hw(drvdata);
+       spin_unlock(&drvdata->spinlock);
+
+       dev_info(drvdata->dev, "STM tracing enabled\n");
+       return 0;
+}
+
+/* Switch hardware event tracing off by clearing STMHEMCR, STMHEER, STMHETER */
+static void stm_hwevent_disable_hw(struct stm_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       writel_relaxed(0x0, drvdata->base + STMHEMCR);
+       writel_relaxed(0x0, drvdata->base + STMHEER);
+       writel_relaxed(0x0, drvdata->base + STMHETER);
+
+       CS_LOCK(drvdata->base);
+}
+
+/* Disable all stimulus ports (STMSPER) and clear the trigger control */
+static void stm_port_disable_hw(struct stm_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       writel_relaxed(0x0, drvdata->base + STMSPER);
+       writel_relaxed(0x0, drvdata->base + STMSPTRIGCSR);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Tear the STM down in the reverse order of stm_enable_hw(): clear the
+ * global enable bit in STMTCSR first, then disable the stimulus ports and,
+ * when hardware events were configured, hardware event tracing.
+ */
+static void stm_disable_hw(struct stm_drvdata *drvdata)
+{
+       u32 val;
+
+       CS_UNLOCK(drvdata->base);
+
+       /* Read-modify-write so the other STMTCSR fields are preserved */
+       val = readl_relaxed(drvdata->base + STMTCSR);
+       val &= ~0x1; /* clear global STM enable [0] */
+       writel_relaxed(val, drvdata->base + STMTCSR);
+
+       CS_LOCK(drvdata->base);
+
+       stm_port_disable_hw(drvdata);
+       if (drvdata->stmheer)
+               stm_hwevent_disable_hw(drvdata);
+}
+
+/*
+ * stm_disable - coresight source "disable" operation
+ * @csdev:     the coresight device backing this STM.
+ *
+ * Does nothing unless the tracer is currently in CS_MODE_SYSFS.  Otherwise
+ * the hardware is stopped, the runtime PM reference taken in stm_enable()
+ * is dropped and the mode goes back to CS_MODE_DISABLED.
+ */
+static void stm_disable(struct coresight_device *csdev)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       /*
+        * For as long as the tracer isn't disabled another entity can't
+        * change its status.  As such we can read the status here without
+        * fearing it will change under us.
+        */
+       if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
+               spin_lock(&drvdata->spinlock);
+               stm_disable_hw(drvdata);
+               spin_unlock(&drvdata->spinlock);
+
+               /* Wait until the engine has completely stopped */
+               coresight_timeout(drvdata, STMTCSR, STMTCSR_BUSY_BIT, 0);
+
+               pm_runtime_put(drvdata->dev);
+
+               /* Release the tracer last so nobody can claim it mid-teardown */
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+               dev_info(drvdata->dev, "STM tracing disabled\n");
+       }
+}
+
+/* coresight source "trace_id" operation: report the configured trace ID */
+static int stm_trace_id(struct coresight_device *csdev)
+{
+       const struct stm_drvdata *stm = dev_get_drvdata(csdev->dev.parent);
+
+       return stm->traceid;
+}
+
+/* Source operations handed to the coresight framework for this STM */
+static const struct coresight_ops_source stm_source_ops = {
+       .trace_id       = stm_trace_id,
+       .enable         = stm_enable,
+       .disable        = stm_disable,
+};
+
+/* The STM is a trace source only, so only the source ops are populated */
+static const struct coresight_ops stm_cs_ops = {
+       .source_ops     = &stm_source_ops,
+};
+
+/* True when @addr is not aligned on a @write_bytes boundary */
+static inline bool stm_addr_unaligned(const void *addr, u8 write_bytes)
+{
+       unsigned long low_bits = write_bytes - 1;
+
+       return ((unsigned long)addr & low_bits) != 0;
+}
+
+static void stm_send(void *addr, const void *data, u32 size, u8 write_bytes)
+{
+       u8 paload[8];
+
+       if (stm_addr_unaligned(data, write_bytes)) {
+               memcpy(paload, data, size);
+               data = paload;
+       }
+
+       /* now we are 64bit/32bit aligned */
+       switch (size) {
+#ifdef CONFIG_64BIT
+       case 8:
+               writeq_relaxed(*(u64 *)data, addr);
+               break;
+#endif
+       case 4:
+               writel_relaxed(*(u32 *)data, addr);
+               break;
+       case 2:
+               writew_relaxed(*(u16 *)data, addr);
+               break;
+       case 1:
+               writeb_relaxed(*(u8 *)data, addr);
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Generic STM "link" callback: enable the STM through the coresight
+ * framework.  @master and @channel are not used.  Returns -EINVAL when the
+ * coresight device has not been registered yet.
+ */
+static int stm_generic_link(struct stm_data *stm_data,
+                           unsigned int master,  unsigned int channel)
+{
+       struct stm_drvdata *drvdata;
+
+       drvdata = container_of(stm_data, struct stm_drvdata, stm);
+       if (drvdata && drvdata->csdev)
+               return coresight_enable(drvdata->csdev);
+
+       return -EINVAL;
+}
+
+/*
+ * Generic STM "unlink" callback: undo stm_generic_link().  @master and
+ * @channel are not used.
+ *
+ * The link side enables the STM with coresight_enable(), so the unlink
+ * side has to go through coresight_disable() as well.  Calling
+ * stm_disable() directly would switch the hardware off behind the
+ * framework's back, leaving its view of the source out of sync.
+ */
+static void stm_generic_unlink(struct stm_data *stm_data,
+                              unsigned int master,  unsigned int channel)
+{
+       struct stm_drvdata *drvdata = container_of(stm_data,
+                                                  struct stm_drvdata, stm);
+       if (!drvdata || !drvdata->csdev)
+               return;
+
+       coresight_disable(drvdata->csdev);
+}
+
+/*
+ * Generic STM "set_options" callback: select guaranteed or invariant
+ * timing for @channel.  Returns 0 on success and -EINVAL when the STM is
+ * not enabled, @channel is out of range or @options is not recognised.
+ * @master and @nr_chans are not used.
+ */
+static long stm_generic_set_options(struct stm_data *stm_data,
+                                   unsigned int master,
+                                   unsigned int channel,
+                                   unsigned int nr_chans,
+                                   unsigned long options)
+{
+       struct stm_drvdata *drvdata;
+
+       drvdata = container_of(stm_data, struct stm_drvdata, stm);
+       if (!drvdata || !local_read(&drvdata->mode))
+               return -EINVAL;
+
+       if (channel >= drvdata->numsp)
+               return -EINVAL;
+
+       if (options == STM_OPTION_GUARANTEED)
+               set_bit(channel, drvdata->chs.guaranteed);
+       else if (options == STM_OPTION_INVARIANT)
+               clear_bit(channel, drvdata->chs.guaranteed);
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * stm_generic_packet - generic STM "packet" callback
+ * @stm_data:  generic STM data embedded in our stm_drvdata.
+ * @master:    not used, the hardware assigns master IDs.
+ * @channel:   channel to write to, must be below drvdata->numsp.
+ * @packet:    STP_PACKET_FLAG or STP_PACKET_DATA.
+ * @flags:     STP_PACKET_TIMESTAMPED to request a timestamped packet.
+ * @size:      number of payload bytes offered by the caller.
+ * @payload:   data to send.
+ *
+ * Returns the number of bytes consumed, 0 when the STM is not enabled or
+ * @channel is out of range, and -ENOTSUPP for any other packet type.
+ */
+static ssize_t stm_generic_packet(struct stm_data *stm_data,
+                                 unsigned int master,
+                                 unsigned int channel,
+                                 unsigned int packet,
+                                 unsigned int flags,
+                                 unsigned int size,
+                                 const unsigned char *payload)
+{
+       unsigned long ch_addr;
+       struct stm_drvdata *drvdata = container_of(stm_data,
+                                                  struct stm_drvdata, stm);
+
+       if (!(drvdata && local_read(&drvdata->mode)))
+               return 0;
+
+       if (channel >= drvdata->numsp)
+               return 0;
+
+       ch_addr = (unsigned long)stm_channel_addr(drvdata, channel);
+
+       flags = (flags == STP_PACKET_TIMESTAMPED) ? STM_FLAG_TIMESTAMPED : 0;
+       flags |= test_bit(channel, drvdata->chs.guaranteed) ?
+                          STM_FLAG_GUARANTEED : 0;
+
+       /*
+        * Clamp to what the STM can write in one go, otherwise round down
+        * to a size stm_send() knows how to write.  A size of '0' (as used
+        * for flag packets) is left alone: rounddown_pow_of_two(0) is
+        * undefined.
+        */
+       if (size > drvdata->write_bytes)
+               size = drvdata->write_bytes;
+       else if (size)
+               size = rounddown_pow_of_two(size);
+
+       switch (packet) {
+       case STP_PACKET_FLAG:
+               ch_addr |= stm_channel_off(STM_PKT_TYPE_FLAG, flags);
+
+               /*
+                * The generic STM core sets a size of '0' on flag packets.
+                * As such send a flag packet of size '1' and tell the
+                * core we did so.
+                */
+               stm_send((void *)ch_addr, payload, 1, drvdata->write_bytes);
+               size = 1;
+               break;
+
+       case STP_PACKET_DATA:
+               ch_addr |= stm_channel_off(STM_PKT_TYPE_DATA, flags);
+               stm_send((void *)ch_addr, payload, size,
+                               drvdata->write_bytes);
+               break;
+
+       default:
+               return -ENOTSUPP;
+       }
+
+       return size;
+}
+
+/* sysfs: show the cached STMHEER (hardware event enable) setting in hex */
+static ssize_t hwevent_enable_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n",
+                        (unsigned long)drvdata->stmheer);
+}
+
+/*
+ * sysfs: parse a hex value and cache it as both the STMHEER and STMHETER
+ * setting.  Returns @size on success, -EINVAL when @buf does not parse.
+ */
+static ssize_t hwevent_enable_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long events;
+
+       if (kstrtoul(buf, 16, &events))
+               return -EINVAL;
+
+       /* HW event enable and trigger go hand in hand */
+       drvdata->stmheter = events;
+       drvdata->stmheer = events;
+
+       return size;
+}
+static DEVICE_ATTR_RW(hwevent_enable);
+
+/* sysfs: show the cached STMHEBSR (hardware event bank select) setting */
+static ssize_t hwevent_select_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n",
+                        (unsigned long)drvdata->stmhebsr);
+}
+
+/*
+ * sysfs: parse a hex value and cache it as the STMHEBSR setting.  Returns
+ * @size on success, -EINVAL when @buf does not parse.
+ */
+static ssize_t hwevent_select_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long bank;
+
+       if (kstrtoul(buf, 16, &bank))
+               return -EINVAL;
+
+       drvdata->stmhebsr = bank;
+
+       return size;
+}
+static DEVICE_ATTR_RW(hwevent_select);
+
+/*
+ * sysfs: show the port selection setting.  While the STM is enabled the
+ * live STMSPSCR register is read under the spinlock, otherwise the cached
+ * value is reported.
+ */
+static ssize_t port_select_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long spscr;
+
+       if (local_read(&drvdata->mode)) {
+               spin_lock(&drvdata->spinlock);
+               spscr = readl_relaxed(drvdata->base + STMSPSCR);
+               spin_unlock(&drvdata->spinlock);
+       } else {
+               spscr = drvdata->stmspscr;
+       }
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", spscr);
+}
+
+/*
+ * sysfs: parse a hex value and use it as the new port selection setting.
+ * The value is always cached; when the STM is enabled it is also written
+ * to STMSPSCR.  Returns @size on success or the kstrtoul() error code.
+ */
+static ssize_t port_select_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long val, stmsper;
+       int ret = 0;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret)
+               return ret;
+
+       spin_lock(&drvdata->spinlock);
+       drvdata->stmspscr = val;
+
+       if (local_read(&drvdata->mode)) {
+               CS_UNLOCK(drvdata->base);
+               /* Process as per ARM's TRM recommendation */
+               /* Save STMSPER and disable the ports while STMSPSCR changes */
+               stmsper = readl_relaxed(drvdata->base + STMSPER);
+               writel_relaxed(0x0, drvdata->base + STMSPER);
+               writel_relaxed(drvdata->stmspscr, drvdata->base + STMSPSCR);
+               /* Restore the previous port enable setting */
+               writel_relaxed(stmsper, drvdata->base + STMSPER);
+               CS_LOCK(drvdata->base);
+       }
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(port_select);
+
+/*
+ * sysfs: show the port enable setting.  While the STM is enabled the live
+ * STMSPER register is read under the spinlock, otherwise the cached value
+ * is reported.
+ */
+static ssize_t port_enable_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long sper;
+
+       if (local_read(&drvdata->mode)) {
+               spin_lock(&drvdata->spinlock);
+               sper = readl_relaxed(drvdata->base + STMSPER);
+               spin_unlock(&drvdata->spinlock);
+       } else {
+               sper = drvdata->stmsper;
+       }
+
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", sper);
+}
+
+/*
+ * sysfs: parse a hex value and use it as the new port enable setting.
+ * The value is always cached; when the STM is enabled it is also written
+ * to STMSPER.  Returns @size on success or the kstrtoul() error code.
+ */
+static ssize_t port_enable_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t size)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long val;
+       int ret = 0;
+
+       ret = kstrtoul(buf, 16, &val);
+       if (ret)
+               return ret;
+
+       spin_lock(&drvdata->spinlock);
+       drvdata->stmsper = val;
+
+       /* Only touch the hardware while the STM is enabled */
+       if (local_read(&drvdata->mode)) {
+               CS_UNLOCK(drvdata->base);
+               writel_relaxed(drvdata->stmsper, drvdata->base + STMSPER);
+               CS_LOCK(drvdata->base);
+       }
+       spin_unlock(&drvdata->spinlock);
+
+       return size;
+}
+static DEVICE_ATTR_RW(port_enable);
+
+/*
+ * sysfs: show the configured trace ID in hex.  Uses the bounded
+ * scnprintf(), like every other show routine in this file.
+ */
+static ssize_t traceid_show(struct device *dev,
+                           struct device_attribute *attr, char *buf)
+{
+       unsigned long val;
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+
+       val = drvdata->traceid;
+       return scnprintf(buf, PAGE_SIZE, "%#lx\n", val);
+}
+
+/*
+ * sysfs: parse a hex value and cache its low 7 bits as the trace ID.
+ * Returns @size on success or the kstrtoul() error code.
+ */
+static ssize_t traceid_store(struct device *dev,
+                            struct device_attribute *attr,
+                            const char *buf, size_t size)
+{
+       struct stm_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       unsigned long id;
+       int err;
+
+       err = kstrtoul(buf, 16, &id);
+       if (err)
+               return err;
+
+       /* traceid field is 7bit wide on STM32 */
+       drvdata->traceid = id & 0x7f;
+
+       return size;
+}
+static DEVICE_ATTR_RW(traceid);
+
+/*
+ * Instantiate coresight_simple_func() for struct stm_drvdata.  Each use
+ * below provides the dev_attr_<name> object listed in
+ * coresight_stm_mgmt_attrs[] for the register at @offset.
+ * NOTE(review): coresight_simple_func() lives in coresight-priv.h;
+ * presumably it generates a read-only sysfs show routine - confirm there.
+ */
+#define coresight_stm_simple_func(name, offset)        \
+       coresight_simple_func(struct stm_drvdata, name, offset)
+
+coresight_stm_simple_func(tcsr, STMTCSR);
+coresight_stm_simple_func(tsfreqr, STMTSFREQR);
+coresight_stm_simple_func(syncr, STMSYNCR);
+coresight_stm_simple_func(sper, STMSPER);
+coresight_stm_simple_func(spter, STMSPTER);
+coresight_stm_simple_func(privmaskr, STMPRIVMASKR);
+coresight_stm_simple_func(spscr, STMSPSCR);
+coresight_stm_simple_func(spmscr, STMSPMSCR);
+coresight_stm_simple_func(spfeat1r, STMSPFEAT1R);
+coresight_stm_simple_func(spfeat2r, STMSPFEAT2R);
+coresight_stm_simple_func(spfeat3r, STMSPFEAT3R);
+coresight_stm_simple_func(devid, CORESIGHT_DEVID);
+
+/* Configuration attributes, exposed directly in the device's sysfs directory */
+static struct attribute *coresight_stm_attrs[] = {
+       &dev_attr_hwevent_enable.attr,
+       &dev_attr_hwevent_select.attr,
+       &dev_attr_port_enable.attr,
+       &dev_attr_port_select.attr,
+       &dev_attr_traceid.attr,
+       NULL,
+};
+
+/* Register attributes, exposed under the "mgmt" sub-directory */
+static struct attribute *coresight_stm_mgmt_attrs[] = {
+       &dev_attr_tcsr.attr,
+       &dev_attr_tsfreqr.attr,
+       &dev_attr_syncr.attr,
+       &dev_attr_sper.attr,
+       &dev_attr_spter.attr,
+       &dev_attr_privmaskr.attr,
+       &dev_attr_spscr.attr,
+       &dev_attr_spmscr.attr,
+       &dev_attr_spfeat1r.attr,
+       &dev_attr_spfeat2r.attr,
+       &dev_attr_spfeat3r.attr,
+       &dev_attr_devid.attr,
+       NULL,
+};
+
+static const struct attribute_group coresight_stm_group = {
+       .attrs = coresight_stm_attrs,
+};
+
+static const struct attribute_group coresight_stm_mgmt_group = {
+       .attrs = coresight_stm_mgmt_attrs,
+       .name = "mgmt",
+};
+
+/* NULL terminated list of groups passed to the framework via desc->groups */
+static const struct attribute_group *coresight_stm_groups[] = {
+       &coresight_stm_group,
+       &coresight_stm_mgmt_group,
+       NULL,
+};
+
+/*
+ * Look @ch_base up in the "reg-names" property of @np and translate the
+ * "reg" entry at the same index into @res.  Returns -EINVAL when no entry
+ * carries that name, otherwise the result of of_address_to_resource().
+ */
+static int stm_get_resource_byname(struct device_node *np,
+                                  char *ch_base, struct resource *res)
+{
+       const char *name = NULL;
+       int index;
+
+       /* Walk "reg-names" until the name matches or the list runs out */
+       for (index = 0;
+            !of_property_read_string_index(np, "reg-names", index, &name);
+            index++) {
+               if (!strcmp(ch_base, name))
+                       return of_address_to_resource(np, index, res);
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Return the fundamental data size of the STM in bytes: 8 when the kernel
+ * is 64-bit and STMSPFEAT2R advertises 64-bit data, 4 otherwise.
+ */
+static u32 stm_fundamental_data_size(struct stm_drvdata *drvdata)
+{
+       u32 feat2;
+
+       /* Without CONFIG_64BIT the register is not even consulted */
+       if (IS_ENABLED(CONFIG_64BIT)) {
+               feat2 = readl_relaxed(drvdata->base + STMSPFEAT2R);
+
+               /* bit[15:12]: 0 - 32-bit data, 1 - 64-bit data */
+               if (BMVAL(feat2, 12, 15))
+                       return 8;
+       }
+
+       return 4;
+}
+
+/* Return the number of stimulus ports advertised by the STMDEVID register */
+static u32 stm_num_stimulus_port(struct stm_drvdata *drvdata)
+{
+       u32 devid = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
+       /* NUMSP is the low 17 bits of STMDEVID */
+       u32 nr_ports = devid & 0x1ffff;
+
+       /* A NUMSP of 0x0 means 32 stimulus ports are supported */
+       return nr_ports ? nr_ports : STM_32_CHANNEL;
+}
+
+/* Set the driver's default configuration; drvdata->numsp must be valid */
+static void stm_init_default_data(struct stm_drvdata *drvdata)
+{
+       /*
+        * ETM tracers take their trace IDs from CPU_ID * 2 + 0x10 upwards,
+        * and 0x00 as well as anything from 0x70 up is reserved.  The STM
+        * trace ID therefore has to be higher than 0x00 and lower than
+        * 0x10.
+        */
+       drvdata->traceid = 0x1;
+
+       /* Every channel starts out with invariant transaction timing */
+       bitmap_clear(drvdata->chs.guaranteed, 0, drvdata->numsp);
+
+       /*
+        * Port selection is not used (STMSPSCR = 0x0).  In that case
+        * STMSPER applies to every group of 32 channels, hence all 32
+        * bits are set to enable all channels regardless of their number.
+        */
+       drvdata->stmspscr = 0x0;
+       drvdata->stmsper = ~0x0;
+}
+
+/* Fill in the stm_data handed to the generic STM core at registration */
+static void stm_init_generic_data(struct stm_drvdata *drvdata)
+{
+       struct stm_data *stm = &drvdata->stm;
+
+       stm->name = dev_name(drvdata->dev);
+
+       /*
+        * MasterIDs are assigned at HW design phase. As such the core is
+        * using a single master for interaction with this device.
+        */
+       stm->sw_start = 1;
+       stm->sw_end = 1;
+       stm->hw_override = true;
+       stm->sw_nchannels = drvdata->numsp;
+
+       /* Callbacks invoked by the generic STM core */
+       stm->packet = stm_generic_packet;
+       stm->link = stm_generic_link;
+       stm->unlink = stm_generic_unlink;
+       stm->set_options = stm_generic_set_options;
+}
+
+/*
+ * stm_probe - bind the driver to an STM AMBA device
+ * @adev:      the AMBA device being probed.
+ * @id:        the matching stm_ids[] entry; @id->data is the name printed
+ *             once initialisation is complete.
+ *
+ * Maps the register and stimulus port ("stm-stimulus-base") regions, sizes
+ * the channel space, then registers with the generic STM core and the
+ * coresight framework.
+ *
+ * Returns 0 on success or a negative error code.  Once "atclk" has been
+ * enabled every failure path disables it again, otherwise each failed (or
+ * deferred) probe would leave the clock with an extra enable reference.
+ */
+static int stm_probe(struct amba_device *adev, const struct amba_id *id)
+{
+       int ret;
+       void __iomem *base;
+       unsigned long *guaranteed;
+       struct device *dev = &adev->dev;
+       struct coresight_platform_data *pdata = NULL;
+       struct stm_drvdata *drvdata;
+       struct resource *res = &adev->res;
+       struct resource ch_res;
+       size_t bitmap_size;
+       struct coresight_desc *desc;
+       struct device_node *np = adev->dev.of_node;
+
+       if (np) {
+               pdata = of_get_coresight_platform_data(dev, np);
+               if (IS_ERR(pdata))
+                       return PTR_ERR(pdata);
+               adev->dev.platform_data = pdata;
+       }
+       drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL);
+       if (!drvdata)
+               return -ENOMEM;
+
+       drvdata->dev = &adev->dev;
+       drvdata->atclk = devm_clk_get(&adev->dev, "atclk"); /* optional */
+       if (!IS_ERR(drvdata->atclk)) {
+               ret = clk_prepare_enable(drvdata->atclk);
+               if (ret)
+                       return ret;
+       }
+       dev_set_drvdata(dev, drvdata);
+
+       base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(base)) {
+               ret = PTR_ERR(base);
+               goto err_disable_atclk;
+       }
+       drvdata->base = base;
+
+       ret = stm_get_resource_byname(np, "stm-stimulus-base", &ch_res);
+       if (ret)
+               goto err_disable_atclk;
+
+       base = devm_ioremap_resource(dev, &ch_res);
+       if (IS_ERR(base)) {
+               ret = PTR_ERR(base);
+               goto err_disable_atclk;
+       }
+       drvdata->chs.base = base;
+
+       drvdata->write_bytes = stm_fundamental_data_size(drvdata);
+
+       /* A channel count given at boot overrides what the HW advertises */
+       if (boot_nr_channel)
+               drvdata->numsp = boot_nr_channel;
+       else
+               drvdata->numsp = stm_num_stimulus_port(drvdata);
+       bitmap_size = BITS_TO_LONGS(drvdata->numsp) * sizeof(long);
+
+       guaranteed = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
+       if (!guaranteed) {
+               ret = -ENOMEM;
+               goto err_disable_atclk;
+       }
+       drvdata->chs.guaranteed = guaranteed;
+
+       spin_lock_init(&drvdata->spinlock);
+
+       stm_init_default_data(drvdata);
+       stm_init_generic_data(drvdata);
+
+       if (stm_register_device(dev, &drvdata->stm, THIS_MODULE)) {
+               dev_info(dev,
+                        "stm_register_device failed, probing deffered\n");
+               ret = -EPROBE_DEFER;
+               goto err_disable_atclk;
+       }
+
+       desc = devm_kzalloc(dev, sizeof(*desc), GFP_KERNEL);
+       if (!desc) {
+               ret = -ENOMEM;
+               goto stm_unregister;
+       }
+
+       desc->type = CORESIGHT_DEV_TYPE_SOURCE;
+       desc->subtype.source_subtype = CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE;
+       desc->ops = &stm_cs_ops;
+       desc->pdata = pdata;
+       desc->dev = dev;
+       desc->groups = coresight_stm_groups;
+       drvdata->csdev = coresight_register(desc);
+       if (IS_ERR(drvdata->csdev)) {
+               ret = PTR_ERR(drvdata->csdev);
+               goto stm_unregister;
+       }
+
+       pm_runtime_put(&adev->dev);
+
+       dev_info(dev, "%s initialized\n", (char *)id->data);
+       return 0;
+
+stm_unregister:
+       stm_unregister_device(&drvdata->stm);
+err_disable_atclk:
+       if (!IS_ERR(drvdata->atclk))
+               clk_disable_unprepare(drvdata->atclk);
+       return ret;
+}
+
+#ifdef CONFIG_PM
+/* Runtime PM suspend: gate the optional "atclk" clock.  Always returns 0. */
+static int stm_runtime_suspend(struct device *dev)
+{
+       struct stm_drvdata *stm = dev_get_drvdata(dev);
+
+       /* Nothing to gate without driver data or without the clock */
+       if (!stm || IS_ERR(stm->atclk))
+               return 0;
+
+       clk_disable_unprepare(stm->atclk);
+       return 0;
+}
+
+/* Runtime PM resume: ungate the optional "atclk" clock.  Always returns 0. */
+static int stm_runtime_resume(struct device *dev)
+{
+       struct stm_drvdata *stm = dev_get_drvdata(dev);
+
+       /* Nothing to ungate without driver data or without the clock */
+       if (!stm || IS_ERR(stm->atclk))
+               return 0;
+
+       clk_prepare_enable(stm->atclk);
+       return 0;
+}
+#endif
+
+/* Runtime PM only: the callbacks gate/ungate the optional "atclk" clock */
+static const struct dev_pm_ops stm_dev_pm_ops = {
+       SET_RUNTIME_PM_OPS(stm_runtime_suspend, stm_runtime_resume, NULL)
+};
+
+/*
+ * AMBA peripheral IDs handled by this driver.  The .data string is the
+ * name stm_probe() prints once the device is initialised.
+ */
+static struct amba_id stm_ids[] = {
+       {
+               .id     = 0x0003b962,
+               .mask   = 0x0003ffff,
+               .data   = "STM32",
+       },
+       { 0, 0},
+};
+
+/*
+ * No .remove callback is provided; suppress_bind_attrs is set so the
+ * driver can't be unbound from user space through sysfs.
+ */
+static struct amba_driver stm_driver = {
+       .drv = {
+               .name   = "coresight-stm",
+               .owner  = THIS_MODULE,
+               .pm     = &stm_dev_pm_ops,
+               .suppress_bind_attrs = true,
+       },
+       .probe          = stm_probe,
+       .id_table       = stm_ids,
+};
+
+builtin_amba_driver(stm_driver);
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c

new file mode 100644 (file)

index 0000000..466af86
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright(C) 2016 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/circ_buf.h>
+#include <linux/coresight.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "coresight-priv.h"
+#include "coresight-tmc.h"
+
+/*
+ * Program the TMC for circular-buffer operation and start it.
+ *
+ * The register accesses are bracketed by CS_UNLOCK()/CS_LOCK().  No locking
+ * is taken here; the callers visible in this file invoke it with
+ * drvdata->spinlock held.
+ */
+void tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       /* Wait for TMCSReady bit to be set */
+       tmc_wait_for_tmcready(drvdata);
+
+       /* Mode and formatter/flush control are set before the TMC is enabled */
+       writel_relaxed(TMC_MODE_CIRCULAR_BUFFER, drvdata->base + TMC_MODE);
+       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI |
+                      TMC_FFCR_FON_FLIN | TMC_FFCR_FON_TRIG_EVT |
+                      TMC_FFCR_TRIGON_TRIGIN,
+                      drvdata->base + TMC_FFCR);
+
+       /* Apply the trigger counter value stored in the driver data */
+       writel_relaxed(drvdata->trigger_cntr, drvdata->base + TMC_TRG);
+       tmc_enable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Drain the trace RAM into drvdata->buf.
+ *
+ * TMC_RRD is read one 32-bit word at a time, in groups of
+ * drvdata->memwidth words, and each word is appended to the buffer.  The
+ * function returns as soon as a read yields 0xFFFFFFFF.
+ *
+ * NOTE(review): the copy is not bounded by drvdata->size; it relies on the
+ * 0xFFFFFFFF value showing up before the buffer is exhausted - confirm.
+ */
+static void tmc_etb_dump_hw(struct tmc_drvdata *drvdata)
+{
+       char *bufp;
+       u32 read_data;
+       int i;
+
+       bufp = drvdata->buf;
+       while (1) {
+               for (i = 0; i < drvdata->memwidth; i++) {
+                       read_data = readl_relaxed(drvdata->base + TMC_RRD);
+                       /* 0xFFFFFFFF ends the dump; it is not stored */
+                       if (read_data == 0xFFFFFFFF)
+                               return;
+                       memcpy(bufp, &read_data, 4);
+                       bufp += 4;
+               }
+       }
+}
+
+/*
+ * Flush and stop the TMC, dump its content when operated from sysFS, then
+ * disable it.
+ *
+ * The dump is keyed on the current value of drvdata->mode, so a caller that
+ * wants the trace data saved must call this while the mode still reads
+ * CS_MODE_SYSFS.
+ */
+static void tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       tmc_flush_and_stop(drvdata);
+       /*
+        * When operating in sysFS mode the content of the buffer needs to be
+        * read before the TMC is disabled.
+        */
+       if (local_read(&drvdata->mode) == CS_MODE_SYSFS)
+               tmc_etb_dump_hw(drvdata);
+       tmc_disable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Program the TMC for hardware-FIFO operation (used when the ETF acts as a
+ * link) and start it.  The buffer watermark register is written with 0.
+ */
+static void tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       /* Wait for TMCSReady bit to be set */
+       tmc_wait_for_tmcready(drvdata);
+
+       writel_relaxed(TMC_MODE_HARDWARE_FIFO, drvdata->base + TMC_MODE);
+       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI,
+                      drvdata->base + TMC_FFCR);
+       writel_relaxed(0x0, drvdata->base + TMC_BUFWM);
+       tmc_enable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Flush, stop and disable a TMC used as a link.  Unlike the sink variant,
+ * nothing is dumped before the TMC is disabled.
+ */
+static void tmc_etf_disable_hw(struct tmc_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       tmc_flush_and_stop(drvdata);
+       tmc_disable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Enable the ETB/ETF as a sink on behalf of the sysFS interface.
+ *
+ * A trace buffer of drvdata->size bytes is allocated outside the spinlock
+ * when drvdata->buf is NULL; it is released again before returning if it
+ * turned out not to be needed.
+ *
+ * Returns 0 on success (including when the sink was already enabled from
+ * sysFS), -EINVAL when @mode is not CS_MODE_SYSFS, -ENOMEM when the buffer
+ * cannot be allocated and -EBUSY while the buffer is being read.
+ *
+ * NOTE(review): drvdata->mode is exchanged unconditionally and only a
+ * previous value of CS_MODE_SYSFS is special-cased; a previous value of
+ * CS_MODE_PERF is not rejected here - confirm callers exclude that case.
+ */
+static int tmc_enable_etf_sink_sysfs(struct coresight_device *csdev, u32 mode)
+{
+       int ret = 0;
+       bool used = false;
+       char *buf = NULL;
+       long val;
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+        /* This shouldn't be happening */
+       if (WARN_ON(mode != CS_MODE_SYSFS))
+               return -EINVAL;
+
+       /*
+        * If we don't have a buffer release the lock and allocate memory.
+        * Otherwise keep the lock and move along.
+        */
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (!drvdata->buf) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+               /* Allocating the memory here while outside of the spinlock */
+               buf = kzalloc(drvdata->size, GFP_KERNEL);
+               if (!buf)
+                       return -ENOMEM;
+
+               /* Let's try again */
+               spin_lock_irqsave(&drvdata->spinlock, flags);
+       }
+
+       /* The spinlock is held from here down to the 'out' label */
+       if (drvdata->reading) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       val = local_xchg(&drvdata->mode, mode);
+       /*
+        * In sysFS mode we can have multiple writers per sink.  Since this
+        * sink is already enabled no memory is needed and the HW need not be
+        * touched.
+        */
+       if (val == CS_MODE_SYSFS)
+               goto out;
+
+       /*
+        * If drvdata::buf isn't NULL, memory was allocated for a previous
+        * trace run but wasn't read.  If so simply zero-out the memory.
+        * Otherwise use the memory allocated above.
+        *
+        * The memory is freed when users read the buffer using the
+        * /dev/xyz.{etf|etb} interface.  See tmc_read_unprepare_etf() for
+        * details.
+        */
+       if (drvdata->buf) {
+               memset(drvdata->buf, 0, drvdata->size);
+       } else {
+               used = true;
+               drvdata->buf = buf;
+       }
+
+       tmc_etb_enable_hw(drvdata);
+out:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       /* Free memory outside the spinlock if need be */
+       if (!used && buf)
+               kfree(buf);
+
+       if (!ret)
+               dev_info(drvdata->dev, "TMC-ETB/ETF enabled\n");
+
+       return ret;
+}
+
+/*
+ * Enable the ETB/ETF as a sink on behalf of perf.
+ *
+ * Returns 0 when the sink was disabled and has now been claimed and
+ * started, -EINVAL when @mode is not CS_MODE_PERF, when the buffer is being
+ * read through the character device, or when the sink is already in use.
+ * On failure drvdata->mode is left untouched.
+ */
+static int tmc_enable_etf_sink_perf(struct coresight_device *csdev, u32 mode)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+        /* This shouldn't be happening */
+       if (WARN_ON(mode != CS_MODE_PERF))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * In Perf mode there can be only one writer per sink.  There
+        * is also no need to continue if the ETB/ETR is already operated
+        * from sysFS.
+        *
+        * The mode is inspected before it is changed: overwriting it first
+        * would relabel a sink that is in use from sysFS (or by another perf
+        * session) as CS_MODE_PERF even though this request is refused.  The
+        * spinlock is held, so check-then-set cannot race with the other
+        * mode writers.
+        */
+       if (local_read(&drvdata->mode) != CS_MODE_DISABLED) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       local_set(&drvdata->mode, mode);
+       tmc_etb_enable_hw(drvdata);
+out:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       return ret;
+}
+
+/*
+ * Sink enable entry point: hand the request to the sysFS or perf specific
+ * helper depending on @mode.  Any other mode yields -EINVAL.
+ */
+static int tmc_enable_etf_sink(struct coresight_device *csdev, u32 mode)
+{
+       if (mode == CS_MODE_SYSFS)
+               return tmc_enable_etf_sink_sysfs(csdev, mode);
+
+       if (mode == CS_MODE_PERF)
+               return tmc_enable_etf_sink_perf(csdev, mode);
+
+       /* We shouldn't be here */
+       return -EINVAL;
+}
+
+/*
+ * Disable the ETB/ETF sink.
+ *
+ * Nothing is done while the buffer is being read through the character
+ * device.  Otherwise the hardware is stopped if the sink was in use and the
+ * mode is set to CS_MODE_DISABLED.
+ */
+static void tmc_disable_etf_sink(struct coresight_device *csdev)
+{
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+               return;
+       }
+
+       /*
+        * Disable the TMC only if it needs to.
+        *
+        * tmc_etb_disable_hw() looks at drvdata->mode to decide whether the
+        * trace RAM has to be dumped (sysFS mode), so the mode must only be
+        * reset once the hardware has been dealt with.  Resetting it first
+        * would silently skip the dump.
+        */
+       if (local_read(&drvdata->mode) != CS_MODE_DISABLED) {
+               tmc_etb_disable_hw(drvdata);
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+       }
+
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       dev_info(drvdata->dev, "TMC-ETB/ETF disabled\n");
+}
+
+/*
+ * Enable the ETF as a link (hardware FIFO).  @inport and @outport are not
+ * used.  Returns -EBUSY while the buffer is being read, 0 otherwise.
+ */
+static int tmc_enable_etf_link(struct coresight_device *csdev,
+                              int inport, int outport)
+{
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+               return -EBUSY;
+       }
+
+       tmc_etf_enable_hw(drvdata);
+       /* A link is recorded as being in sysFS mode */
+       local_set(&drvdata->mode, CS_MODE_SYSFS);
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       dev_info(drvdata->dev, "TMC-ETF enabled\n");
+       return 0;
+}
+
+/*
+ * Disable an ETF used as a link and mark it CS_MODE_DISABLED.  @inport and
+ * @outport are not used.  Nothing is done while the buffer is being read.
+ */
+static void tmc_disable_etf_link(struct coresight_device *csdev,
+                                int inport, int outport)
+{
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+               return;
+       }
+
+       tmc_etf_disable_hw(drvdata);
+       local_set(&drvdata->mode, CS_MODE_DISABLED);
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       dev_info(drvdata->dev, "TMC disabled\n");
+}
+
+/*
+ * Allocate the bookkeeping structure used to copy trace data into the perf
+ * AUX pages.
+ *
+ * @cpu:       CPU the session is bound to, or -1 for none
+ * @pages:     array of AUX buffer pages, stored as-is
+ * @nr_pages:  number of entries in @pages
+ * @overwrite: true to operate in snapshot mode
+ *
+ * Returns the new cs_buffers instance, or NULL if the allocation failed.
+ * It is released with tmc_free_etf_buffer().
+ */
+static void *tmc_alloc_etf_buffer(struct coresight_device *csdev, int cpu,
+                                 void **pages, int nr_pages, bool overwrite)
+{
+       int node;
+       struct cs_buffers *buf;
+
+       /*
+        * Without a CPU there is no node to prefer.  Do not fall back to
+        * smp_processor_id(): this function allocates with GFP_KERNEL and
+        * may therefore run preemptible, where the current CPU is not stable
+        * (and smp_processor_id() complains under CONFIG_DEBUG_PREEMPT).
+        * Let the allocator pick a node instead.
+        */
+       node = (cpu == -1) ? NUMA_NO_NODE : cpu_to_node(cpu);
+
+       /* Allocate memory structure for interaction with Perf */
+       buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->snapshot = overwrite;
+       buf->nr_pages = nr_pages;
+       buf->data_pages = pages;
+
+       return buf;
+}
+
+/* Release a cs_buffers instance obtained from tmc_alloc_etf_buffer(). */
+static void tmc_free_etf_buffer(void *config)
+{
+       kfree(config);
+}
+
+/*
+ * Position the cs_buffers cursor for a new trace run: derive the page
+ * index and the in-page offset from the perf handle's head and reset the
+ * amount of data collected.  Always returns 0.
+ */
+static int tmc_set_etf_buffer(struct coresight_device *csdev,
+                             struct perf_output_handle *handle,
+                             void *sink_config)
+{
+       struct cs_buffers *cfg = sink_config;
+       unsigned long span = cfg->nr_pages << PAGE_SHIFT;
+       /* wrap head around to the amount of space we have */
+       unsigned long pos = handle->head & (span - 1);
+
+       /* page to write to, and offset within that page */
+       cfg->cur = pos >> PAGE_SHIFT;
+       cfg->offset = pos & (PAGE_SIZE - 1);
+
+       local_set(&cfg->data_size, 0);
+
+       return 0;
+}
+
+/*
+ * Report the outcome of a trace run and reset the cs_buffers counters.
+ *
+ * Returns the amount of data recorded in ->data_size (0 when @sink_config
+ * is NULL) and stores in *@lost whether ->lost was non-zero.  Both counters
+ * are cleared.  *@lost is only written when @sink_config is not NULL.
+ */
+static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
+                                         struct perf_output_handle *handle,
+                                         void *sink_config, bool *lost)
+{
+       long size = 0;
+       struct cs_buffers *buf = sink_config;
+
+       if (buf) {
+               /*
+                * In snapshot mode ->data_size holds the new address of the
+                * ring buffer's head.  The size itself is the whole address
+                * range since we want the latest information.
+                */
+               if (buf->snapshot)
+                       handle->head = local_xchg(&buf->data_size,
+                                                 buf->nr_pages << PAGE_SHIFT);
+               /*
+                * Tell the tracer PMU how much we got in this run and if
+                * something went wrong along the way.  Nobody else can use
+                * this cs_buffers instance until we are done.  As such
+                * resetting parameters here and squaring off with the ring
+                * buffer API in the tracer PMU is fine.
+                */
+               *lost = !!local_xchg(&buf->lost, 0);
+               size = local_xchg(&buf->data_size, 0);
+       }
+
+       return size;
+}
+
+/*
+ * Copy the trace data held in the TMC RAM into the perf AUX pages
+ * described by @sink_config.
+ *
+ * The TMC is flushed and stopped, the amount of data to read is worked out
+ * from the read/write pointers and the status register, and the data is
+ * then read word by word through TMC_RRD.  When more data is available than
+ * the perf handle can take, the read pointer is moved forward so that the
+ * most recent data is kept, and the loss is recorded in ->lost.
+ *
+ * Nothing is done when @sink_config is NULL or when the sink is not being
+ * operated from perf.
+ */
+static void tmc_update_etf_buffer(struct coresight_device *csdev,
+                                 struct perf_output_handle *handle,
+                                 void *sink_config)
+{
+       int i, cur;
+       u32 *buf_ptr;
+       u32 read_ptr, write_ptr;
+       u32 status, to_read;
+       unsigned long offset;
+       struct cs_buffers *buf = sink_config;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       if (!buf)
+               return;
+
+       /* This shouldn't happen */
+       if (WARN_ON_ONCE(local_read(&drvdata->mode) != CS_MODE_PERF))
+               return;
+
+       CS_UNLOCK(drvdata->base);
+
+       tmc_flush_and_stop(drvdata);
+
+       read_ptr = readl_relaxed(drvdata->base + TMC_RRP);
+       write_ptr = readl_relaxed(drvdata->base + TMC_RWP);
+
+       /*
+        * Get a hold of the status register and see if a wrap around
+        * has occurred.  If so adjust things accordingly.
+        */
+       status = readl_relaxed(drvdata->base + TMC_STS);
+       if (status & TMC_STS_FULL) {
+               local_inc(&buf->lost);
+               to_read = drvdata->size;
+       } else {
+               to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
+       }
+
+       /*
+        * The TMC RAM buffer may be bigger than the space available in the
+        * perf ring buffer (handle->size).  If so advance the RRP so that we
+        * get the latest trace data.
+        */
+       if (to_read > handle->size) {
+               u32 mask = 0;
+
+               /*
+                * The value written to RRP must be byte-address aligned to
+                * the width of the trace memory databus _and_ to a frame
+                * boundary (16 byte), whichever is the biggest. For example,
+                * for 32-bit, 64-bit and 128-bit wide trace memory, the four
+                * LSBs must be 0s. For 256-bit wide trace memory, the five
+                * LSBs must be 0s.
+                *
+                * GENMASK(31, 4) clears exactly four LSBs and GENMASK(31, 5)
+                * exactly five; anything wider throws away trace data that
+                * would have fitted.
+                */
+               switch (drvdata->memwidth) {
+               case TMC_MEM_INTF_WIDTH_32BITS:
+               case TMC_MEM_INTF_WIDTH_64BITS:
+               case TMC_MEM_INTF_WIDTH_128BITS:
+                       mask = GENMASK(31, 4);
+                       break;
+               case TMC_MEM_INTF_WIDTH_256BITS:
+                       mask = GENMASK(31, 5);
+                       break;
+               }
+
+               /*
+                * Make sure the new size is aligned in accordance with the
+                * requirement explained above.
+                */
+               to_read = handle->size & mask;
+               /* Move the RAM read pointer up */
+               read_ptr = (write_ptr + drvdata->size) - to_read;
+               /* Make sure we are still within our limits */
+               if (read_ptr > (drvdata->size - 1))
+                       read_ptr -= drvdata->size;
+               /* Tell the HW */
+               writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
+               local_inc(&buf->lost);
+       }
+
+       cur = buf->cur;
+       offset = buf->offset;
+
+       /* to_read is in bytes; each TMC_RRD access delivers one 32-bit word */
+       for (i = 0; i < to_read; i += 4) {
+               buf_ptr = buf->data_pages[cur] + offset;
+               *buf_ptr = readl_relaxed(drvdata->base + TMC_RRD);
+
+               offset += 4;
+               if (offset >= PAGE_SIZE) {
+                       offset = 0;
+                       cur++;
+                       /* wrap around at the end of the buffer */
+                       cur &= buf->nr_pages - 1;
+               }
+       }
+
+       /*
+        * In snapshot mode all we have to do is communicate to
+        * perf_aux_output_end() the address of the current head.  In full
+        * trace mode the same function expects a size to move rb->aux_head
+        * forward.
+        */
+       if (buf->snapshot)
+               local_set(&buf->data_size, (cur * PAGE_SIZE) + offset);
+       else
+               local_add(to_read, &buf->data_size);
+
+       CS_LOCK(drvdata->base);
+}
+
+/* Sink operations shared by the ETB and ETF configurations. */
+static const struct coresight_ops_sink tmc_etf_sink_ops = {
+       .enable         = tmc_enable_etf_sink,
+       .disable        = tmc_disable_etf_sink,
+       .alloc_buffer   = tmc_alloc_etf_buffer,
+       .free_buffer    = tmc_free_etf_buffer,
+       .set_buffer     = tmc_set_etf_buffer,
+       .reset_buffer   = tmc_reset_etf_buffer,
+       .update_buffer  = tmc_update_etf_buffer,
+};
+
+/* Link operations, used when an ETF sits in the middle of a trace path. */
+static const struct coresight_ops_link tmc_etf_link_ops = {
+       .enable         = tmc_enable_etf_link,
+       .disable        = tmc_disable_etf_link,
+};
+
+/* Operations for a TMC configured as an ETB: sink only. */
+const struct coresight_ops tmc_etb_cs_ops = {
+       .sink_ops       = &tmc_etf_sink_ops,
+};
+
+/* Operations for a TMC configured as an ETF: usable as a sink or a link. */
+const struct coresight_ops tmc_etf_cs_ops = {
+       .sink_ops       = &tmc_etf_sink_ops,
+       .link_ops       = &tmc_etf_link_ops,
+};
+
+/*
+ * Get an ETB/ETF ready for its trace buffer to be read.
+ *
+ * On success the TMC is stopped if it was running in sysFS mode and
+ * drvdata->reading is set, which makes the enable/disable paths in this
+ * file back off until the read is over.
+ *
+ * Returns 0 on success, -EBUSY if a read is already in progress and
+ * -EINVAL if the TMC is of the wrong configuration type, is not in
+ * circular buffer mode, is operated from perf, or has no trace buffer.
+ */
+int tmc_read_prepare_etb(struct tmc_drvdata *drvdata)
+{
+       long val;
+       enum tmc_mode mode;
+       int ret = 0;
+       unsigned long flags;
+
+       /* config types are set at boot time and never change */
+       if (WARN_ON_ONCE(drvdata->config_type != TMC_CONFIG_TYPE_ETB &&
+                        drvdata->config_type != TMC_CONFIG_TYPE_ETF))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+
+       if (drvdata->reading) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       /* There is no point in reading a TMC in HW FIFO mode */
+       mode = readl_relaxed(drvdata->base + TMC_MODE);
+       if (mode != TMC_MODE_CIRCULAR_BUFFER) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       val = local_read(&drvdata->mode);
+       /* Don't interfere if operated from Perf */
+       if (val == CS_MODE_PERF) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* If drvdata::buf is NULL the trace data has been read already */
+       if (drvdata->buf == NULL) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* Disable the TMC if need be */
+       if (val == CS_MODE_SYSFS)
+               tmc_etb_disable_hw(drvdata);
+
+       drvdata->reading = true;
+out:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       return ret;
+}
+
+/*
+ * Undo tmc_read_prepare_etb() once the trace buffer has been read.
+ *
+ * If the sink is still enabled from sysFS the buffer is cleared and the TMC
+ * restarted; otherwise the buffer is detached from drvdata and freed.
+ * drvdata->reading is cleared in both cases.
+ *
+ * Returns 0 on success and -EINVAL if the TMC is of the wrong configuration
+ * type or is not in circular buffer mode (in which case drvdata->reading is
+ * left as it was).
+ */
+int tmc_read_unprepare_etb(struct tmc_drvdata *drvdata)
+{
+       char *buf = NULL;
+       enum tmc_mode mode;
+       unsigned long flags;
+
+       /* config types are set at boot time and never change */
+       if (WARN_ON_ONCE(drvdata->config_type != TMC_CONFIG_TYPE_ETB &&
+                        drvdata->config_type != TMC_CONFIG_TYPE_ETF))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+
+       /* There is no point in reading a TMC in HW FIFO mode */
+       mode = readl_relaxed(drvdata->base + TMC_MODE);
+       if (mode != TMC_MODE_CIRCULAR_BUFFER) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+               return -EINVAL;
+       }
+
+       /* Re-enable the TMC if need be */
+       if (local_read(&drvdata->mode) == CS_MODE_SYSFS) {
+               /*
+                * The trace run will continue with the same allocated trace
+                * buffer. As such zero-out the buffer so that we don't end
+                * up with stale data.
+                *
+                * Since the tracer is still enabled drvdata::buf
+                * can't be NULL.
+                */
+               memset(drvdata->buf, 0, drvdata->size);
+               tmc_etb_enable_hw(drvdata);
+       } else {
+               /*
+                * The ETB/ETF is not tracing and the buffer was just read.
+                * As such prepare to free the trace buffer.
+                */
+               buf = drvdata->buf;
+               drvdata->buf = NULL;
+       }
+
+       drvdata->reading = false;
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       /*
+        * Free allocated memory outside of the spinlock.  There is no need
+        * to assert the validity of 'buf' since calling kfree(NULL) is safe.
+        */
+       kfree(buf);
+
+       return 0;
+}
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c

new file mode 100644 (file)

index 0000000..ba54e19
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -0,0 +1,569 @@
+/*
+ * Copyright(C) 2016 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/circ_buf.h>
+#include <linux/coresight.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
+#include "coresight-priv.h"
+#include "coresight-tmc.h"
+
+/**
+ * struct cs_etr_buffers - keep track of a recording session's specifics
+ * @tmc:       generic portion of the TMC buffers
+ * @paddr:     the physical address of a DMA'able contiguous memory area
+ * @vaddr:     the virtual address associated to @paddr
+ * @size:      how much memory we have, starting at @paddr
+ * @dev:       the device @vaddr has been tied to
+ */
+struct cs_etr_buffers {
+       struct cs_buffers       tmc;
+       dma_addr_t              paddr;
+       void __iomem            *vaddr;
+       u32                     size;
+       struct device           *dev;
+};
+
+/*
+ * Program the ETR to trace into the memory at drvdata->vaddr/paddr in
+ * circular-buffer mode and start it.
+ *
+ * The buffer is zeroed first, so drvdata->vaddr must be valid on entry.
+ * No locking is taken here; the callers visible in this file invoke it with
+ * drvdata->spinlock held.
+ *
+ * NOTE(review): only TMC_DBALO receives drvdata->paddr and TMC_DBAHI is
+ * written with 0, so the buffer is presumably expected to sit below 4GB -
+ * confirm.
+ */
+void tmc_etr_enable_hw(struct tmc_drvdata *drvdata)
+{
+       u32 axictl;
+
+       /* Zero out the memory to help with debug */
+       memset(drvdata->vaddr, 0, drvdata->size);
+
+       CS_UNLOCK(drvdata->base);
+
+       /* Wait for TMCSReady bit to be set */
+       tmc_wait_for_tmcready(drvdata);
+
+       /* The RAM size register is written with the size divided by 4 */
+       writel_relaxed(drvdata->size / 4, drvdata->base + TMC_RSZ);
+       writel_relaxed(TMC_MODE_CIRCULAR_BUFFER, drvdata->base + TMC_MODE);
+
+       /* AXI control is updated in three steps, written back after each */
+       axictl = readl_relaxed(drvdata->base + TMC_AXICTL);
+       axictl |= TMC_AXICTL_WR_BURST_16;
+       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
+       axictl &= ~TMC_AXICTL_SCT_GAT_MODE;
+       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
+       axictl = (axictl &
+                 ~(TMC_AXICTL_PROT_CTL_B0 | TMC_AXICTL_PROT_CTL_B1)) |
+                 TMC_AXICTL_PROT_CTL_B1;
+       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
+
+       writel_relaxed(drvdata->paddr, drvdata->base + TMC_DBALO);
+       writel_relaxed(0x0, drvdata->base + TMC_DBAHI);
+       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI |
+                      TMC_FFCR_FON_FLIN | TMC_FFCR_FON_TRIG_EVT |
+                      TMC_FFCR_TRIGON_TRIGIN,
+                      drvdata->base + TMC_FFCR);
+       writel_relaxed(drvdata->trigger_cntr, drvdata->base + TMC_TRG);
+       tmc_enable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Point drvdata->buf at the place in the DMA area where reading should
+ * start.  No data is copied.
+ *
+ * When bit 0 of TMC_STS is set (presumably the "full" flag - confirm
+ * against TMC_STS_FULL) reading starts at the write pointer, expressed as an
+ * offset from drvdata->paddr; otherwise it starts at the beginning of the
+ * buffer.
+ */
+static void tmc_etr_dump_hw(struct tmc_drvdata *drvdata)
+{
+       u32 rwp, val;
+
+       rwp = readl_relaxed(drvdata->base + TMC_RWP);
+       val = readl_relaxed(drvdata->base + TMC_STS);
+
+       /* How much memory do we still have */
+       if (val & BIT(0))
+               drvdata->buf = drvdata->vaddr + rwp - drvdata->paddr;
+       else
+               drvdata->buf = drvdata->vaddr;
+}
+
+/*
+ * Flush and stop the ETR, update drvdata->buf when operated from sysFS,
+ * then disable the TMC.
+ *
+ * The sysFS handling is keyed on the current value of drvdata->mode, so a
+ * caller that wants drvdata->buf updated must call this while the mode
+ * still reads CS_MODE_SYSFS.
+ */
+static void tmc_etr_disable_hw(struct tmc_drvdata *drvdata)
+{
+       CS_UNLOCK(drvdata->base);
+
+       tmc_flush_and_stop(drvdata);
+       /*
+        * When operating in sysFS mode the content of the buffer needs to be
+        * read before the TMC is disabled.
+        */
+       if (local_read(&drvdata->mode) == CS_MODE_SYSFS)
+               tmc_etr_dump_hw(drvdata);
+       tmc_disable_hw(drvdata);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Enable the ETR as a sink on behalf of the sysFS interface.
+ *
+ * A DMA buffer of drvdata->size bytes is allocated outside the spinlock
+ * when drvdata->vaddr is NULL; it is released again before returning if it
+ * turned out not to be needed.
+ *
+ * Returns 0 on success (including when the sink was already enabled from
+ * sysFS), -EINVAL when @mode is not CS_MODE_SYSFS, -ENOMEM when the buffer
+ * cannot be allocated and -EBUSY while the buffer is being read.
+ */
+static int tmc_enable_etr_sink_sysfs(struct coresight_device *csdev, u32 mode)
+{
+       int ret = 0;
+       bool used = false;
+       long val;
+       unsigned long flags;
+       void __iomem *vaddr = NULL;
+       dma_addr_t paddr;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+        /* This shouldn't be happening */
+       if (WARN_ON(mode != CS_MODE_SYSFS))
+               return -EINVAL;
+
+       /*
+        * If we don't have a buffer release the lock and allocate memory.
+        * Otherwise keep the lock and move along.
+        */
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (!drvdata->vaddr) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+               /*
+                * Contiguous  memory can't be allocated while a spinlock is
+                * held.  As such allocate memory here and free it if a buffer
+                * has already been allocated (from a previous session).
+                */
+               vaddr = dma_alloc_coherent(drvdata->dev, drvdata->size,
+                                          &paddr, GFP_KERNEL);
+               if (!vaddr)
+                       return -ENOMEM;
+
+               /* Let's try again */
+               spin_lock_irqsave(&drvdata->spinlock, flags);
+       }
+
+       /* The spinlock is held from here down to the 'out' label */
+       if (drvdata->reading) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       val = local_xchg(&drvdata->mode, mode);
+       /*
+        * In sysFS mode we can have multiple writers per sink.  Since this
+        * sink is already enabled no memory is needed and the HW need not be
+        * touched.
+        */
+       if (val == CS_MODE_SYSFS)
+               goto out;
+
+       /*
+        * If drvdata::buf == NULL, use the memory allocated above.
+        * Otherwise a buffer still exists from a previous session, so
+        * simply use that.
+        */
+       if (drvdata->buf == NULL) {
+               used = true;
+               drvdata->vaddr = vaddr;
+               drvdata->paddr = paddr;
+               drvdata->buf = drvdata->vaddr;
+       }
+
+       /*
+        * No memset() here: tmc_etr_enable_hw() zeroes the whole buffer
+        * before touching the hardware, and clearing it twice only lengthens
+        * the time spent with the spinlock held and interrupts disabled.
+        */
+       tmc_etr_enable_hw(drvdata);
+out:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       /* Free memory outside the spinlock if need be */
+       if (!used && vaddr)
+               dma_free_coherent(drvdata->dev, drvdata->size, vaddr, paddr);
+
+       if (!ret)
+               dev_info(drvdata->dev, "TMC-ETR enabled\n");
+
+       return ret;
+}
+
+/*
+ * Enable the ETR as a sink on behalf of perf.
+ *
+ * Returns 0 when the sink was disabled and has now been claimed and
+ * started, -EINVAL when @mode is not CS_MODE_PERF, when the buffer is being
+ * read through the character device, or when the sink is already in use.
+ * On failure drvdata->mode is left untouched.
+ */
+static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, u32 mode)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+        /* This shouldn't be happening */
+       if (WARN_ON(mode != CS_MODE_PERF))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * In Perf mode there can be only one writer per sink.  There
+        * is also no need to continue if the ETR is already operated
+        * from sysFS.
+        *
+        * The mode is inspected before it is changed: overwriting it first
+        * would relabel a sink that is in use from sysFS (or by another perf
+        * session) as CS_MODE_PERF even though this request is refused.  The
+        * spinlock is held, so check-then-set cannot race with the other
+        * mode writers.
+        */
+       if (local_read(&drvdata->mode) != CS_MODE_DISABLED) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       local_set(&drvdata->mode, mode);
+       tmc_etr_enable_hw(drvdata);
+out:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       return ret;
+}
+
+/*
+ * Sink enable entry point: hand the request to the sysFS or perf specific
+ * helper depending on @mode.  Any other mode yields -EINVAL.
+ */
+static int tmc_enable_etr_sink(struct coresight_device *csdev, u32 mode)
+{
+       if (mode == CS_MODE_SYSFS)
+               return tmc_enable_etr_sink_sysfs(csdev, mode);
+
+       if (mode == CS_MODE_PERF)
+               return tmc_enable_etr_sink_perf(csdev, mode);
+
+       /* We shouldn't be here */
+       return -EINVAL;
+}
+
+/*
+ * Disable the ETR sink.
+ *
+ * Nothing is done while the buffer is being read through the character
+ * device.  Otherwise the hardware is stopped if the sink was in use and the
+ * mode is set to CS_MODE_DISABLED.
+ */
+static void tmc_disable_etr_sink(struct coresight_device *csdev)
+{
+       unsigned long flags;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+       if (drvdata->reading) {
+               spin_unlock_irqrestore(&drvdata->spinlock, flags);
+               return;
+       }
+
+       /*
+        * Disable the TMC only if it needs to.
+        *
+        * tmc_etr_disable_hw() looks at drvdata->mode to decide whether
+        * drvdata->buf has to be updated (sysFS mode), so the mode must only
+        * be reset once the hardware has been dealt with.  Resetting it
+        * first would silently skip that step.
+        */
+       if (local_read(&drvdata->mode) != CS_MODE_DISABLED) {
+               tmc_etr_disable_hw(drvdata);
+               local_set(&drvdata->mode, CS_MODE_DISABLED);
+       }
+
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       dev_info(drvdata->dev, "TMC-ETR disabled\n");
+}
+
+/*
+ * Allocate what a perf session needs to use the ETR: the bookkeeping
+ * structure and a DMA buffer of drvdata->size bytes for the hardware to
+ * trace into.
+ *
+ * @cpu:       CPU the session is bound to, or -1 for none
+ * @pages:     array of AUX buffer pages, stored as-is
+ * @nr_pages:  number of entries in @pages
+ * @overwrite: true to operate in snapshot mode
+ *
+ * Returns the new cs_etr_buffers instance, or NULL if either allocation
+ * failed.  It is released with tmc_free_etr_buffer().
+ */
+static void *tmc_alloc_etr_buffer(struct coresight_device *csdev, int cpu,
+                                 void **pages, int nr_pages, bool overwrite)
+{
+       int node;
+       struct cs_etr_buffers *buf;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       /*
+        * Without a CPU there is no node to prefer.  Do not fall back to
+        * smp_processor_id(): this function allocates with GFP_KERNEL and
+        * may therefore run preemptible, where the current CPU is not stable
+        * (and smp_processor_id() complains under CONFIG_DEBUG_PREEMPT).
+        * Let the allocator pick a node instead.
+        */
+       node = (cpu == -1) ? NUMA_NO_NODE : cpu_to_node(cpu);
+
+       /* Allocate memory structure for interaction with Perf */
+       buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, node);
+       if (!buf)
+               return NULL;
+
+       buf->dev = drvdata->dev;
+       buf->size = drvdata->size;
+       buf->vaddr = dma_alloc_coherent(buf->dev, buf->size,
+                                       &buf->paddr, GFP_KERNEL);
+       if (!buf->vaddr) {
+               kfree(buf);
+               return NULL;
+       }
+
+       buf->tmc.snapshot = overwrite;
+       buf->tmc.nr_pages = nr_pages;
+       buf->tmc.data_pages = pages;
+
+       return buf;
+}
+
+/*
+ * Release a cs_etr_buffers instance: first the DMA buffer, using the
+ * device, size and addresses recorded in the structure, then the structure
+ * itself.  @config must not be NULL, it is dereferenced unconditionally.
+ */
+static void tmc_free_etr_buffer(void *config)
+{
+       struct cs_etr_buffers *buf = config;
+
+       dma_free_coherent(buf->dev, buf->size, buf->vaddr, buf->paddr);
+       kfree(buf);
+}
+
+/*
+ * Get ready for a new perf trace run: position the cursor in the AUX pages
+ * from the perf handle's head, reset the amount of data collected, and make
+ * the session's DMA buffer the one the ETR will trace into.  Always
+ * returns 0.
+ */
+static int tmc_set_etr_buffer(struct coresight_device *csdev,
+                             struct perf_output_handle *handle,
+                             void *sink_config)
+{
+       int ret = 0;
+       unsigned long head;
+       struct cs_etr_buffers *buf = sink_config;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       /* wrap head around to the amount of space we have */
+       head = handle->head & ((buf->tmc.nr_pages << PAGE_SHIFT) - 1);
+
+       /* find the page to write to */
+       buf->tmc.cur = head / PAGE_SIZE;
+
+       /* and offset within that page */
+       buf->tmc.offset = head % PAGE_SIZE;
+
+       local_set(&buf->tmc.data_size, 0);
+
+       /* Tell the HW where to put the trace data */
+       drvdata->vaddr = buf->vaddr;
+       drvdata->paddr = buf->paddr;
+       /* Cleared over drvdata->size bytes, the size the buffer was allocated with */
+       memset(drvdata->vaddr, 0, drvdata->size);
+
+       return ret;
+}
+
+/*
+ * Report the outcome of a trace run, reset the session counters and detach
+ * the session's DMA buffer from the ETR.
+ *
+ * Returns the amount of data recorded in ->data_size (0 when @sink_config
+ * is NULL) and stores in *@lost whether ->lost was non-zero.  *@lost is
+ * only written when @sink_config is not NULL.  drvdata->vaddr and
+ * drvdata->paddr are cleared in every case.
+ */
+static unsigned long tmc_reset_etr_buffer(struct coresight_device *csdev,
+                                         struct perf_output_handle *handle,
+                                         void *sink_config, bool *lost)
+{
+       long size = 0;
+       struct cs_etr_buffers *buf = sink_config;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       if (buf) {
+               /*
+                * In snapshot mode ->data_size holds the new address of the
+                * ring buffer's head.  The size itself is the whole address
+                * range since we want the latest information.
+                */
+               if (buf->tmc.snapshot) {
+                       size = buf->tmc.nr_pages << PAGE_SHIFT;
+                       handle->head = local_xchg(&buf->tmc.data_size, size);
+               }
+
+               /*
+                * Tell the tracer PMU how much we got in this run and if
+                * something went wrong along the way.  Nobody else can use
+                * this cs_etr_buffers instance until we are done.  As such
+                * resetting parameters here and squaring off with the ring
+                * buffer API in the tracer PMU is fine.
+                */
+               *lost = !!local_xchg(&buf->tmc.lost, 0);
+               size = local_xchg(&buf->tmc.data_size, 0);
+       }
+
+       /* Get ready for another run */
+       drvdata->vaddr = NULL;
+       drvdata->paddr = 0;
+
+       return size;
+}
+
+/*
+ * tmc_update_etr_buffer() - copy captured trace into the perf ring buffer
+ * @csdev:       ETR sink device; its parent's drvdata is the tmc_drvdata.
+ * @handle:      perf output handle; handle->size caps the amount copied.
+ * @sink_config: struct cs_buffers describing the destination pages, or NULL.
+ *
+ * Flushes and stops the TMC, reads the trace RAM out through TMC_RRD one
+ * 32-bit word at a time into buf->data_pages[], then records the outcome in
+ * buf->data_size.  buf->lost is bumped when the hardware reports a full
+ * buffer or when only part of the trace fits in the ring buffer.
+ *
+ * Does nothing when @sink_config is NULL or the TMC is not in CS_MODE_PERF.
+ */
+static void tmc_update_etr_buffer(struct coresight_device *csdev,
+                                 struct perf_output_handle *handle,
+                                 void *sink_config)
+{
+       int i, cur;
+       u32 *buf_ptr;
+       u32 read_ptr, write_ptr;
+       u32 status, to_read;
+       unsigned long offset;
+       struct cs_buffers *buf = sink_config;
+       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+
+       if (!buf)
+               return;
+
+       /* This shouldn't happen */
+       if (WARN_ON_ONCE(local_read(&drvdata->mode) != CS_MODE_PERF))
+               return;
+
+       CS_UNLOCK(drvdata->base);
+
+       tmc_flush_and_stop(drvdata);
+
+       read_ptr = readl_relaxed(drvdata->base + TMC_RRP);
+       write_ptr = readl_relaxed(drvdata->base + TMC_RWP);
+
+       /*
+        * Get a hold of the status register and see if a wrap around
+        * has occurred.  If so the whole RAM holds trace data and older
+        * data has been overwritten, hence the 'lost' accounting.
+        */
+       status = readl_relaxed(drvdata->base + TMC_STS);
+       if (status & TMC_STS_FULL) {
+               local_inc(&buf->lost);
+               to_read = drvdata->size;
+       } else {
+               to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
+       }
+
+       /*
+        * The TMC RAM buffer may be bigger than the space available in the
+        * perf ring buffer (handle->size).  If so advance the RRP so that we
+        * get the latest trace data.
+        */
+       if (to_read > handle->size) {
+               u32 buffer_start, mask = 0;
+
+               /* Read buffer start address in system memory */
+               buffer_start = readl_relaxed(drvdata->base + TMC_DBALO);
+
+               /*
+                * The value written to RRP must be byte-address aligned to
+                * the width of the trace memory databus _and_ to a frame
+                * boundary (16 byte), whichever is the biggest. For example,
+                * for 32-bit, 64-bit and 128-bit wide trace memory, the four
+                * LSBs must be 0s. For 256-bit wide trace memory, the five
+                * LSBs must be 0s.
+                *
+                * GENMASK(31, n) keeps bits 31..n, i.e. clears the n LSBs.
+                * An unrecognised width leaves the mask at 0.
+                */
+               switch (drvdata->memwidth) {
+               case TMC_MEM_INTF_WIDTH_32BITS:
+               case TMC_MEM_INTF_WIDTH_64BITS:
+               case TMC_MEM_INTF_WIDTH_128BITS:
+                       mask = GENMASK(31, 4);
+                       break;
+               case TMC_MEM_INTF_WIDTH_256BITS:
+                       mask = GENMASK(31, 5);
+                       break;
+               }
+
+               /*
+                * Make sure the new size is aligned in accordance with the
+                * requirement explained above.
+                */
+               to_read = handle->size & mask;
+               /* Move the RAM read pointer up */
+               read_ptr = (write_ptr + drvdata->size) - to_read;
+               /* Make sure we are still within our limits */
+               if (read_ptr > (buffer_start + (drvdata->size - 1)))
+                       read_ptr -= drvdata->size;
+               /* Tell the HW */
+               writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
+               /* Only the tail of the trace is kept: older data is lost */
+               local_inc(&buf->lost);
+       }
+
+       cur = buf->cur;
+       offset = buf->offset;
+
+       /* Copy one 32-bit word per iteration, to_read being a byte count */
+       for (i = 0; i < to_read; i += 4) {
+               buf_ptr = buf->data_pages[cur] + offset;
+               *buf_ptr = readl_relaxed(drvdata->base + TMC_RRD);
+
+               offset += 4;
+               if (offset >= PAGE_SIZE) {
+                       offset = 0;
+                       cur++;
+                       /*
+                        * Wrap around at the end of the buffer; the mask
+                        * assumes nr_pages is a power of two - TODO confirm.
+                        */
+                       cur &= buf->nr_pages - 1;
+               }
+       }
+
+       /*
+        * In snapshot mode all we have to do is communicate to
+        * perf_aux_output_end() the address of the current head.  In full
+        * trace mode the same function expects a size to move rb->aux_head
+        * forward.
+        */
+       if (buf->snapshot)
+               local_set(&buf->data_size, (cur * PAGE_SIZE) + offset);
+       else
+               local_add(to_read, &buf->data_size);
+
+       CS_LOCK(drvdata->base);
+}
+
+/*
+ * Sink callbacks for a TMC configured as an ETR: enable/disable of the
+ * sink plus the buffer management hooks (alloc, free, set, reset, update).
+ */
+static const struct coresight_ops_sink tmc_etr_sink_ops = {
+       .enable         = tmc_enable_etr_sink,
+       .disable        = tmc_disable_etr_sink,
+       .alloc_buffer   = tmc_alloc_etr_buffer,
+       .free_buffer    = tmc_free_etr_buffer,
+       .set_buffer     = tmc_set_etr_buffer,
+       .reset_buffer   = tmc_reset_etr_buffer,
+       .update_buffer  = tmc_update_etr_buffer,
+};
+
+/*
+ * CoreSight operations for the ETR configuration.  An ETR only acts as a
+ * sink, so only sink_ops is populated.  Not static: tmc_probe() in
+ * coresight-tmc.c installs it as desc->ops.
+ */
+const struct coresight_ops tmc_etr_cs_ops = {
+       .sink_ops       = &tmc_etr_sink_ops,
+};
+
+/*
+ * tmc_read_prepare_etr() - get the ETR ready for its trace data to be read
+ * @drvdata: the TMC instance, which must be configured as an ETR.
+ *
+ * Returns 0 on success with drvdata->reading set, -EBUSY if a read is
+ * already in progress, and -EINVAL if the TMC is not an ETR, is being
+ * operated from perf, or has no trace data left to read.
+ */
+int tmc_read_prepare_etr(struct tmc_drvdata *drvdata)
+{
+       unsigned long flags;
+       long mode;
+       int ret = -EINVAL;
+
+       /* config types are set at boot time and never change */
+       if (WARN_ON_ONCE(drvdata->config_type != TMC_CONFIG_TYPE_ETR))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+
+       /* Only one reader at a time */
+       if (drvdata->reading) {
+               ret = -EBUSY;
+               goto unlock;
+       }
+
+       mode = local_read(&drvdata->mode);
+
+       /*
+        * Don't interfere if operated from Perf.  Likewise bail out if
+        * drvdata::buf is NULL, since that means the trace data has been
+        * read already.
+        */
+       if (mode == CS_MODE_PERF || drvdata->buf == NULL)
+               goto unlock;
+
+       /* A sysFS session is still running: stop the TMC before reading */
+       if (mode == CS_MODE_SYSFS)
+               tmc_etr_disable_hw(drvdata);
+
+       drvdata->reading = true;
+       ret = 0;
+unlock:
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       return ret;
+}
+
+/*
+ * tmc_read_unprepare_etr() - finish a read of the ETR trace data
+ * @drvdata: the TMC instance, which must be configured as an ETR.
+ *
+ * Re-enables the hardware if a sysFS trace session is still active,
+ * otherwise releases the DMA trace buffer that was just read.  Clears
+ * drvdata->reading either way.
+ *
+ * Returns 0, or -EINVAL if the TMC is not an ETR.
+ */
+int tmc_read_unprepare_etr(struct tmc_drvdata *drvdata)
+{
+       void __iomem *stale_vaddr = NULL;
+       dma_addr_t stale_paddr;
+       unsigned long flags;
+
+       /* config types are set at boot time and never change */
+       if (WARN_ON_ONCE(drvdata->config_type != TMC_CONFIG_TYPE_ETR))
+               return -EINVAL;
+
+       spin_lock_irqsave(&drvdata->spinlock, flags);
+
+       if (local_read(&drvdata->mode) != CS_MODE_SYSFS) {
+               /*
+                * The ETR is not tracing and the buffer was just read.
+                * Detach it from drvdata now and free it once the lock
+                * has been dropped.
+                */
+               stale_vaddr = drvdata->vaddr;
+               stale_paddr = drvdata->paddr;
+               drvdata->vaddr = NULL;
+               drvdata->buf = NULL;
+       } else {
+               /*
+                * The trace run will continue with the same allocated trace
+                * buffer. The trace buffer is cleared in tmc_etr_enable_hw(),
+                * so we don't have to explicitly clear it. Also, since the
+                * tracer is still enabled drvdata::buf can't be NULL.
+                */
+               tmc_etr_enable_hw(drvdata);
+       }
+
+       drvdata->reading = false;
+       spin_unlock_irqrestore(&drvdata->spinlock, flags);
+
+       /* Free allocated memory outside of the spinlock */
+       if (stale_vaddr)
+               dma_free_coherent(drvdata->dev, drvdata->size,
+                                 stale_vaddr, stale_paddr);
+
+       return 0;
+}
diff --git a/drivers/hwtracing/coresight/coresight-tmc.c b/drivers/hwtracing/coresight/coresight-tmc.c

index a57c7ec1661f915f9d7bc680cefc3558129d299a..9e02ac963cd0e3fbb38f9ee5e3fd082719f38bf7 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-tmc.c
+++ b/drivers/hwtracing/coresight/coresight-tmc.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Trace Memory Controller driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -11,7 +13,6 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
@@ -29,127 +30,27 @@
  #include <linux/amba/bus.h>
  
  #include "coresight-priv.h"
+#include "coresight-tmc.h"
  
-#define TMC_RSZ                        0x004
-#define TMC_STS                        0x00c
-#define TMC_RRD                        0x010
-#define TMC_RRP                        0x014
-#define TMC_RWP                        0x018
-#define TMC_TRG                        0x01c
-#define TMC_CTL                        0x020
-#define TMC_RWD                        0x024
-#define TMC_MODE               0x028
-#define TMC_LBUFLEVEL          0x02c
-#define TMC_CBUFLEVEL          0x030
-#define TMC_BUFWM              0x034
-#define TMC_RRPHI              0x038
-#define TMC_RWPHI              0x03c
-#define TMC_AXICTL             0x110
-#define TMC_DBALO              0x118
-#define TMC_DBAHI              0x11c
-#define TMC_FFSR               0x300
-#define TMC_FFCR               0x304
-#define TMC_PSCR               0x308
-#define TMC_ITMISCOP0          0xee0
-#define TMC_ITTRFLIN           0xee8
-#define TMC_ITATBDATA0         0xeec
-#define TMC_ITATBCTR2          0xef0
-#define TMC_ITATBCTR1          0xef4
-#define TMC_ITATBCTR0          0xef8
-
-/* register description */
-/* TMC_CTL - 0x020 */
-#define TMC_CTL_CAPT_EN                BIT(0)
-/* TMC_STS - 0x00C */
-#define TMC_STS_TRIGGERED      BIT(1)
-/* TMC_AXICTL - 0x110 */
-#define TMC_AXICTL_PROT_CTL_B0 BIT(0)
-#define TMC_AXICTL_PROT_CTL_B1 BIT(1)
-#define TMC_AXICTL_SCT_GAT_MODE        BIT(7)
-#define TMC_AXICTL_WR_BURST_LEN 0xF00
-/* TMC_FFCR - 0x304 */
-#define TMC_FFCR_EN_FMT                BIT(0)
-#define TMC_FFCR_EN_TI         BIT(1)
-#define TMC_FFCR_FON_FLIN      BIT(4)
-#define TMC_FFCR_FON_TRIG_EVT  BIT(5)
-#define TMC_FFCR_FLUSHMAN      BIT(6)
-#define TMC_FFCR_TRIGON_TRIGIN BIT(8)
-#define TMC_FFCR_STOP_ON_FLUSH BIT(12)
-
-#define TMC_STS_TRIGGERED_BIT  2
-#define TMC_FFCR_FLUSHMAN_BIT  6
-
-enum tmc_config_type {
-       TMC_CONFIG_TYPE_ETB,
-       TMC_CONFIG_TYPE_ETR,
-       TMC_CONFIG_TYPE_ETF,
-};
-
-enum tmc_mode {
-       TMC_MODE_CIRCULAR_BUFFER,
-       TMC_MODE_SOFTWARE_FIFO,
-       TMC_MODE_HARDWARE_FIFO,
-};
-
-enum tmc_mem_intf_width {
-       TMC_MEM_INTF_WIDTH_32BITS       = 0x2,
-       TMC_MEM_INTF_WIDTH_64BITS       = 0x3,
-       TMC_MEM_INTF_WIDTH_128BITS      = 0x4,
-       TMC_MEM_INTF_WIDTH_256BITS      = 0x5,
-};
-
-/**
- * struct tmc_drvdata - specifics associated to an TMC component
- * @base:      memory mapped base address for this component.
- * @dev:       the device entity associated to this component.
- * @csdev:     component vitals needed by the framework.
- * @miscdev:   specifics to handle "/dev/xyz.tmc" entry.
- * @spinlock:  only one at a time pls.
- * @read_count:        manages preparation of buffer for reading.
- * @buf:       area of memory where trace data get sent.
- * @paddr:     DMA start location in RAM.
- * @vaddr:     virtual representation of @paddr.
- * @size:      @buf size.
- * @enable:    this TMC is being used.
- * @config_type: TMC variant, must be of type @tmc_config_type.
- * @trigger_cntr: amount of words to store after a trigger.
- */
-struct tmc_drvdata {
-       void __iomem            *base;
-       struct device           *dev;
-       struct coresight_device *csdev;
-       struct miscdevice       miscdev;
-       spinlock_t              spinlock;
-       int                     read_count;
-       bool                    reading;
-       char                    *buf;
-       dma_addr_t              paddr;
-       void __iomem            *vaddr;
-       u32                     size;
-       bool                    enable;
-       enum tmc_config_type    config_type;
-       u32                     trigger_cntr;
-};
-
-static void tmc_wait_for_ready(struct tmc_drvdata *drvdata)
+void tmc_wait_for_tmcready(struct tmc_drvdata *drvdata)
  {
         /* Ensure formatter, unformatter and hardware fifo are empty */
         if (coresight_timeout(drvdata->base,
-                             TMC_STS, TMC_STS_TRIGGERED_BIT, 1)) {
+                             TMC_STS, TMC_STS_TMCREADY_BIT, 1)) {
                 dev_err(drvdata->dev,
                         "timeout observed when probing at offset %#x\n",
                         TMC_STS);
         }
  }
  
-static void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
+void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
  {
         u32 ffcr;
  
         ffcr = readl_relaxed(drvdata->base + TMC_FFCR);
         ffcr |= TMC_FFCR_STOP_ON_FLUSH;
         writel_relaxed(ffcr, drvdata->base + TMC_FFCR);
-       ffcr |= TMC_FFCR_FLUSHMAN;
+       ffcr |= BIT(TMC_FFCR_FLUSHMAN_BIT);
         writel_relaxed(ffcr, drvdata->base + TMC_FFCR);
         /* Ensure flush completes */
         if (coresight_timeout(drvdata->base,
@@ -159,343 +60,73 @@ static void tmc_flush_and_stop(struct tmc_drvdata *drvdata)
                         TMC_FFCR);
         }
  
-       tmc_wait_for_ready(drvdata);
+       tmc_wait_for_tmcready(drvdata);
  }
  
-static void tmc_enable_hw(struct tmc_drvdata *drvdata)
+void tmc_enable_hw(struct tmc_drvdata *drvdata)
  {
         writel_relaxed(TMC_CTL_CAPT_EN, drvdata->base + TMC_CTL);
  }
  
-static void tmc_disable_hw(struct tmc_drvdata *drvdata)
+void tmc_disable_hw(struct tmc_drvdata *drvdata)
  {
         writel_relaxed(0x0, drvdata->base + TMC_CTL);
  }
  
-static void tmc_etb_enable_hw(struct tmc_drvdata *drvdata)
-{
-       /* Zero out the memory to help with debug */
-       memset(drvdata->buf, 0, drvdata->size);
-
-       CS_UNLOCK(drvdata->base);
-
-       writel_relaxed(TMC_MODE_CIRCULAR_BUFFER, drvdata->base + TMC_MODE);
-       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI |
-                      TMC_FFCR_FON_FLIN | TMC_FFCR_FON_TRIG_EVT |
-                      TMC_FFCR_TRIGON_TRIGIN,
-                      drvdata->base + TMC_FFCR);
-
-       writel_relaxed(drvdata->trigger_cntr, drvdata->base + TMC_TRG);
-       tmc_enable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
-}
-
-static void tmc_etr_enable_hw(struct tmc_drvdata *drvdata)
-{
-       u32 axictl;
-
-       /* Zero out the memory to help with debug */
-       memset(drvdata->vaddr, 0, drvdata->size);
-
-       CS_UNLOCK(drvdata->base);
-
-       writel_relaxed(drvdata->size / 4, drvdata->base + TMC_RSZ);
-       writel_relaxed(TMC_MODE_CIRCULAR_BUFFER, drvdata->base + TMC_MODE);
-
-       axictl = readl_relaxed(drvdata->base + TMC_AXICTL);
-       axictl |= TMC_AXICTL_WR_BURST_LEN;
-       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
-       axictl &= ~TMC_AXICTL_SCT_GAT_MODE;
-       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
-       axictl = (axictl &
-                 ~(TMC_AXICTL_PROT_CTL_B0 | TMC_AXICTL_PROT_CTL_B1)) |
-                 TMC_AXICTL_PROT_CTL_B1;
-       writel_relaxed(axictl, drvdata->base + TMC_AXICTL);
-
-       writel_relaxed(drvdata->paddr, drvdata->base + TMC_DBALO);
-       writel_relaxed(0x0, drvdata->base + TMC_DBAHI);
-       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI |
-                      TMC_FFCR_FON_FLIN | TMC_FFCR_FON_TRIG_EVT |
-                      TMC_FFCR_TRIGON_TRIGIN,
-                      drvdata->base + TMC_FFCR);
-       writel_relaxed(drvdata->trigger_cntr, drvdata->base + TMC_TRG);
-       tmc_enable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
-}
-
-static void tmc_etf_enable_hw(struct tmc_drvdata *drvdata)
-{
-       CS_UNLOCK(drvdata->base);
-
-       writel_relaxed(TMC_MODE_HARDWARE_FIFO, drvdata->base + TMC_MODE);
-       writel_relaxed(TMC_FFCR_EN_FMT | TMC_FFCR_EN_TI,
-                      drvdata->base + TMC_FFCR);
-       writel_relaxed(0x0, drvdata->base + TMC_BUFWM);
-       tmc_enable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
-}
-
-static int tmc_enable(struct tmc_drvdata *drvdata, enum tmc_mode mode)
-{
-       unsigned long flags;
-
-       pm_runtime_get_sync(drvdata->dev);
-
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       if (drvdata->reading) {
-               spin_unlock_irqrestore(&drvdata->spinlock, flags);
-               pm_runtime_put(drvdata->dev);
-               return -EBUSY;
-       }
-
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETB) {
-               tmc_etb_enable_hw(drvdata);
-       } else if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
-               tmc_etr_enable_hw(drvdata);
-       } else {
-               if (mode == TMC_MODE_CIRCULAR_BUFFER)
-                       tmc_etb_enable_hw(drvdata);
-               else
-                       tmc_etf_enable_hw(drvdata);
-       }
-       drvdata->enable = true;
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-
-       dev_info(drvdata->dev, "TMC enabled\n");
-       return 0;
-}
-
-static int tmc_enable_sink(struct coresight_device *csdev)
-{
-       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       return tmc_enable(drvdata, TMC_MODE_CIRCULAR_BUFFER);
-}
-
-static int tmc_enable_link(struct coresight_device *csdev, int inport,
-                          int outport)
+static int tmc_read_prepare(struct tmc_drvdata *drvdata)
  {
-       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       return tmc_enable(drvdata, TMC_MODE_HARDWARE_FIFO);
-}
+       int ret = 0;
  
-static void tmc_etb_dump_hw(struct tmc_drvdata *drvdata)
-{
-       enum tmc_mem_intf_width memwidth;
-       u8 memwords;
-       char *bufp;
-       u32 read_data;
-       int i;
-
-       memwidth = BMVAL(readl_relaxed(drvdata->base + CORESIGHT_DEVID), 8, 10);
-       if (memwidth == TMC_MEM_INTF_WIDTH_32BITS)
-               memwords = 1;
-       else if (memwidth == TMC_MEM_INTF_WIDTH_64BITS)
-               memwords = 2;
-       else if (memwidth == TMC_MEM_INTF_WIDTH_128BITS)
-               memwords = 4;
-       else
-               memwords = 8;
-
-       bufp = drvdata->buf;
-       while (1) {
-               for (i = 0; i < memwords; i++) {
-                       read_data = readl_relaxed(drvdata->base + TMC_RRD);
-                       if (read_data == 0xFFFFFFFF)
-                               return;
-                       memcpy(bufp, &read_data, 4);
-                       bufp += 4;
-               }
+       switch (drvdata->config_type) {
+       case TMC_CONFIG_TYPE_ETB:
+       case TMC_CONFIG_TYPE_ETF:
+               ret = tmc_read_prepare_etb(drvdata);
+               break;
+       case TMC_CONFIG_TYPE_ETR:
+               ret = tmc_read_prepare_etr(drvdata);
+               break;
+       default:
+               ret = -EINVAL;
         }
-}
-
-static void tmc_etb_disable_hw(struct tmc_drvdata *drvdata)
-{
-       CS_UNLOCK(drvdata->base);
  
-       tmc_flush_and_stop(drvdata);
-       tmc_etb_dump_hw(drvdata);
-       tmc_disable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
-}
+       if (!ret)
+               dev_info(drvdata->dev, "TMC read start\n");
  
-static void tmc_etr_dump_hw(struct tmc_drvdata *drvdata)
-{
-       u32 rwp, val;
-
-       rwp = readl_relaxed(drvdata->base + TMC_RWP);
-       val = readl_relaxed(drvdata->base + TMC_STS);
-
-       /* How much memory do we still have */
-       if (val & BIT(0))
-               drvdata->buf = drvdata->vaddr + rwp - drvdata->paddr;
-       else
-               drvdata->buf = drvdata->vaddr;
-}
-
-static void tmc_etr_disable_hw(struct tmc_drvdata *drvdata)
-{
-       CS_UNLOCK(drvdata->base);
-
-       tmc_flush_and_stop(drvdata);
-       tmc_etr_dump_hw(drvdata);
-       tmc_disable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
-}
-
-static void tmc_etf_disable_hw(struct tmc_drvdata *drvdata)
-{
-       CS_UNLOCK(drvdata->base);
-
-       tmc_flush_and_stop(drvdata);
-       tmc_disable_hw(drvdata);
-
-       CS_LOCK(drvdata->base);
+       return ret;
  }
  
-static void tmc_disable(struct tmc_drvdata *drvdata, enum tmc_mode mode)
+static int tmc_read_unprepare(struct tmc_drvdata *drvdata)
  {
-       unsigned long flags;
-
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       if (drvdata->reading)
-               goto out;
+       int ret = 0;
  
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETB) {
-               tmc_etb_disable_hw(drvdata);
-       } else if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
-               tmc_etr_disable_hw(drvdata);
-       } else {
-               if (mode == TMC_MODE_CIRCULAR_BUFFER)
-                       tmc_etb_disable_hw(drvdata);
-               else
-                       tmc_etf_disable_hw(drvdata);
+       switch (drvdata->config_type) {
+       case TMC_CONFIG_TYPE_ETB:
+       case TMC_CONFIG_TYPE_ETF:
+               ret = tmc_read_unprepare_etb(drvdata);
+               break;
+       case TMC_CONFIG_TYPE_ETR:
+               ret = tmc_read_unprepare_etr(drvdata);
+               break;
+       default:
+               ret = -EINVAL;
         }
-out:
-       drvdata->enable = false;
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-
-       pm_runtime_put(drvdata->dev);
  
-       dev_info(drvdata->dev, "TMC disabled\n");
-}
+       if (!ret)
+               dev_info(drvdata->dev, "TMC read end\n");
  
-static void tmc_disable_sink(struct coresight_device *csdev)
-{
-       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       tmc_disable(drvdata, TMC_MODE_CIRCULAR_BUFFER);
-}
-
-static void tmc_disable_link(struct coresight_device *csdev, int inport,
-                            int outport)
-{
-       struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
-
-       tmc_disable(drvdata, TMC_MODE_HARDWARE_FIFO);
-}
-
-static const struct coresight_ops_sink tmc_sink_ops = {
-       .enable         = tmc_enable_sink,
-       .disable        = tmc_disable_sink,
-};
-
-static const struct coresight_ops_link tmc_link_ops = {
-       .enable         = tmc_enable_link,
-       .disable        = tmc_disable_link,
-};
-
-static const struct coresight_ops tmc_etb_cs_ops = {
-       .sink_ops       = &tmc_sink_ops,
-};
-
-static const struct coresight_ops tmc_etr_cs_ops = {
-       .sink_ops       = &tmc_sink_ops,
-};
-
-static const struct coresight_ops tmc_etf_cs_ops = {
-       .sink_ops       = &tmc_sink_ops,
-       .link_ops       = &tmc_link_ops,
-};
-
-static int tmc_read_prepare(struct tmc_drvdata *drvdata)
-{
-       int ret;
-       unsigned long flags;
-       enum tmc_mode mode;
-
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       if (!drvdata->enable)
-               goto out;
-
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETB) {
-               tmc_etb_disable_hw(drvdata);
-       } else if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
-               tmc_etr_disable_hw(drvdata);
-       } else {
-               mode = readl_relaxed(drvdata->base + TMC_MODE);
-               if (mode == TMC_MODE_CIRCULAR_BUFFER) {
-                       tmc_etb_disable_hw(drvdata);
-               } else {
-                       ret = -ENODEV;
-                       goto err;
-               }
-       }
-out:
-       drvdata->reading = true;
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-
-       dev_info(drvdata->dev, "TMC read start\n");
-       return 0;
-err:
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
         return ret;
  }
  
-static void tmc_read_unprepare(struct tmc_drvdata *drvdata)
-{
-       unsigned long flags;
-       enum tmc_mode mode;
-
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       if (!drvdata->enable)
-               goto out;
-
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETB) {
-               tmc_etb_enable_hw(drvdata);
-       } else if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
-               tmc_etr_enable_hw(drvdata);
-       } else {
-               mode = readl_relaxed(drvdata->base + TMC_MODE);
-               if (mode == TMC_MODE_CIRCULAR_BUFFER)
-                       tmc_etb_enable_hw(drvdata);
-       }
-out:
-       drvdata->reading = false;
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-
-       dev_info(drvdata->dev, "TMC read end\n");
-}
-
  static int tmc_open(struct inode *inode, struct file *file)
  {
+       int ret;
         struct tmc_drvdata *drvdata = container_of(file->private_data,
                                                    struct tmc_drvdata, miscdev);
-       int ret = 0;
-
-       if (drvdata->read_count++)
-               goto out;
  
         ret = tmc_read_prepare(drvdata);
         if (ret)
                 return ret;
-out:
+
         nonseekable_open(inode, file);
  
         dev_dbg(drvdata->dev, "%s: successfully opened\n", __func__);
@@ -535,19 +166,14 @@ static ssize_t tmc_read(struct file *file, char __user *data, size_t len,
  
  static int tmc_release(struct inode *inode, struct file *file)
  {
+       int ret;
         struct tmc_drvdata *drvdata = container_of(file->private_data,
                                                    struct tmc_drvdata, miscdev);
  
-       if (--drvdata->read_count) {
-               if (drvdata->read_count < 0) {
-                       dev_err(drvdata->dev, "mismatched close\n");
-                       drvdata->read_count = 0;
-               }
-               goto out;
-       }
+       ret = tmc_read_unprepare(drvdata);
+       if (ret)
+               return ret;
  
-       tmc_read_unprepare(drvdata);
-out:
         dev_dbg(drvdata->dev, "%s: released\n", __func__);
         return 0;
  }
@@ -560,56 +186,71 @@ static const struct file_operations tmc_fops = {
         .llseek         = no_llseek,
  };
  
-static ssize_t status_show(struct device *dev,
-                          struct device_attribute *attr, char *buf)
+static enum tmc_mem_intf_width tmc_get_memwidth(u32 devid)
  {
-       unsigned long flags;
-       u32 tmc_rsz, tmc_sts, tmc_rrp, tmc_rwp, tmc_trg;
-       u32 tmc_ctl, tmc_ffsr, tmc_ffcr, tmc_mode, tmc_pscr;
-       u32 devid;
-       struct tmc_drvdata *drvdata = dev_get_drvdata(dev->parent);
+       enum tmc_mem_intf_width memwidth;
  
-       pm_runtime_get_sync(drvdata->dev);
-       spin_lock_irqsave(&drvdata->spinlock, flags);
-       CS_UNLOCK(drvdata->base);
-
-       tmc_rsz = readl_relaxed(drvdata->base + TMC_RSZ);
-       tmc_sts = readl_relaxed(drvdata->base + TMC_STS);
-       tmc_rrp = readl_relaxed(drvdata->base + TMC_RRP);
-       tmc_rwp = readl_relaxed(drvdata->base + TMC_RWP);
-       tmc_trg = readl_relaxed(drvdata->base + TMC_TRG);
-       tmc_ctl = readl_relaxed(drvdata->base + TMC_CTL);
-       tmc_ffsr = readl_relaxed(drvdata->base + TMC_FFSR);
-       tmc_ffcr = readl_relaxed(drvdata->base + TMC_FFCR);
-       tmc_mode = readl_relaxed(drvdata->base + TMC_MODE);
-       tmc_pscr = readl_relaxed(drvdata->base + TMC_PSCR);
-       devid = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
+       /*
+        * Excerpt from the TRM:
+        *
+        * DEVID::MEMWIDTH[10:8]
+        * 0x2 Memory interface databus is 32 bits wide.
+        * 0x3 Memory interface databus is 64 bits wide.
+        * 0x4 Memory interface databus is 128 bits wide.
+        * 0x5 Memory interface databus is 256 bits wide.
+        */
+       switch (BMVAL(devid, 8, 10)) {
+       case 0x2:
+               memwidth = TMC_MEM_INTF_WIDTH_32BITS;
+               break;
+       case 0x3:
+               memwidth = TMC_MEM_INTF_WIDTH_64BITS;
+               break;
+       case 0x4:
+               memwidth = TMC_MEM_INTF_WIDTH_128BITS;
+               break;
+       case 0x5:
+               memwidth = TMC_MEM_INTF_WIDTH_256BITS;
+               break;
+       default:
+               memwidth = 0;
+       }
  
-       CS_LOCK(drvdata->base);
-       spin_unlock_irqrestore(&drvdata->spinlock, flags);
-       pm_runtime_put(drvdata->dev);
-
-       return sprintf(buf,
-                      "Depth:\t\t0x%x\n"
-                      "Status:\t\t0x%x\n"
-                      "RAM read ptr:\t0x%x\n"
-                      "RAM wrt ptr:\t0x%x\n"
-                      "Trigger cnt:\t0x%x\n"
-                      "Control:\t0x%x\n"
-                      "Flush status:\t0x%x\n"
-                      "Flush ctrl:\t0x%x\n"
-                      "Mode:\t\t0x%x\n"
-                      "PSRC:\t\t0x%x\n"
-                      "DEVID:\t\t0x%x\n",
-                       tmc_rsz, tmc_sts, tmc_rrp, tmc_rwp, tmc_trg,
-                       tmc_ctl, tmc_ffsr, tmc_ffcr, tmc_mode, tmc_pscr, devid);
-
-       return -EINVAL;
+       return memwidth;
  }
-static DEVICE_ATTR_RO(status);
  
-static ssize_t trigger_cntr_show(struct device *dev,
-                           struct device_attribute *attr, char *buf)
+#define coresight_tmc_simple_func(name, offset)                        \
+       coresight_simple_func(struct tmc_drvdata, name, offset)
+
+coresight_tmc_simple_func(rsz, TMC_RSZ);
+coresight_tmc_simple_func(sts, TMC_STS);
+coresight_tmc_simple_func(rrp, TMC_RRP);
+coresight_tmc_simple_func(rwp, TMC_RWP);
+coresight_tmc_simple_func(trg, TMC_TRG);
+coresight_tmc_simple_func(ctl, TMC_CTL);
+coresight_tmc_simple_func(ffsr, TMC_FFSR);
+coresight_tmc_simple_func(ffcr, TMC_FFCR);
+coresight_tmc_simple_func(mode, TMC_MODE);
+coresight_tmc_simple_func(pscr, TMC_PSCR);
+coresight_tmc_simple_func(devid, CORESIGHT_DEVID);
+
+static struct attribute *coresight_tmc_mgmt_attrs[] = {
+       &dev_attr_rsz.attr,
+       &dev_attr_sts.attr,
+       &dev_attr_rrp.attr,
+       &dev_attr_rwp.attr,
+       &dev_attr_trg.attr,
+       &dev_attr_ctl.attr,
+       &dev_attr_ffsr.attr,
+       &dev_attr_ffcr.attr,
+       &dev_attr_mode.attr,
+       &dev_attr_pscr.attr,
+       &dev_attr_devid.attr,
+       NULL,
+};
+
+ssize_t trigger_cntr_show(struct device *dev,
+                         struct device_attribute *attr, char *buf)
  {
         struct tmc_drvdata *drvdata = dev_get_drvdata(dev->parent);
         unsigned long val = drvdata->trigger_cntr;
@@ -634,26 +275,25 @@ static ssize_t trigger_cntr_store(struct device *dev,
  }
  static DEVICE_ATTR_RW(trigger_cntr);
  
-static struct attribute *coresight_etb_attrs[] = {
+static struct attribute *coresight_tmc_attrs[] = {
         &dev_attr_trigger_cntr.attr,
-       &dev_attr_status.attr,
         NULL,
  };
-ATTRIBUTE_GROUPS(coresight_etb);
  
-static struct attribute *coresight_etr_attrs[] = {
-       &dev_attr_trigger_cntr.attr,
-       &dev_attr_status.attr,
-       NULL,
+static const struct attribute_group coresight_tmc_group = {
+       .attrs = coresight_tmc_attrs,
  };
-ATTRIBUTE_GROUPS(coresight_etr);
  
-static struct attribute *coresight_etf_attrs[] = {
-       &dev_attr_trigger_cntr.attr,
-       &dev_attr_status.attr,
+static const struct attribute_group coresight_tmc_mgmt_group = {
+       .attrs = coresight_tmc_mgmt_attrs,
+       .name = "mgmt",
+};
+
+const struct attribute_group *coresight_tmc_groups[] = {
+       &coresight_tmc_group,
+       &coresight_tmc_mgmt_group,
         NULL,
  };
-ATTRIBUTE_GROUPS(coresight_etf);
  
  static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
  {
@@ -692,6 +332,7 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
  
         devid = readl_relaxed(drvdata->base + CORESIGHT_DEVID);
         drvdata->config_type = BMVAL(devid, 6, 7);
+       drvdata->memwidth = tmc_get_memwidth(devid);
  
         if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
                 if (np)
@@ -706,20 +347,6 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
  
         pm_runtime_put(&adev->dev);
  
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
-               drvdata->vaddr = dma_alloc_coherent(dev, drvdata->size,
-                                               &drvdata->paddr, GFP_KERNEL);
-               if (!drvdata->vaddr)
-                       return -ENOMEM;
-
-               memset(drvdata->vaddr, 0, drvdata->size);
-               drvdata->buf = drvdata->vaddr;
-       } else {
-               drvdata->buf = devm_kzalloc(dev, drvdata->size, GFP_KERNEL);
-               if (!drvdata->buf)
-                       return -ENOMEM;
-       }
-
         desc = devm_kzalloc(dev, sizeof(*desc), GFP_KERNEL);
         if (!desc) {
                 ret = -ENOMEM;
@@ -729,20 +356,18 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
         desc->pdata = pdata;
         desc->dev = dev;
         desc->subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER;
+       desc->groups = coresight_tmc_groups;
  
         if (drvdata->config_type == TMC_CONFIG_TYPE_ETB) {
                 desc->type = CORESIGHT_DEV_TYPE_SINK;
                 desc->ops = &tmc_etb_cs_ops;
-               desc->groups = coresight_etb_groups;
         } else if (drvdata->config_type == TMC_CONFIG_TYPE_ETR) {
                 desc->type = CORESIGHT_DEV_TYPE_SINK;
                 desc->ops = &tmc_etr_cs_ops;
-               desc->groups = coresight_etr_groups;
         } else {
                 desc->type = CORESIGHT_DEV_TYPE_LINKSINK;
                 desc->subtype.link_subtype = CORESIGHT_DEV_SUBTYPE_LINK_FIFO;
                 desc->ops = &tmc_etf_cs_ops;
-               desc->groups = coresight_etf_groups;
         }
  
         drvdata->csdev = coresight_register(desc);
@@ -758,7 +383,6 @@ static int tmc_probe(struct amba_device *adev, const struct amba_id *id)
         if (ret)
                 goto err_misc_register;
  
-       dev_info(dev, "TMC initialized\n");
         return 0;
  
  err_misc_register:
@@ -766,23 +390,10 @@ err_misc_register:
  err_devm_kzalloc:
         if (drvdata->config_type == TMC_CONFIG_TYPE_ETR)
                 dma_free_coherent(dev, drvdata->size,
-                               &drvdata->paddr, GFP_KERNEL);
+                               drvdata->vaddr, drvdata->paddr);
         return ret;
  }
  
-static int tmc_remove(struct amba_device *adev)
-{
-       struct tmc_drvdata *drvdata = amba_get_drvdata(adev);
-
-       misc_deregister(&drvdata->miscdev);
-       coresight_unregister(drvdata->csdev);
-       if (drvdata->config_type == TMC_CONFIG_TYPE_ETR)
-               dma_free_coherent(drvdata->dev, drvdata->size,
-                                 &drvdata->paddr, GFP_KERNEL);
-
-       return 0;
-}
-
  static struct amba_id tmc_ids[] = {
         {
                 .id     = 0x0003b961,
@@ -795,13 +406,9 @@ static struct amba_driver tmc_driver = {
         .drv = {
                 .name   = "coresight-tmc",
                 .owner  = THIS_MODULE,
+               .suppress_bind_attrs = true,
         },
         .probe          = tmc_probe,
-       .remove         = tmc_remove,
         .id_table       = tmc_ids,
  };
-
-module_amba_driver(tmc_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Trace Memory Controller driver");
+builtin_amba_driver(tmc_driver);
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h

new file mode 100644 (file)

index 0000000..5c5fe2a
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _CORESIGHT_TMC_H
+#define _CORESIGHT_TMC_H
+
+#include <linux/miscdevice.h>
+
+#define TMC_RSZ                        0x004
+#define TMC_STS                        0x00c
+#define TMC_RRD                        0x010
+#define TMC_RRP                        0x014
+#define TMC_RWP                        0x018
+#define TMC_TRG                        0x01c
+#define TMC_CTL                        0x020
+#define TMC_RWD                        0x024
+#define TMC_MODE               0x028
+#define TMC_LBUFLEVEL          0x02c
+#define TMC_CBUFLEVEL          0x030
+#define TMC_BUFWM              0x034
+#define TMC_RRPHI              0x038
+#define TMC_RWPHI              0x03c
+#define TMC_AXICTL             0x110
+#define TMC_DBALO              0x118
+#define TMC_DBAHI              0x11c
+#define TMC_FFSR               0x300
+#define TMC_FFCR               0x304
+#define TMC_PSCR               0x308
+#define TMC_ITMISCOP0          0xee0
+#define TMC_ITTRFLIN           0xee8
+#define TMC_ITATBDATA0         0xeec
+#define TMC_ITATBCTR2          0xef0
+#define TMC_ITATBCTR1          0xef4
+#define TMC_ITATBCTR0          0xef8
+
+/* register description */
+/* TMC_CTL - 0x020 */
+#define TMC_CTL_CAPT_EN                BIT(0)
+/* TMC_STS - 0x00C */
+#define TMC_STS_TMCREADY_BIT   2
+#define TMC_STS_FULL           BIT(0)
+#define TMC_STS_TRIGGERED      BIT(1)
+/* TMC_AXICTL - 0x110 */
+#define TMC_AXICTL_PROT_CTL_B0 BIT(0)
+#define TMC_AXICTL_PROT_CTL_B1 BIT(1)
+#define TMC_AXICTL_SCT_GAT_MODE        BIT(7)
+#define TMC_AXICTL_WR_BURST_16 0xF00
+/* TMC_FFCR - 0x304 */
+#define TMC_FFCR_FLUSHMAN_BIT  6
+#define TMC_FFCR_EN_FMT                BIT(0)
+#define TMC_FFCR_EN_TI         BIT(1)
+#define TMC_FFCR_FON_FLIN      BIT(4)
+#define TMC_FFCR_FON_TRIG_EVT  BIT(5)
+#define TMC_FFCR_TRIGON_TRIGIN BIT(8)
+#define TMC_FFCR_STOP_ON_FLUSH BIT(12)
+
+
+enum tmc_config_type {
+       TMC_CONFIG_TYPE_ETB,
+       TMC_CONFIG_TYPE_ETR,
+       TMC_CONFIG_TYPE_ETF,
+};
+
+enum tmc_mode {
+       TMC_MODE_CIRCULAR_BUFFER,
+       TMC_MODE_SOFTWARE_FIFO,
+       TMC_MODE_HARDWARE_FIFO,
+};
+
+enum tmc_mem_intf_width {
+       TMC_MEM_INTF_WIDTH_32BITS       = 1,
+       TMC_MEM_INTF_WIDTH_64BITS       = 2,
+       TMC_MEM_INTF_WIDTH_128BITS      = 4,
+       TMC_MEM_INTF_WIDTH_256BITS      = 8,
+};
+
+/**
+ * struct tmc_drvdata - specifics associated to a TMC component
+ * @base:      memory mapped base address for this component.
+ * @dev:       the device entity associated to this component.
+ * @csdev:     component vitals needed by the framework.
+ * @miscdev:   specifics to handle "/dev/xyz.tmc" entry.
+ * @spinlock:  only one at a time pls.
+ * @buf:       area of memory where trace data get sent.
+ * @paddr:     DMA start location in RAM.
+ * @vaddr:     virtual representation of @paddr.
+ * @size:      @buf size.
+ * @mode:      how this TMC is being used.
+ * @config_type: TMC variant, must be of type @tmc_config_type.
+ * @memwidth:  width of the memory interface databus, in bytes.
+ * @trigger_cntr: amount of words to store after a trigger.
+ */
+struct tmc_drvdata {
+       void __iomem            *base;
+       struct device           *dev;
+       struct coresight_device *csdev;
+       struct miscdevice       miscdev;
+       spinlock_t              spinlock;
+       bool                    reading;
+       char                    *buf;
+       dma_addr_t              paddr;
+       void __iomem            *vaddr;
+       u32                     size;
+       local_t                 mode;
+       enum tmc_config_type    config_type;
+       enum tmc_mem_intf_width memwidth;
+       u32                     trigger_cntr;
+};
+
+/* Generic functions */
+void tmc_wait_for_tmcready(struct tmc_drvdata *drvdata);
+void tmc_flush_and_stop(struct tmc_drvdata *drvdata);
+void tmc_enable_hw(struct tmc_drvdata *drvdata);
+void tmc_disable_hw(struct tmc_drvdata *drvdata);
+
+/* ETB/ETF functions */
+int tmc_read_prepare_etb(struct tmc_drvdata *drvdata);
+int tmc_read_unprepare_etb(struct tmc_drvdata *drvdata);
+extern const struct coresight_ops tmc_etb_cs_ops;
+extern const struct coresight_ops tmc_etf_cs_ops;
+
+/* ETR functions */
+int tmc_read_prepare_etr(struct tmc_drvdata *drvdata);
+int tmc_read_unprepare_etr(struct tmc_drvdata *drvdata);
+extern const struct coresight_ops tmc_etr_cs_ops;
+#endif
diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c

index 7214efd10db52f9c2273ea5e0f86193034c8bb18..4e471e2e9d896df24399c9e02cdd06a463ac27be 100644 (file)
--- a/drivers/hwtracing/coresight/coresight-tpiu.c
+++ b/drivers/hwtracing/coresight/coresight-tpiu.c
@@ -1,4 +1,6 @@
  /* Copyright (c) 2011-2012, The Linux Foundation. All rights reserved.
+ *
+ * Description: CoreSight Trace Port Interface Unit driver
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@ -11,7 +13,6 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/init.h>
  #include <linux/device.h>
  #include <linux/io.h>
@@ -70,11 +71,10 @@ static void tpiu_enable_hw(struct tpiu_drvdata *drvdata)
         CS_LOCK(drvdata->base);
  }
  
-static int tpiu_enable(struct coresight_device *csdev)
+static int tpiu_enable(struct coresight_device *csdev, u32 mode)
  {
         struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
-       pm_runtime_get_sync(csdev->dev.parent);
         tpiu_enable_hw(drvdata);
  
         dev_info(drvdata->dev, "TPIU enabled\n");
@@ -98,7 +98,6 @@ static void tpiu_disable(struct coresight_device *csdev)
         struct tpiu_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
  
         tpiu_disable_hw(drvdata);
-       pm_runtime_put(csdev->dev.parent);
  
         dev_info(drvdata->dev, "TPIU disabled\n");
  }
@@ -168,15 +167,6 @@ static int tpiu_probe(struct amba_device *adev, const struct amba_id *id)
         if (IS_ERR(drvdata->csdev))
                 return PTR_ERR(drvdata->csdev);
  
-       dev_info(dev, "TPIU initialized\n");
-       return 0;
-}
-
-static int tpiu_remove(struct amba_device *adev)
-{
-       struct tpiu_drvdata *drvdata = amba_get_drvdata(adev);
-
-       coresight_unregister(drvdata->csdev);
         return 0;
  }
  
@@ -223,13 +213,9 @@ static struct amba_driver tpiu_driver = {
                 .name   = "coresight-tpiu",
                 .owner  = THIS_MODULE,
                 .pm     = &tpiu_dev_pm_ops,
+               .suppress_bind_attrs = true,
         },
         .probe          = tpiu_probe,
-       .remove         = tpiu_remove,
         .id_table       = tpiu_ids,
  };
-
-module_amba_driver(tpiu_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("CoreSight Trace Port Interface Unit driver");
+builtin_amba_driver(tpiu_driver);
diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c

index 93738dfbf6313ea09f9f970ce463ec8374b4f661..508532b3fcac74f2d6dc5593fc5d20fb7944d8f7 100644 (file)
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -11,7 +11,6 @@
   */
  
  #include <linux/kernel.h>
-#include <linux/module.h>
  #include <linux/init.h>
  #include <linux/types.h>
  #include <linux/device.h>
@@ -24,11 +23,36 @@
  #include <linux/coresight.h>
  #include <linux/of_platform.h>
  #include <linux/delay.h>
+#include <linux/pm_runtime.h>
  
  #include "coresight-priv.h"
  
  static DEFINE_MUTEX(coresight_mutex);
  
+/**
+ * struct coresight_node - elements of a path, from source to sink
+ * @csdev:     Address of an element.
+ * @link:      hook to the list.
+ */
+struct coresight_node {
+       struct coresight_device *csdev;
+       struct list_head link;
+};
+
+/*
+ * When operating Coresight drivers from the sysFS interface, only a single
+ * path can exist from a tracer (associated to a CPU) to a sink.
+ */
+static DEFINE_PER_CPU(struct list_head *, tracer_path);
+
+/*
+ * As of this writing only a single STM can be found in CS topologies.  Since
+ * there is no way to know if we'll ever see more and what kind of
+ * configuration they will enact, for the time being only define a single path
+ * for STM.
+ */
+static struct list_head *stm_path;
+
  static int coresight_id_match(struct device *dev, void *data)
  {
         int trace_id, i_trace_id;
@@ -68,15 +92,12 @@ static int coresight_source_is_unique(struct coresight_device *csdev)
                                  csdev, coresight_id_match);
  }
  
-static int coresight_find_link_inport(struct coresight_device *csdev)
+static int coresight_find_link_inport(struct coresight_device *csdev,
+                                     struct coresight_device *parent)
  {
         int i;
-       struct coresight_device *parent;
         struct coresight_connection *conn;
  
-       parent = container_of(csdev->path_link.next,
-                             struct coresight_device, path_link);
-
         for (i = 0; i < parent->nr_outport; i++) {
                 conn = &parent->conns[i];
                 if (conn->child_dev == csdev)
@@ -89,15 +110,12 @@ static int coresight_find_link_inport(struct coresight_device *csdev)
         return 0;
  }
  
-static int coresight_find_link_outport(struct coresight_device *csdev)
+static int coresight_find_link_outport(struct coresight_device *csdev,
+                                      struct coresight_device *child)
  {
         int i;
-       struct coresight_device *child;
         struct coresight_connection *conn;
  
-       child = container_of(csdev->path_link.prev,
-                            struct coresight_device, path_link);
-
         for (i = 0; i < csdev->nr_outport; i++) {
                 conn = &csdev->conns[i];
                 if (conn->child_dev == child)
@@ -110,13 +128,13 @@ static int coresight_find_link_outport(struct coresight_device *csdev)
         return 0;
  }
  
-static int coresight_enable_sink(struct coresight_device *csdev)
+static int coresight_enable_sink(struct coresight_device *csdev, u32 mode)
  {
         int ret;
  
         if (!csdev->enable) {
                 if (sink_ops(csdev)->enable) {
-                       ret = sink_ops(csdev)->enable(csdev);
+                       ret = sink_ops(csdev)->enable(csdev, mode);
                         if (ret)
                                 return ret;
                 }
@@ -138,14 +156,19 @@ static void coresight_disable_sink(struct coresight_device *csdev)
         }
  }
  
-static int coresight_enable_link(struct coresight_device *csdev)
+static int coresight_enable_link(struct coresight_device *csdev,
+                                struct coresight_device *parent,
+                                struct coresight_device *child)
  {
         int ret;
         int link_subtype;
         int refport, inport, outport;
  
-       inport = coresight_find_link_inport(csdev);
-       outport = coresight_find_link_outport(csdev);
+       if (!parent || !child)
+               return -EINVAL;
+
+       inport = coresight_find_link_inport(csdev, parent);
+       outport = coresight_find_link_outport(csdev, child);
         link_subtype = csdev->subtype.link_subtype;
  
         if (link_subtype == CORESIGHT_DEV_SUBTYPE_LINK_MERG)
@@ -168,14 +191,19 @@ static int coresight_enable_link(struct coresight_device *csdev)
         return 0;
  }
  
-static void coresight_disable_link(struct coresight_device *csdev)
+static void coresight_disable_link(struct coresight_device *csdev,
+                                  struct coresight_device *parent,
+                                  struct coresight_device *child)
  {
         int i, nr_conns;
         int link_subtype;
         int refport, inport, outport;
  
-       inport = coresight_find_link_inport(csdev);
-       outport = coresight_find_link_outport(csdev);
+       if (!parent || !child)
+               return;
+
+       inport = coresight_find_link_inport(csdev, parent);
+       outport = coresight_find_link_outport(csdev, child);
         link_subtype = csdev->subtype.link_subtype;
  
         if (link_subtype == CORESIGHT_DEV_SUBTYPE_LINK_MERG) {
@@ -201,7 +229,7 @@ static void coresight_disable_link(struct coresight_device *csdev)
         csdev->enable = false;
  }
  
-static int coresight_enable_source(struct coresight_device *csdev)
+static int coresight_enable_source(struct coresight_device *csdev, u32 mode)
  {
         int ret;
  
@@ -213,7 +241,7 @@ static int coresight_enable_source(struct coresight_device *csdev)
  
         if (!csdev->enable) {
                 if (source_ops(csdev)->enable) {
-                       ret = source_ops(csdev)->enable(csdev);
+                       ret = source_ops(csdev)->enable(csdev, NULL, mode);
                         if (ret)
                                 return ret;
                 }
@@ -235,147 +263,343 @@ static void coresight_disable_source(struct coresight_device *csdev)
         }
  }
  
-static int coresight_enable_path(struct list_head *path)
+void coresight_disable_path(struct list_head *path)
  {
-       int ret = 0;
-       struct coresight_device *cd;
+       u32 type;
+       struct coresight_node *nd;
+       struct coresight_device *csdev, *parent, *child;
  
-       /*
-        * At this point we have a full @path, from source to sink.  The
-        * sink is the first entry and the source the last one.  Go through
-        * all the components and enable them one by one.
-        */
-       list_for_each_entry(cd, path, path_link) {
-               if (cd == list_first_entry(path, struct coresight_device,
-                                          path_link)) {
-                       ret = coresight_enable_sink(cd);
-               } else if (list_is_last(&cd->path_link, path)) {
-                       /*
-                        * Don't enable the source just yet - this needs to
-                        * happen at the very end when all links and sink
-                        * along the path have been configured properly.
-                        */
-                       ;
-               } else {
-                       ret = coresight_enable_link(cd);
+       list_for_each_entry(nd, path, link) {
+               csdev = nd->csdev;
+               type = csdev->type;
+
+               /*
+                * ETF devices are tricky... They can be a link or a sink,
+                * depending on how they are configured.  If an ETF has been
+                * "activated" it will be configured as a sink, otherwise
+                * go ahead with the link configuration.
+                */
+               if (type == CORESIGHT_DEV_TYPE_LINKSINK)
+                       type = (csdev == coresight_get_sink(path)) ?
+                                               CORESIGHT_DEV_TYPE_SINK :
+                                               CORESIGHT_DEV_TYPE_LINK;
+
+               switch (type) {
+               case CORESIGHT_DEV_TYPE_SINK:
+                       coresight_disable_sink(csdev);
+                       break;
+               case CORESIGHT_DEV_TYPE_SOURCE:
+                       /* sources are disabled from either sysFS or Perf */
+                       break;
+               case CORESIGHT_DEV_TYPE_LINK:
+                       parent = list_prev_entry(nd, link)->csdev;
+                       child = list_next_entry(nd, link)->csdev;
+                       coresight_disable_link(csdev, parent, child);
+                       break;
+               default:
+                       break;
                 }
-               if (ret)
-                       goto err;
         }
+}
  
-       return 0;
-err:
-       list_for_each_entry_continue_reverse(cd, path, path_link) {
-               if (cd == list_first_entry(path, struct coresight_device,
-                                          path_link)) {
-                       coresight_disable_sink(cd);
-               } else if (list_is_last(&cd->path_link, path)) {
-                       ;
-               } else {
-                       coresight_disable_link(cd);
+int coresight_enable_path(struct list_head *path, u32 mode)
+{
+
+       int ret = 0;
+       u32 type;
+       struct coresight_node *nd;
+       struct coresight_device *csdev, *parent, *child;
+
+       list_for_each_entry_reverse(nd, path, link) {
+               csdev = nd->csdev;
+               type = csdev->type;
+
+               /*
+                * ETF devices are tricky... They can be a link or a sink,
+                * depending on how they are configured.  If an ETF has been
+                * "activated" it will be configured as a sink, otherwise
+                * go ahead with the link configuration.
+                */
+               if (type == CORESIGHT_DEV_TYPE_LINKSINK)
+                       type = (csdev == coresight_get_sink(path)) ?
+                                               CORESIGHT_DEV_TYPE_SINK :
+                                               CORESIGHT_DEV_TYPE_LINK;
+
+               switch (type) {
+               case CORESIGHT_DEV_TYPE_SINK:
+                       ret = coresight_enable_sink(csdev, mode);
+                       if (ret)
+                               goto err;
+                       break;
+               case CORESIGHT_DEV_TYPE_SOURCE:
+                       /* sources are enabled from either sysFS or Perf */
+                       break;
+               case CORESIGHT_DEV_TYPE_LINK:
+                       parent = list_prev_entry(nd, link)->csdev;
+                       child = list_next_entry(nd, link)->csdev;
+                       ret = coresight_enable_link(csdev, parent, child);
+                       if (ret)
+                               goto err;
+                       break;
+               default:
+                       goto err;
                 }
         }
  
+out:
         return ret;
+err:
+       coresight_disable_path(path);
+       goto out;
  }
  
-static int coresight_disable_path(struct list_head *path)
+struct coresight_device *coresight_get_sink(struct list_head *path)
  {
-       struct coresight_device *cd;
+       struct coresight_device *csdev;
  
-       list_for_each_entry_reverse(cd, path, path_link) {
-               if (cd == list_first_entry(path, struct coresight_device,
-                                          path_link)) {
-                       coresight_disable_sink(cd);
-               } else if (list_is_last(&cd->path_link, path)) {
-                       /*
-                        * The source has already been stopped, no need
-                        * to do it again here.
-                        */
-                       ;
-               } else {
-                       coresight_disable_link(cd);
+       if (!path)
+               return NULL;
+
+       csdev = list_last_entry(path, struct coresight_node, link)->csdev;
+       if (csdev->type != CORESIGHT_DEV_TYPE_SINK &&
+           csdev->type != CORESIGHT_DEV_TYPE_LINKSINK)
+               return NULL;
+
+       return csdev;
+}
+
+/**
+ * _coresight_build_path - recursively build a path from a @csdev to a sink.
+ * @csdev:     The device to start from.
+ * @sink:      The name of the sink this path should connect with.
+ * @path:      The list to add devices to.
+ *
+ * The tree of Coresight device is traversed until an activated sink or
+ * the one specified by @sink is found.
+ * From there the sink is added to the list along with all the devices that
+ * led to that point - the end result is a list from source to sink. In that
+ * list the source is the first device and the sink the last one.
+ */
+static int _coresight_build_path(struct coresight_device *csdev,
+                                const char *sink, struct list_head *path)
+{
+       int i;
+       bool found = false;
+       struct coresight_node *node;
+
+       /*
+        * First see if we are dealing with a sink.  If we have one check if
+        * it was selected via sysFS or the perf cmd line.
+        */
+       if (csdev->type == CORESIGHT_DEV_TYPE_SINK ||
+           csdev->type == CORESIGHT_DEV_TYPE_LINKSINK) {
+               /* Activated via perf cmd line */
+               if (sink && !strcmp(dev_name(&csdev->dev), sink))
+                       goto out;
+               /* Activated via sysFS */
+               if (csdev->activated)
+                       goto out;
+       }
+
+       /* Not a sink - recursively explore each port found on this element */
+       for (i = 0; i < csdev->nr_outport; i++) {
+               struct coresight_device *child_dev = csdev->conns[i].child_dev;
+
+               if (child_dev &&
+                   _coresight_build_path(child_dev, sink, path) == 0) {
+                       found = true;
+                       break;
                 }
         }
  
+       if (!found)
+               return -ENODEV;
+
+out:
+       /*
+        * A path from this element to a sink has been found.  The elements
+        * leading to the sink are already enqueued, all that is left to do
+        * is tell the PM runtime core we need this element and add a node
+        * for it.
+        */
+       node = kzalloc(sizeof(struct coresight_node), GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       node->csdev = csdev;
+       list_add(&node->link, path);
+       pm_runtime_get_sync(csdev->dev.parent);
+
         return 0;
  }
  
-static int coresight_build_paths(struct coresight_device *csdev,
-                                struct list_head *path,
-                                bool enable)
+struct list_head *coresight_build_path(struct coresight_device *csdev,
+                                      const char *sink)
  {
-       int i, ret = -EINVAL;
-       struct coresight_connection *conn;
+       struct list_head *path;
+       int rc;
  
-       list_add(&csdev->path_link, path);
+       path = kzalloc(sizeof(struct list_head), GFP_KERNEL);
+       if (!path)
+               return ERR_PTR(-ENOMEM); /* checked with IS_ERR() */
  
-       if ((csdev->type == CORESIGHT_DEV_TYPE_SINK ||
-           csdev->type == CORESIGHT_DEV_TYPE_LINKSINK) &&
-           csdev->activated) {
-               if (enable)
-                       ret = coresight_enable_path(path);
-               else
-                       ret = coresight_disable_path(path);
-       } else {
-               for (i = 0; i < csdev->nr_outport; i++) {
-                       conn = &csdev->conns[i];
-                       if (coresight_build_paths(conn->child_dev,
-                                                   path, enable) == 0)
-                               ret = 0;
-               }
+       INIT_LIST_HEAD(path);
+
+       rc = _coresight_build_path(csdev, sink, path);
+       if (rc) {
+               kfree(path);
+               return ERR_PTR(rc);
         }
  
-       if (list_first_entry(path, struct coresight_device, path_link) != csdev)
-               dev_err(&csdev->dev, "wrong device in %s\n", __func__);
+       return path;
+}
  
-       list_del(&csdev->path_link);
+/**
+ * coresight_release_path - release a previously built path.
+ * @path:      the path to release.
+ *
+ * Go through all the elements of a path and 1) remove each from the list and
+ * 2) free the memory allocated for each node.
+ */
+void coresight_release_path(struct list_head *path)
+{
+       struct coresight_device *csdev;
+       struct coresight_node *nd, *next;
  
-       return ret;
+       list_for_each_entry_safe(nd, next, path, link) {
+               csdev = nd->csdev;
+
+               pm_runtime_put_sync(csdev->dev.parent);
+               list_del(&nd->link);
+               kfree(nd);
+       }
+
+       kfree(path);
+       path = NULL;
+}
+
+/** coresight_validate_source - make sure a source has the right credentials
+ *  @csdev:    the device structure for a source.
+ *  @function: the function this was called from.
+ *
+ * Assumes the coresight_mutex is held.
+ */
+static int coresight_validate_source(struct coresight_device *csdev,
+                                    const char *function)
+{
+       u32 type, subtype;
+
+       type = csdev->type;
+       subtype = csdev->subtype.source_subtype;
+
+       if (type != CORESIGHT_DEV_TYPE_SOURCE) {
+               dev_err(&csdev->dev, "wrong device type in %s\n", function);
+               return -EINVAL;
+       }
+
+       if (subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_PROC &&
+           subtype != CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE) {
+               dev_err(&csdev->dev, "wrong device subtype in %s\n", function);
+               return -EINVAL;
+       }
+
+       return 0;
  }
  
  int coresight_enable(struct coresight_device *csdev)
  {
-       int ret = 0;
-       LIST_HEAD(path);
+       int cpu, ret = 0;
+       struct list_head *path;
  
         mutex_lock(&coresight_mutex);
-       if (csdev->type != CORESIGHT_DEV_TYPE_SOURCE) {
-               ret = -EINVAL;
-               dev_err(&csdev->dev, "wrong device type in %s\n", __func__);
+
+       ret = coresight_validate_source(csdev, __func__);
+       if (ret)
                 goto out;
-       }
+
         if (csdev->enable)
                 goto out;
  
-       if (coresight_build_paths(csdev, &path, true)) {
-               dev_err(&csdev->dev, "building path(s) failed\n");
+       path = coresight_build_path(csdev, NULL);
+       if (IS_ERR(path)) {
+               pr_err("building path(s) failed\n");
+               ret = PTR_ERR(path);
                 goto out;
         }
  
-       if (coresight_enable_source(csdev))
-               dev_err(&csdev->dev, "source enable failed\n");
+       ret = coresight_enable_path(path, CS_MODE_SYSFS);
+       if (ret)
+               goto err_path;
+
+       ret = coresight_enable_source(csdev, CS_MODE_SYSFS);
+       if (ret)
+               goto err_source;
+
+       switch (csdev->subtype.source_subtype) {
+       case CORESIGHT_DEV_SUBTYPE_SOURCE_PROC:
+               /*
+                * When working from sysFS it is important to keep track
+                * of the paths that were created so that they can be
+                * undone in 'coresight_disable()'.  Since there can only
+                * be a single session per tracer (when working from sysFS)
+                * a per-cpu variable will do just fine.
+                */
+               cpu = source_ops(csdev)->cpu_id(csdev);
+               per_cpu(tracer_path, cpu) = path;
+               break;
+       case CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE:
+               stm_path = path;
+               break;
+       default:
+               /* We can't be here */
+               break;
+       }
+
  out:
         mutex_unlock(&coresight_mutex);
         return ret;
+
+err_source:
+       coresight_disable_path(path);
+
+err_path:
+       coresight_release_path(path);
+       goto out;
  }
  EXPORT_SYMBOL_GPL(coresight_enable);
  
  void coresight_disable(struct coresight_device *csdev)
  {
-       LIST_HEAD(path);
+       int cpu, ret;
+       struct list_head *path = NULL;
  
         mutex_lock(&coresight_mutex);
-       if (csdev->type != CORESIGHT_DEV_TYPE_SOURCE) {
-               dev_err(&csdev->dev, "wrong device type in %s\n", __func__);
+
+       ret = coresight_validate_source(csdev, __func__);
+       if (ret)
                 goto out;
-       }
+
         if (!csdev->enable)
                 goto out;
  
+       switch (csdev->subtype.source_subtype) {
+       case CORESIGHT_DEV_SUBTYPE_SOURCE_PROC:
+               cpu = source_ops(csdev)->cpu_id(csdev);
+               path = per_cpu(tracer_path, cpu);
+               per_cpu(tracer_path, cpu) = NULL;
+               break;
+       case CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE:
+               path = stm_path;
+               stm_path = NULL;
+               break;
+       default:
+               /* We can't be here */
+               break;
+       }
+
         coresight_disable_source(csdev);
-       if (coresight_build_paths(csdev, &path, false))
-               dev_err(&csdev->dev, "releasing path(s) failed\n");
+       coresight_disable_path(path);
+       coresight_release_path(path);
  
  out:
         mutex_unlock(&coresight_mutex);
@@ -387,7 +611,7 @@ static ssize_t enable_sink_show(struct device *dev,
  {
         struct coresight_device *csdev = to_coresight_device(dev);
  
-       return scnprintf(buf, PAGE_SIZE, "%u\n", (unsigned)csdev->activated);
+       return scnprintf(buf, PAGE_SIZE, "%u\n", csdev->activated);
  }
  
  static ssize_t enable_sink_store(struct device *dev,
@@ -417,7 +641,7 @@ static ssize_t enable_source_show(struct device *dev,
  {
         struct coresight_device *csdev = to_coresight_device(dev);
  
-       return scnprintf(buf, PAGE_SIZE, "%u\n", (unsigned)csdev->enable);
+       return scnprintf(buf, PAGE_SIZE, "%u\n", csdev->enable);
  }
  
  static ssize_t enable_source_store(struct device *dev,
@@ -481,6 +705,8 @@ static void coresight_device_release(struct device *dev)
  {
         struct coresight_device *csdev = to_coresight_device(dev);
  
+       kfree(csdev->conns);
+       kfree(csdev->refcnt);
         kfree(csdev);
  }
  
@@ -536,7 +762,7 @@ static void coresight_fixup_orphan_conns(struct coresight_device *csdev)
          * are hooked-up with each newly added component.
          */
         bus_for_each_dev(&coresight_bustype, NULL,
-                                csdev, coresight_orphan_match);
+                        csdev, coresight_orphan_match);
  }
  
  
@@ -568,6 +794,8 @@ static void coresight_fixup_device_conns(struct coresight_device *csdev)
  
                 if (dev) {
                         conn->child_dev = to_coresight_device(dev);
+                       /* and put reference from 'bus_find_device()' */
+                       put_device(dev);
                 } else {
                         csdev->orphan = true;
                         conn->child_dev = NULL;
@@ -575,6 +803,50 @@ static void coresight_fixup_device_conns(struct coresight_device *csdev)
         }
  }
  
+static int coresight_remove_match(struct device *dev, void *data)
+{
+       int i;
+       struct coresight_device *csdev, *iterator;
+       struct coresight_connection *conn;
+
+       csdev = data;
+       iterator = to_coresight_device(dev);
+
+       /* No need to check oneself */
+       if (csdev == iterator)
+               return 0;
+
+       /*
+        * Circle through all the connections of that component.  If we find
+        * a connection whose name matches @csdev, remove it.
+        */
+       for (i = 0; i < iterator->nr_outport; i++) {
+               conn = &iterator->conns[i];
+
+               if (conn->child_dev == NULL)
+                       continue;
+
+               if (!strcmp(dev_name(&csdev->dev), conn->child_name)) {
+                       iterator->orphan = true;
+                       conn->child_dev = NULL;
+                       /* No need to continue */
+                       break;
+               }
+       }
+
+       /*
+        * Returning '0' ensures that all known components on the
+        * bus will be checked.
+        */
+       return 0;
+}
+
+static void coresight_remove_conns(struct coresight_device *csdev)
+{
+       bus_for_each_dev(&coresight_bustype, NULL,
+                        csdev, coresight_remove_match);
+}
+
  /**
   * coresight_timeout - loop until a bit has changed to a specific state.
   * @addr: base address of the area of interest.
@@ -713,13 +985,8 @@ EXPORT_SYMBOL_GPL(coresight_register);
  
  void coresight_unregister(struct coresight_device *csdev)
  {
-       mutex_lock(&coresight_mutex);
-
-       kfree(csdev->conns);
+       /* Remove references of that device in the topology */
+       coresight_remove_conns(csdev);
         device_unregister(&csdev->dev);
-
-       mutex_unlock(&coresight_mutex);
  }
  EXPORT_SYMBOL_GPL(coresight_unregister);
-
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/hwtracing/coresight/of_coresight.c b/drivers/hwtracing/coresight/of_coresight.c

index b0973617826f62b41b31072a7ca3a7499319d4ad..b68da1888fd515879a43df8f6173dd77cdb8754e 100644 (file)
--- a/drivers/hwtracing/coresight/of_coresight.c
+++ b/drivers/hwtracing/coresight/of_coresight.c
@@ -10,7 +10,6 @@
   * GNU General Public License for more details.
   */
  
-#include <linux/module.h>
  #include <linux/types.h>
  #include <linux/err.h>
  #include <linux/slab.h>
@@ -86,7 +85,7 @@ static int of_coresight_alloc_memory(struct device *dev,
                 return -ENOMEM;
  
         /* Children connected to this component via @outports */
-        pdata->child_names = devm_kzalloc(dev, pdata->nr_outport *
+       pdata->child_names = devm_kzalloc(dev, pdata->nr_outport *
                                           sizeof(*pdata->child_names),
                                           GFP_KERNEL);
         if (!pdata->child_names)
diff --git a/drivers/hwtracing/stm/Kconfig b/drivers/hwtracing/stm/Kconfig

index e7a348807f0cb2ce91fd48e2d133f034830763be..847a39b353078618fa9009691598b4ed1d5ef583 100644 (file)
--- a/drivers/hwtracing/stm/Kconfig
+++ b/drivers/hwtracing/stm/Kconfig
@@ -9,6 +9,8 @@ config STM
  
           Say Y here to enable System Trace Module device support.
  
+if STM
+
  config STM_DUMMY
         tristate "Dummy STM driver"
         help
@@ -25,3 +27,16 @@ config STM_SOURCE_CONSOLE
  
           If you want to send kernel console messages over STM devices,
           say Y.
+
+config STM_SOURCE_HEARTBEAT
+       tristate "Heartbeat over STM devices"
+       help
+         This is a kernel space trace source that sends periodic
+         heartbeat messages to trace hosts over STM devices. It is
+         also useful for testing stm class drivers and the stm class
+         framework itself.
+
+         If you want to send heartbeat messages over STM devices,
+         say Y.
+
+endif
diff --git a/drivers/hwtracing/stm/Makefile b/drivers/hwtracing/stm/Makefile

index f9312c38dd7a8bcbfd1fce502c600229f4ff2930..a9ce3d487e5787d18eafddd06fa1be1b611ce457 100644 (file)
--- a/drivers/hwtracing/stm/Makefile
+++ b/drivers/hwtracing/stm/Makefile
@@ -5,5 +5,7 @@ stm_core-y              := core.o policy.o
  obj-$(CONFIG_STM_DUMMY)        += dummy_stm.o
  
  obj-$(CONFIG_STM_SOURCE_CONSOLE)       += stm_console.o
+obj-$(CONFIG_STM_SOURCE_HEARTBEAT)     += stm_heartbeat.o
  
  stm_console-y          := console.o
+stm_heartbeat-y                := heartbeat.o
diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c

index b6445d9e54533d224a89fbe98ad7fcfad52ac19d..02095410cb338ecd9bc6d1bc8f9b6619d096254d 100644 (file)
--- a/drivers/hwtracing/stm/core.c
+++ b/drivers/hwtracing/stm/core.c
@@ -67,9 +67,24 @@ static ssize_t channels_show(struct device *dev,
  
  static DEVICE_ATTR_RO(channels);
  
+static ssize_t hw_override_show(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       struct stm_device *stm = to_stm_device(dev);
+       int ret;
+
+       ret = sprintf(buf, "%u\n", stm->data->hw_override);
+
+       return ret;
+}
+
+static DEVICE_ATTR_RO(hw_override);
+
  static struct attribute *stm_attrs[] = {
         &dev_attr_masters.attr,
         &dev_attr_channels.attr,
+       &dev_attr_hw_override.attr,
         NULL,
  };
  
@@ -113,6 +128,7 @@ struct stm_device *stm_find_device(const char *buf)
  
         stm = to_stm_device(dev);
         if (!try_module_get(stm->owner)) {
+               /* matches class_find_device() above */
                 put_device(dev);
                 return NULL;
         }
@@ -125,7 +141,7 @@ struct stm_device *stm_find_device(const char *buf)
   * @stm:       stm device, previously acquired by stm_find_device()
   *
   * This drops the module reference and device reference taken by
- * stm_find_device().
+ * stm_find_device() or stm_char_open().
   */
  void stm_put_device(struct stm_device *stm)
  {
@@ -185,6 +201,9 @@ static void stm_output_claim(struct stm_device *stm, struct stm_output *output)
  {
         struct stp_master *master = stm_master(stm, output->master);
  
+       lockdep_assert_held(&stm->mc_lock);
+       lockdep_assert_held(&output->lock);
+
         if (WARN_ON_ONCE(master->nr_free < output->nr_chans))
                 return;
  
@@ -199,6 +218,9 @@ stm_output_disclaim(struct stm_device *stm, struct stm_output *output)
  {
         struct stp_master *master = stm_master(stm, output->master);
  
+       lockdep_assert_held(&stm->mc_lock);
+       lockdep_assert_held(&output->lock);
+
         bitmap_release_region(&master->chan_map[0], output->channel,
                               ilog2(output->nr_chans));
  
@@ -288,6 +310,7 @@ static int stm_output_assign(struct stm_device *stm, unsigned int width,
         }
  
         spin_lock(&stm->mc_lock);
+       spin_lock(&output->lock);
         /* output is already assigned -- shouldn't happen */
         if (WARN_ON_ONCE(output->nr_chans))
                 goto unlock;
@@ -304,6 +327,7 @@ static int stm_output_assign(struct stm_device *stm, unsigned int width,
  
         ret = 0;
  unlock:
+       spin_unlock(&output->lock);
         spin_unlock(&stm->mc_lock);
  
         return ret;
@@ -312,11 +336,18 @@ unlock:
  static void stm_output_free(struct stm_device *stm, struct stm_output *output)
  {
         spin_lock(&stm->mc_lock);
+       spin_lock(&output->lock);
         if (output->nr_chans)
                 stm_output_disclaim(stm, output);
+       spin_unlock(&output->lock);
         spin_unlock(&stm->mc_lock);
  }
  
+static void stm_output_init(struct stm_output *output)
+{
+       spin_lock_init(&output->lock);
+}
+
  static int major_match(struct device *dev, const void *data)
  {
         unsigned int major = *(unsigned int *)data;
@@ -339,6 +370,7 @@ static int stm_char_open(struct inode *inode, struct file *file)
         if (!stmf)
                 return -ENOMEM;
  
+       stm_output_init(&stmf->output);
         stmf->stm = to_stm_device(dev);
  
         if (!try_module_get(stmf->stm->owner))
@@ -349,6 +381,8 @@ static int stm_char_open(struct inode *inode, struct file *file)
         return nonseekable_open(inode, file);
  
  err_free:
+       /* matches class_find_device() above */
+       put_device(dev);
         kfree(stmf);
  
         return err;
@@ -357,9 +391,19 @@ err_free:
  static int stm_char_release(struct inode *inode, struct file *file)
  {
         struct stm_file *stmf = file->private_data;
+       struct stm_device *stm = stmf->stm;
+
+       if (stm->data->unlink)
+               stm->data->unlink(stm->data, stmf->output.master,
+                                 stmf->output.channel);
+
+       stm_output_free(stm, &stmf->output);
  
-       stm_output_free(stmf->stm, &stmf->output);
-       stm_put_device(stmf->stm);
+       /*
+        * matches the stm_char_open()'s
+        * class_find_device() + try_module_get()
+        */
+       stm_put_device(stm);
         kfree(stmf);
  
         return 0;
@@ -380,8 +424,8 @@ static int stm_file_assign(struct stm_file *stmf, char *id, unsigned int width)
         return ret;
  }
  
-static void stm_write(struct stm_data *data, unsigned int master,
-                     unsigned int channel, const char *buf, size_t count)
+static ssize_t stm_write(struct stm_data *data, unsigned int master,
+                         unsigned int channel, const char *buf, size_t count)
  {
         unsigned int flags = STP_PACKET_TIMESTAMPED;
         const unsigned char *p = buf, nil = 0;
@@ -393,9 +437,14 @@ static void stm_write(struct stm_data *data, unsigned int master,
                 sz = data->packet(data, master, channel, STP_PACKET_DATA, flags,
                                   sz, p);
                 flags = 0;
+
+               if (sz < 0)
+                       break;
         }
  
         data->packet(data, master, channel, STP_PACKET_FLAG, 0, 0, &nil);
+
+       return pos;
  }
  
  static ssize_t stm_char_write(struct file *file, const char __user *buf,
@@ -406,6 +455,9 @@ static ssize_t stm_char_write(struct file *file, const char __user *buf,
         char *kbuf;
         int err;
  
+       if (count + 1 > PAGE_SIZE)
+               count = PAGE_SIZE - 1;
+
         /*
          * if no m/c have been assigned to this writer up to this
          * point, use "default" policy entry
@@ -430,8 +482,8 @@ static ssize_t stm_char_write(struct file *file, const char __user *buf,
                 return -EFAULT;
         }
  
-       stm_write(stm->data, stmf->output.master, stmf->output.channel, kbuf,
-                 count);
+       count = stm_write(stm->data, stmf->output.master, stmf->output.channel,
+                         kbuf, count);
  
         kfree(kbuf);
  
@@ -509,16 +561,12 @@ static int stm_char_policy_set_ioctl(struct stm_file *stmf, void __user *arg)
         if (ret)
                 goto err_free;
  
-       ret = 0;
-
         if (stm->data->link)
                 ret = stm->data->link(stm->data, stmf->output.master,
                                       stmf->output.channel);
  
-       if (ret) {
+       if (ret)
                 stm_output_free(stmf->stm, &stmf->output);
-               stm_put_device(stmf->stm);
-       }
  
  err_free:
         kfree(id);
@@ -633,17 +681,11 @@ int stm_register_device(struct device *parent, struct stm_data *stm_data,
         stm->dev.parent = parent;
         stm->dev.release = stm_device_release;
  
-       err = kobject_set_name(&stm->dev.kobj, "%s", stm_data->name);
-       if (err)
-               goto err_device;
-
-       err = device_add(&stm->dev);
-       if (err)
-               goto err_device;
-
+       mutex_init(&stm->link_mutex);
         spin_lock_init(&stm->link_lock);
         INIT_LIST_HEAD(&stm->link_list);
  
+       /* initialize the object before it is accessible via sysfs */
         spin_lock_init(&stm->mc_lock);
         mutex_init(&stm->policy_mutex);
         stm->sw_nmasters = nmasters;
@@ -651,9 +693,20 @@ int stm_register_device(struct device *parent, struct stm_data *stm_data,
         stm->data = stm_data;
         stm_data->stm = stm;
  
+       err = kobject_set_name(&stm->dev.kobj, "%s", stm_data->name);
+       if (err)
+               goto err_device;
+
+       err = device_add(&stm->dev);
+       if (err)
+               goto err_device;
+
         return 0;
  
  err_device:
+       unregister_chrdev(stm->major, stm_data->name);
+
+       /* matches device_initialize() above */
         put_device(&stm->dev);
  err_free:
         kfree(stm);
@@ -662,20 +715,28 @@ err_free:
  }
  EXPORT_SYMBOL_GPL(stm_register_device);
  
-static void __stm_source_link_drop(struct stm_source_device *src,
-                                  struct stm_device *stm);
+static int __stm_source_link_drop(struct stm_source_device *src,
+                                 struct stm_device *stm);
  
  void stm_unregister_device(struct stm_data *stm_data)
  {
         struct stm_device *stm = stm_data->stm;
         struct stm_source_device *src, *iter;
-       int i;
+       int i, ret;
  
-       spin_lock(&stm->link_lock);
+       mutex_lock(&stm->link_mutex);
         list_for_each_entry_safe(src, iter, &stm->link_list, link_entry) {
-               __stm_source_link_drop(src, stm);
+               ret = __stm_source_link_drop(src, stm);
+               /*
+                * src <-> stm link must not change under the same
+                * stm::link_mutex, so complain loudly if it has;
+                * also in this situation ret!=0 means this src is
+                * not connected to this stm and it should be otherwise
+                * safe to proceed with the tear-down of stm.
+                */
+               WARN_ON_ONCE(ret);
         }
-       spin_unlock(&stm->link_lock);
+       mutex_unlock(&stm->link_mutex);
  
         synchronize_srcu(&stm_source_srcu);
  
@@ -694,6 +755,17 @@ void stm_unregister_device(struct stm_data *stm_data)
  }
  EXPORT_SYMBOL_GPL(stm_unregister_device);
  
+/*
+ * stm::link_list access serialization uses a spinlock and a mutex; holding
+ * either of them guarantees that the list is stable; modification requires
+ * holding both of them.
+ *
+ * Lock ordering is as follows:
+ *   stm::link_mutex
+ *     stm::link_lock
+ *       src::link_lock
+ */
+
  /**
   * stm_source_link_add() - connect an stm_source device to an stm device
   * @src:       stm_source device
@@ -710,6 +782,7 @@ static int stm_source_link_add(struct stm_source_device *src,
         char *id;
         int err;
  
+       mutex_lock(&stm->link_mutex);
         spin_lock(&stm->link_lock);
         spin_lock(&src->link_lock);
  
@@ -719,6 +792,7 @@ static int stm_source_link_add(struct stm_source_device *src,
  
         spin_unlock(&src->link_lock);
         spin_unlock(&stm->link_lock);
+       mutex_unlock(&stm->link_mutex);
  
         id = kstrdup(src->data->name, GFP_KERNEL);
         if (id) {
@@ -753,9 +827,9 @@ static int stm_source_link_add(struct stm_source_device *src,
  
  fail_free_output:
         stm_output_free(stm, &src->output);
-       stm_put_device(stm);
  
  fail_detach:
+       mutex_lock(&stm->link_mutex);
         spin_lock(&stm->link_lock);
         spin_lock(&src->link_lock);
  
@@ -764,6 +838,7 @@ fail_detach:
  
         spin_unlock(&src->link_lock);
         spin_unlock(&stm->link_lock);
+       mutex_unlock(&stm->link_mutex);
  
         return err;
  }
@@ -776,28 +851,55 @@ fail_detach:
   * If @stm is @src::link, disconnect them from one another and put the
   * reference on the @stm device.
   *
- * Caller must hold stm::link_lock.
+ * Caller must hold stm::link_mutex.
   */
-static void __stm_source_link_drop(struct stm_source_device *src,
-                                  struct stm_device *stm)
+static int __stm_source_link_drop(struct stm_source_device *src,
+                                 struct stm_device *stm)
  {
         struct stm_device *link;
+       int ret = 0;
+
+       lockdep_assert_held(&stm->link_mutex);
  
+       /* for stm::link_list modification, we hold both mutex and spinlock */
+       spin_lock(&stm->link_lock);
         spin_lock(&src->link_lock);
         link = srcu_dereference_check(src->link, &stm_source_srcu, 1);
-       if (WARN_ON_ONCE(link != stm)) {
-               spin_unlock(&src->link_lock);
-               return;
+
+       /*
+        * The linked device may have changed since we last looked, because
+        * we weren't holding the src::link_lock back then; if this is the
+        * case, tell the caller to retry.
+        */
+       if (link != stm) {
+               ret = -EAGAIN;
+               goto unlock;
         }
  
         stm_output_free(link, &src->output);
-       /* caller must hold stm::link_lock */
         list_del_init(&src->link_entry);
         /* matches stm_find_device() from stm_source_link_store() */
         stm_put_device(link);
         rcu_assign_pointer(src->link, NULL);
  
+unlock:
         spin_unlock(&src->link_lock);
+       spin_unlock(&stm->link_lock);
+
+       /*
+        * Call the unlink callbacks for both source and stm, when we know
+        * that we have actually performed the unlinking.
+        */
+       if (!ret) {
+               if (src->data->unlink)
+                       src->data->unlink(src->data);
+
+               if (stm->data->unlink)
+                       stm->data->unlink(stm->data, src->output.master,
+                                         src->output.channel);
+       }
+
+       return ret;
  }
  
  /**
@@ -813,21 +915,29 @@ static void __stm_source_link_drop(struct stm_source_device *src,
  static void stm_source_link_drop(struct stm_source_device *src)
  {
         struct stm_device *stm;
-       int idx;
+       int idx, ret;
  
+retry:
         idx = srcu_read_lock(&stm_source_srcu);
+       /*
+        * The stm device will be valid for the duration of this
+        * read section, but the link may change before we grab
+        * the src::link_lock in __stm_source_link_drop().
+        */
         stm = srcu_dereference(src->link, &stm_source_srcu);
  
+       ret = 0;
         if (stm) {
-               if (src->data->unlink)
-                       src->data->unlink(src->data);
-
-               spin_lock(&stm->link_lock);
-               __stm_source_link_drop(src, stm);
-               spin_unlock(&stm->link_lock);
+               mutex_lock(&stm->link_mutex);
+               ret = __stm_source_link_drop(src, stm);
+               mutex_unlock(&stm->link_mutex);
         }
  
         srcu_read_unlock(&stm_source_srcu, idx);
+
+       /* if it did change, retry */
+       if (ret == -EAGAIN)
+               goto retry;
  }
  
  static ssize_t stm_source_link_show(struct device *dev,
@@ -862,8 +972,10 @@ static ssize_t stm_source_link_store(struct device *dev,
                 return -EINVAL;
  
         err = stm_source_link_add(src, link);
-       if (err)
+       if (err) {
+               /* matches the stm_find_device() above */
                 stm_put_device(link);
+       }
  
         return err ? : count;
  }
@@ -925,6 +1037,7 @@ int stm_source_register_device(struct device *parent,
         if (err)
                 goto err;
  
+       stm_output_init(&src->output);
         spin_lock_init(&src->link_lock);
         INIT_LIST_HEAD(&src->link_entry);
         src->data = data;
@@ -973,9 +1086,9 @@ int stm_source_write(struct stm_source_data *data, unsigned int chan,
  
         stm = srcu_dereference(src->link, &stm_source_srcu);
         if (stm)
-               stm_write(stm->data, src->output.master,
-                         src->output.channel + chan,
-                         buf, count);
+               count = stm_write(stm->data, src->output.master,
+                                 src->output.channel + chan,
+                                 buf, count);
         else
                 count = -ENODEV;
  
diff --git a/drivers/hwtracing/stm/dummy_stm.c b/drivers/hwtracing/stm/dummy_stm.c

index 3709bef0b21ff2d4ffafe29347e1f9872fbe17de..a86612d989f963ca4f31db8b39002351abab76ec 100644 (file)
--- a/drivers/hwtracing/stm/dummy_stm.c
+++ b/drivers/hwtracing/stm/dummy_stm.c
@@ -40,22 +40,71 @@ dummy_stm_packet(struct stm_data *stm_data, unsigned int master,
         return size;
  }
  
-static struct stm_data dummy_stm = {
-       .name           = "dummy_stm",
-       .sw_start       = 0x0000,
-       .sw_end         = 0xffff,
-       .sw_nchannels   = 0xffff,
-       .packet         = dummy_stm_packet,
-};
+#define DUMMY_STM_MAX 32
+
+static struct stm_data dummy_stm[DUMMY_STM_MAX];
+
+static int nr_dummies = 4;
+
+module_param(nr_dummies, int, 0400);
+
+static unsigned int fail_mode;
+
+module_param(fail_mode, int, 0600);
+
+static int dummy_stm_link(struct stm_data *data, unsigned int master,
+                         unsigned int channel)
+{
+       if (fail_mode && (channel & fail_mode))
+               return -EINVAL;
+
+       return 0;
+}
  
  static int dummy_stm_init(void)
  {
-       return stm_register_device(NULL, &dummy_stm, THIS_MODULE);
+       int i, ret = -ENOMEM;
+
+       if (nr_dummies < 0 || nr_dummies > DUMMY_STM_MAX)
+               return -EINVAL;
+
+       for (i = 0; i < nr_dummies; i++) {
+               dummy_stm[i].name = kasprintf(GFP_KERNEL, "dummy_stm.%d", i);
+               if (!dummy_stm[i].name)
+                       goto fail_unregister;
+
+               dummy_stm[i].sw_start           = 0x0000;
+               dummy_stm[i].sw_end             = 0xffff;
+               dummy_stm[i].sw_nchannels       = 0xffff;
+               dummy_stm[i].packet             = dummy_stm_packet;
+               dummy_stm[i].link               = dummy_stm_link;
+
+               ret = stm_register_device(NULL, &dummy_stm[i], THIS_MODULE);
+               if (ret)
+                       goto fail_free;
+       }
+
+       return 0;
+
+fail_unregister:
+       for (i--; i >= 0; i--) {
+               stm_unregister_device(&dummy_stm[i]);
+fail_free:
+               kfree(dummy_stm[i].name);
+       }
+
+       return ret;
+
  }
  
  static void dummy_stm_exit(void)
  {
-       stm_unregister_device(&dummy_stm);
+       int i;
+
+       for (i = 0; i < nr_dummies; i++) {
+               stm_unregister_device(&dummy_stm[i]);
+               kfree(dummy_stm[i].name);
+       }
  }
  
  module_init(dummy_stm_init);
diff --git a/drivers/hwtracing/stm/heartbeat.c b/drivers/hwtracing/stm/heartbeat.c

new file mode 100644 (file)

index 0000000..3da7b67
--- /dev/null
+++ b/drivers/hwtracing/stm/heartbeat.c
@@ -0,0 +1,126 @@
+/*
+ * Simple heartbeat STM source driver
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Heartbeat STM source will send repetitive messages over STM devices to a
+ * trace host.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hrtimer.h>
+#include <linux/slab.h>
+#include <linux/stm.h>
+
+#define STM_HEARTBEAT_MAX      32
+
+static int nr_devs = 4;
+static int interval_ms = 10;
+
+module_param(nr_devs, int, 0400);
+module_param(interval_ms, int, 0600);
+
+static struct stm_heartbeat {
+       struct stm_source_data  data;
+       struct hrtimer          hrtimer;
+       unsigned int            active;
+} stm_heartbeat[STM_HEARTBEAT_MAX];
+
+static const char str[] = "heartbeat stm source driver is here to serve you";
+
+static enum hrtimer_restart stm_heartbeat_hrtimer_handler(struct hrtimer *hr)
+{
+       struct stm_heartbeat *heartbeat = container_of(hr, struct stm_heartbeat,
+                                                      hrtimer);
+
+       stm_source_write(&heartbeat->data, 0, str, sizeof str);
+       if (heartbeat->active)
+               hrtimer_forward_now(hr, ms_to_ktime(interval_ms));
+
+       return heartbeat->active ? HRTIMER_RESTART : HRTIMER_NORESTART;
+}
+
+static int stm_heartbeat_link(struct stm_source_data *data)
+{
+       struct stm_heartbeat *heartbeat =
+               container_of(data, struct stm_heartbeat, data);
+
+       heartbeat->active = 1;
+       hrtimer_start(&heartbeat->hrtimer, ms_to_ktime(interval_ms),
+                     HRTIMER_MODE_ABS);
+
+       return 0;
+}
+
+static void stm_heartbeat_unlink(struct stm_source_data *data)
+{
+       struct stm_heartbeat *heartbeat =
+               container_of(data, struct stm_heartbeat, data);
+
+       heartbeat->active = 0;
+       hrtimer_cancel(&heartbeat->hrtimer);
+}
+
+static int stm_heartbeat_init(void)
+{
+       int i, ret = -ENOMEM;
+
+       if (nr_devs < 0 || nr_devs > STM_HEARTBEAT_MAX)
+               return -EINVAL;
+
+       for (i = 0; i < nr_devs; i++) {
+               stm_heartbeat[i].data.name =
+                       kasprintf(GFP_KERNEL, "heartbeat.%d", i);
+               if (!stm_heartbeat[i].data.name)
+                       goto fail_unregister;
+
+               stm_heartbeat[i].data.nr_chans  = 1;
+               stm_heartbeat[i].data.link              = stm_heartbeat_link;
+               stm_heartbeat[i].data.unlink    = stm_heartbeat_unlink;
+               hrtimer_init(&stm_heartbeat[i].hrtimer, CLOCK_MONOTONIC,
+                            HRTIMER_MODE_ABS);
+               stm_heartbeat[i].hrtimer.function =
+                       stm_heartbeat_hrtimer_handler;
+
+               ret = stm_source_register_device(NULL, &stm_heartbeat[i].data);
+               if (ret)
+                       goto fail_free;
+       }
+
+       return 0;
+
+fail_unregister:
+       for (i--; i >= 0; i--) {
+               stm_source_unregister_device(&stm_heartbeat[i].data);
+fail_free:
+               kfree(stm_heartbeat[i].data.name);
+       }
+
+       return ret;
+}
+
+static void stm_heartbeat_exit(void)
+{
+       int i;
+
+       for (i = 0; i < nr_devs; i++) {
+               stm_source_unregister_device(&stm_heartbeat[i].data);
+               kfree(stm_heartbeat[i].data.name);
+       }
+}
+
+module_init(stm_heartbeat_init);
+module_exit(stm_heartbeat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("stm_heartbeat driver");
+MODULE_AUTHOR("Alexander Shishkin <alexander.shishkin@linux.intel.com>");
diff --git a/drivers/hwtracing/stm/policy.c b/drivers/hwtracing/stm/policy.c

index 11ab6d01adf63d1490c8474801d358ada4f657a8..1c061cb9bff05ff7a0923dca13bbc1b5a2104c3b 100644 (file)
--- a/drivers/hwtracing/stm/policy.c
+++ b/drivers/hwtracing/stm/policy.c
@@ -272,13 +272,17 @@ void stp_policy_unbind(struct stp_policy *policy)
  {
         struct stm_device *stm = policy->stm;
  
+       /*
+        * stp_policy_release() will not call here if the policy is already
+        * unbound; other users should not either, as no link exists between
+        * this policy and anything else in that case
+        */
         if (WARN_ON_ONCE(!policy->stm))
                 return;
  
-       mutex_lock(&stm->policy_mutex);
-       stm->policy = NULL;
-       mutex_unlock(&stm->policy_mutex);
+       lockdep_assert_held(&stm->policy_mutex);
  
+       stm->policy = NULL;
         policy->stm = NULL;
  
         stm_put_device(stm);
@@ -287,8 +291,16 @@ void stp_policy_unbind(struct stp_policy *policy)
  static void stp_policy_release(struct config_item *item)
  {
         struct stp_policy *policy = to_stp_policy(item);
+       struct stm_device *stm = policy->stm;
  
+       /* a policy *can* be unbound and still exist in configfs tree */
+       if (!stm)
+               return;
+
+       mutex_lock(&stm->policy_mutex);
         stp_policy_unbind(policy);
+       mutex_unlock(&stm->policy_mutex);
+
         kfree(policy);
  }
  
@@ -320,16 +332,17 @@ stp_policies_make(struct config_group *group, const char *name)
  
         /*
          * node must look like <device_name>.<policy_name>, where
-        * <device_name> is the name of an existing stm device and
-        * <policy_name> is an arbitrary string
+        * <device_name> is the name of an existing stm device; may
+        *               contain dots;
+        * <policy_name> is an arbitrary string; may not contain dots
          */
-       p = strchr(devname, '.');
+       p = strrchr(devname, '.');
         if (!p) {
                 kfree(devname);
                 return ERR_PTR(-EINVAL);
         }
  
-       *p++ = '\0';
+       *p = '\0';
  
         stm = stm_find_device(devname);
         kfree(devname);
diff --git a/drivers/hwtracing/stm/stm.h b/drivers/hwtracing/stm/stm.h

index 95ece0292c991c8ad60f81c4a15fe4f3687d4504..4e8c6926260f3e8eec0ec8da70de8f13f86f0cc6 100644 (file)
--- a/drivers/hwtracing/stm/stm.h
+++ b/drivers/hwtracing/stm/stm.h
@@ -45,6 +45,7 @@ struct stm_device {
         int                     major;
         unsigned int            sw_nmasters;
         struct stm_data         *data;
+       struct mutex            link_mutex;
         spinlock_t              link_lock;
         struct list_head        link_list;
         /* master allocation */
@@ -56,6 +57,7 @@ struct stm_device {
         container_of((_d), struct stm_device, dev)
  
  struct stm_output {
+       spinlock_t              lock;
         unsigned int            master;
         unsigned int            channel;
         unsigned int            nr_chans;
diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c

index 11fdadc68e53e57722b4d58892fdf2c644beba34..2a6eaf1122b4e9b742eb3777fb4b6b317c07b201 100644 (file)
--- a/drivers/misc/lkdtm.c
+++ b/drivers/misc/lkdtm.c
@@ -103,6 +103,7 @@ enum ctype {
         CT_EXEC_USERSPACE,
         CT_ACCESS_USERSPACE,
         CT_WRITE_RO,
+       CT_WRITE_RO_AFTER_INIT,
         CT_WRITE_KERN,
  };
  
@@ -140,6 +141,7 @@ static char* cp_type[] = {
         "EXEC_USERSPACE",
         "ACCESS_USERSPACE",
         "WRITE_RO",
+       "WRITE_RO_AFTER_INIT",
         "WRITE_KERN",
  };
  
@@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up);
  static u8 data_area[EXEC_SIZE];
  
  static const unsigned long rodata = 0xAA55AA55;
+static unsigned long ro_after_init __ro_after_init = 0x55AA5500;
  
  module_param(recur_count, int, 0644);
  MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test");
@@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which)
                 break;
         }
         case CT_WRITE_RO: {
-               unsigned long *ptr;
+               /* Explicitly cast away "const" for the test. */
+               unsigned long *ptr = (unsigned long *)&rodata;
  
-               ptr = (unsigned long *)&rodata;
+               pr_info("attempting bad rodata write at %p\n", ptr);
+               *ptr ^= 0xabcd1234;
  
-               pr_info("attempting bad write at %p\n", ptr);
+               break;
+       }
+       case CT_WRITE_RO_AFTER_INIT: {
+               unsigned long *ptr = &ro_after_init;
+
+               /*
+                * Verify we were written to during init. Since an Oops
+                * is considered a "success", a failure is to just skip the
+                * real test.
+                */
+               if ((*ptr & 0xAA) != 0xAA) {
+                       pr_info("%p was NOT written during init!?\n", ptr);
+                       break;
+               }
+
+               pr_info("attempting bad ro_after_init write at %p\n", ptr);
                 *ptr ^= 0xabcd1234;
  
                 break;
@@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void)
         int n_debugfs_entries = 1; /* Assume only the direct entry */
         int i;
  
+       /* Make sure we can write to __ro_after_init values during __init */
+       ro_after_init |= 0xAA;
+
         /* Register debugfs interface */
         lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL);
         if (!lkdtm_debugfs_root) {
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c

index 655f79db7899ffd0628714d51203847630a8075c..3e90bce70545a759b415081050d0762f4ae3c640 100644 (file)
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -760,6 +760,16 @@ const void * __init of_flat_dt_match_machine(const void *default_match,
  }
  
  #ifdef CONFIG_BLK_DEV_INITRD
+#ifndef __early_init_dt_declare_initrd /* default; skipped if already #defined */
+static void __early_init_dt_declare_initrd(unsigned long start,
+                                          unsigned long end)
+{
+       initrd_start = (unsigned long)__va(start); /* store __va() of start/end */
+       initrd_end = (unsigned long)__va(end);
+       initrd_below_start_ok = 1;
+}
+#endif /* __early_init_dt_declare_initrd */
+
  /**
   * early_init_dt_check_for_initrd - Decode initrd location from flat tree
   * @node: reference to node containing initrd location ('chosen')
@@ -782,9 +792,7 @@ static void __init early_init_dt_check_for_initrd(unsigned long node)
                 return;
         end = of_read_number(prop, len/4);
  
-       initrd_start = (unsigned long)__va(start);
-       initrd_end = (unsigned long)__va(end);
-       initrd_below_start_ok = 1;
+       __early_init_dt_declare_initrd(start, end);
  
         pr_debug("initrd_start=0x%llx  initrd_end=0x%llx\n",
                  (unsigned long long)start, (unsigned long long)end);
@@ -976,13 +984,16 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
  }
  
  #ifdef CONFIG_HAVE_MEMBLOCK
+#ifndef MIN_MEMBLOCK_ADDR
+#define MIN_MEMBLOCK_ADDR      __pa(PAGE_OFFSET)
+#endif
  #ifndef MAX_MEMBLOCK_ADDR
  #define MAX_MEMBLOCK_ADDR      ((phys_addr_t)~0)
  #endif
  
  void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
  {
-       const u64 phys_offset = __pa(PAGE_OFFSET);
+       const u64 phys_offset = MIN_MEMBLOCK_ADDR;
  
         if (!PAGE_ALIGNED(base)) {
                 if (size < PAGE_SIZE - (base & ~PAGE_MASK)) {
diff --git a/drivers/soc/qcom/spm.c b/drivers/soc/qcom/spm.c

index 65bce1eecaf868ddd60cca8324920f5d5fa57802..5548a31e1a39a100142b45841cbe38e1aa007e38 100644 (file)
--- a/drivers/soc/qcom/spm.c
+++ b/drivers/soc/qcom/spm.c
@@ -116,7 +116,7 @@ static const struct spm_reg_data spm_reg_8064_cpu = {
  
  static DEFINE_PER_CPU(struct spm_driver_data *, cpu_spm_drv);
  
-typedef int (*idle_fn)(int);
+typedef int (*idle_fn)(void);
  static DEFINE_PER_CPU(idle_fn*, qcom_idle_ops);
  
  static inline void spm_register_write(struct spm_driver_data *drv,
@@ -179,10 +179,10 @@ static int qcom_pm_collapse(unsigned long int unused)
         return -1;
  }
  
-static int qcom_cpu_spc(int cpu)
+static int qcom_cpu_spc(void)
  {
         int ret;
-       struct spm_driver_data *drv = per_cpu(cpu_spm_drv, cpu);
+       struct spm_driver_data *drv = __this_cpu_read(cpu_spm_drv);
  
         spm_set_low_power_mode(drv, PM_SLEEP_MODE_SPC);
         ret = cpu_suspend(0, qcom_pm_collapse);
@@ -197,9 +197,9 @@ static int qcom_cpu_spc(int cpu)
         return ret;
  }
  
-static int qcom_idle_enter(int cpu, unsigned long index)
+static int qcom_idle_enter(unsigned long index)
  {
-       return per_cpu(qcom_idle_ops, cpu)[index](cpu);
+       return __this_cpu_read(qcom_idle_ops)[index]();
  }
  
  static const struct of_device_id qcom_idle_state_match[] __initconst = {
diff --git a/fs/inode.c b/fs/inode.c

index b0edef500590c3e49cd5f632640add63426a39d5..2c16b758831dd08715c3c6e270c97f00b6bed10c 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
         inode->i_rdev = 0;
         inode->dirtied_when = 0;
  
+#ifdef CONFIG_CGROUP_WRITEBACK
+       inode->i_wb_frn_winner = 0;
+       inode->i_wb_frn_avg_time = 0;
+       inode->i_wb_frn_history = 0;
+#endif
+
         if (security_inode_alloc(inode))
                 goto out;
         spin_lock_init(&inode->i_lock);
diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h

index 1cbb8338edf391bd83c4d1b0bc0dff2cbbe56e75..827e4d3bbc7a46ef59222651a8020234addc82cb 100644 (file)
--- a/include/asm-generic/fixmap.h
+++ b/include/asm-generic/fixmap.h
@@ -70,12 +70,12 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
  #endif
  
  /* Return a pointer with offset calculated */
-#define __set_fixmap_offset(idx, phys, flags)                \
-({                                                           \
-       unsigned long addr;                                   \
-       __set_fixmap(idx, phys, flags);                       \
-       addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1)); \
-       addr;                                                 \
+#define __set_fixmap_offset(idx, phys, flags)                          \
+({                                                                     \
+       unsigned long ________addr;                                     \
+       __set_fixmap(idx, phys, flags);                                 \
+       ________addr = fix_to_virt(idx) + ((phys) & (PAGE_SIZE - 1));   \
+       ________addr;                                                   \
  })
  
  #define set_fixmap_offset(idx, phys) \
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index ef2e8c97e18326091fece74091f7602fe6f55bcf..71581125e60402b26f217456f9638466181e19ca 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -256,6 +256,7 @@
         .rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {           \
                 VMLINUX_SYMBOL(__start_rodata) = .;                     \
                 *(.rodata) *(.rodata.*)                                 \
+               *(.data..ro_after_init) /* Read only after init */      \
                 *(__vermagic)           /* Kernel version magic */      \
                 . = ALIGN(8);                                           \
                 VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .;         \
diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h

index 9916d0e4eff505f18cf5e30ad52dbe2e37b3fa3b..25d0914481a26d04a001a91216df6fd68c52e87c 100644 (file)
--- a/include/clocksource/arm_arch_timer.h
+++ b/include/clocksource/arm_arch_timer.h
@@ -23,6 +23,12 @@
  #define ARCH_TIMER_CTRL_IT_MASK                (1 << 1)
  #define ARCH_TIMER_CTRL_IT_STAT                (1 << 2)
  
+#define CNTHCTL_EL1PCTEN               (1 << 0)
+#define CNTHCTL_EL1PCEN                        (1 << 1)
+#define CNTHCTL_EVNTEN                 (1 << 2)
+#define CNTHCTL_EVNTDIR                        (1 << 3)
+#define CNTHCTL_EVNTI                  (0xF << 4)
+
  enum arch_timer_reg {
         ARCH_TIMER_REG_CTRL,
         ARCH_TIMER_REG_TVAL,
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h

index d2f41477f8ae77600a8683890b3615766b9a3701..13a3d537811b9f7d12b3fa892b8312330f89d70f 100644 (file)
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -279,6 +279,12 @@ struct vgic_v2_cpu_if {
         u32             vgic_lr[VGIC_V2_MAX_LRS];
  };
  
+/*
+ * LRs are stored in reverse order in memory. Make sure we index them
+ * correctly; @lr is parenthesized so expression arguments expand safely.
+ */
+#define VGIC_V3_LR_INDEX(lr)           (VGIC_V3_MAX_LRS - 1 - (lr))
+
  struct vgic_v3_cpu_if {
  #ifdef CONFIG_KVM_ARM_VGIC_V3
         u32             vgic_hcr;
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h

index 9006c4e75cf737a90335eadcd73e14d59b0f753e..3d8dcdd1aeae902ad2f587e2f2f875a77ef985f7 100644 (file)
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -163,4 +163,13 @@ struct amba_device name##_device = {                               \
  #define module_amba_driver(__amba_drv) \
         module_driver(__amba_drv, amba_driver_register, amba_driver_unregister)
  
+/*
+ * builtin_amba_driver() - Helper macro for drivers that don't do anything
+ * special in driver initcall.  This eliminates a lot of boilerplate.  Each
+ * driver may only use this macro once, and calling it replaces
+ * device_initcall().
+ */
+#define builtin_amba_driver(__amba_drv) \
+       builtin_driver(__amba_drv, amba_driver_register)
+
  #endif
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h

new file mode 100644 (file)

index 0000000..b5abfda
--- /dev/null
+++ b/include/linux/arm-smccc.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef __LINUX_ARM_SMCCC_H
+#define __LINUX_ARM_SMCCC_H
+
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+/*
+ * Common defines for the ARM SMC Calling Convention, as specified in
+ * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html
+ * The call type lives in bit 31, so its constants are unsigned.
+ */
+
+#define ARM_SMCCC_STD_CALL             0U
+#define ARM_SMCCC_FAST_CALL            1U
+#define ARM_SMCCC_TYPE_SHIFT           31
+
+#define ARM_SMCCC_SMC_32               0
+#define ARM_SMCCC_SMC_64               1
+#define ARM_SMCCC_CALL_CONV_SHIFT      30
+
+#define ARM_SMCCC_OWNER_MASK           0x3F
+#define ARM_SMCCC_OWNER_SHIFT          24
+
+#define ARM_SMCCC_FUNC_MASK            0xFFFF
+
+#define ARM_SMCCC_IS_FAST_CALL(smc_val)        \
+       ((smc_val) & (ARM_SMCCC_FAST_CALL << ARM_SMCCC_TYPE_SHIFT))
+#define ARM_SMCCC_IS_64(smc_val) \
+       ((smc_val) & (ARM_SMCCC_SMC_64 << ARM_SMCCC_CALL_CONV_SHIFT))
+#define ARM_SMCCC_FUNC_NUM(smc_val)    ((smc_val) & ARM_SMCCC_FUNC_MASK)
+#define ARM_SMCCC_OWNER_NUM(smc_val) \
+       (((smc_val) >> ARM_SMCCC_OWNER_SHIFT) & ARM_SMCCC_OWNER_MASK)
+
+#define ARM_SMCCC_CALL_VAL(type, calling_convention, owner, func_num) \
+       (((type) << ARM_SMCCC_TYPE_SHIFT) | \
+       ((calling_convention) << ARM_SMCCC_CALL_CONV_SHIFT) | \
+       (((owner) & ARM_SMCCC_OWNER_MASK) << ARM_SMCCC_OWNER_SHIFT) | \
+       ((func_num) & ARM_SMCCC_FUNC_MASK))
+
+#define ARM_SMCCC_OWNER_ARCH           0
+#define ARM_SMCCC_OWNER_CPU            1
+#define ARM_SMCCC_OWNER_SIP            2
+#define ARM_SMCCC_OWNER_OEM            3
+#define ARM_SMCCC_OWNER_STANDARD       4
+#define ARM_SMCCC_OWNER_TRUSTED_APP    48
+#define ARM_SMCCC_OWNER_TRUSTED_APP_END        49
+#define ARM_SMCCC_OWNER_TRUSTED_OS     50
+#define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
+
+/**
+ * struct arm_smccc_res - Result from SMC/HVC call
+ * @a0-a3: result values from registers 0 to 3
+ */
+struct arm_smccc_res {
+       unsigned long a0;
+       unsigned long a1;
+       unsigned long a2;
+       unsigned long a3;
+};
+
+/**
+ * arm_smccc_smc() - make SMC calls
+ * @a0-a7: arguments passed in registers 0 to 7
+ * @res: result values from registers 0 to 3
+ *
+ * This function is used to make SMC calls following SMC Calling Convention.
+ * The contents of the supplied parameters are copied to registers 0 to 7
+ * prior to the SMC instruction. @res is updated with the contents of
+ * registers 0 to 3 on return from the SMC instruction.
+ */
+asmlinkage void arm_smccc_smc(unsigned long a0, unsigned long a1,
+                       unsigned long a2, unsigned long a3, unsigned long a4,
+                       unsigned long a5, unsigned long a6, unsigned long a7,
+                       struct arm_smccc_res *res);
+
+/**
+ * arm_smccc_hvc() - make HVC calls
+ * @a0-a7: arguments passed in registers 0 to 7
+ * @res: result values from registers 0 to 3
+ *
+ * This function is used to make HVC calls following SMC Calling
+ * Convention.  The contents of the supplied parameters are copied to
+ * registers 0 to 7 prior to the HVC instruction. @res is updated with the
+ * contents of registers 0 to 3 on return from the HVC instruction.
+ */
+asmlinkage void arm_smccc_hvc(unsigned long a0, unsigned long a1,
+                       unsigned long a2, unsigned long a3, unsigned long a4,
+                       unsigned long a5, unsigned long a6, unsigned long a7,
+                       struct arm_smccc_res *res);
+
+#endif /*__LINUX_ARM_SMCCC_H*/
diff --git a/include/linux/cache.h b/include/linux/cache.h

index 17e7e82d2aa758f9888419a9c03aa4059e16b247..1be04f8c563a0c60bdfca72a36c120ec96ef327c 100644 (file)
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -12,10 +12,24 @@
  #define SMP_CACHE_BYTES L1_CACHE_BYTES
  #endif
  
+/*
+ * __read_mostly is used to keep rarely changing variables out of frequently
+ * updated cachelines. If an architecture doesn't support it, ignore the
+ * hint.
+ */
  #ifndef __read_mostly
  #define __read_mostly
  #endif
  
+/*
+ * __ro_after_init is used to mark things that are read-only after init (i.e.
+ * after mark_rodata_ro() has been called). These are effectively read-only,
+ * but may get written to during init, so can't live in .rodata (via "const").
+ */
+#ifndef __ro_after_init
+#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))
+#endif
+
  #ifndef ____cacheline_aligned
  #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
  #endif
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 8da26329975429c8895afcceab874d11e3326471..4cd5c95d1ca0ccbf59312d4ef3d92dae752b1c38 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -66,7 +66,6 @@ enum {
  
  /* cgroup_root->flags */
  enum {
-       CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
         CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
         CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
  };
diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h

new file mode 100644 (file)

index 0000000..7d41026
--- /dev/null
+++ b/include/linux/coresight-pmu.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _LINUX_CORESIGHT_PMU_H
+#define _LINUX_CORESIGHT_PMU_H
+
+#define CORESIGHT_ETM_PMU_NAME "cs_etm"
+#define CORESIGHT_ETM_PMU_SEED  0x10
+
+/* ETMv3.5/PTM's ETMCR config bit */
+#define ETM_OPT_CYCACC  12
+#define ETM_OPT_TS      28
+
+static inline int coresight_get_trace_id(int cpu)
+{
+       /*
+        * Trace ID 0 is invalid, so numbering starts at a non-zero seed
+        * that fits in 7 bits.  Data trace IDs conventionally use I(N) + 1,
+        * so instruction trace IDs are derived from the CPU number in
+        * steps of two.
+        */
+       return CORESIGHT_ETM_PMU_SEED + 2 * cpu;
+}
+
+#endif
diff --git a/include/linux/coresight-stm.h b/include/linux/coresight-stm.h

new file mode 100644 (file)

index 0000000..a978bb8
--- /dev/null
+++ b/include/linux/coresight-stm.h
@@ -0,0 +1,6 @@
+#ifndef __LINUX_CORESIGHT_STM_H_
+#define __LINUX_CORESIGHT_STM_H_
+
+#include <uapi/linux/coresight-stm.h>
+
+#endif
diff --git a/include/linux/coresight.h b/include/linux/coresight.h

index a7cabfa23b55823773cb91e9bfd7996ae34f3f97..385d62e64abb00218d5f52f32bd1875dde5b7b43 100644 (file)
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -14,6 +14,7 @@
  #define _LINUX_CORESIGHT_H
  
  #include <linux/device.h>
+#include <linux/perf_event.h>
  #include <linux/sched.h>
  
  /* Peripheral id registers (0xFD0-0xFEC) */
@@ -152,7 +153,6 @@ struct coresight_connection {
                 by @coresight_ops.
   * @dev:       The device entity associated to this component.
   * @refcnt:    keep track of what is in use.
- * @path_link: link of current component into the path being enabled.
   * @orphan:    true if the component has connections that haven't been linked.
   * @enable:    'true' if component is currently part of an active path.
   * @activated: 'true' only if a _sink_ has been activated.  A sink can be
@@ -168,7 +168,6 @@ struct coresight_device {
         const struct coresight_ops *ops;
         struct device dev;
         atomic_t *refcnt;
-       struct list_head path_link;
         bool orphan;
         bool enable;    /* true only if configured as part of a path */
         bool activated; /* true only if a sink is part of a path */
@@ -183,12 +182,29 @@ struct coresight_device {
  /**
   * struct coresight_ops_sink - basic operations for a sink
   * Operations available for sinks
- * @enable:    enables the sink.
- * @disable:   disables the sink.
+ * @enable:            enables the sink.
+ * @disable:           disables the sink.
+ * @alloc_buffer:      initialises perf's ring buffer for trace collection.
+ * @free_buffer:       release memory allocated in @alloc_buffer.
+ * @set_buffer:                initialises buffer mechanic before a trace session.
+ * @reset_buffer:      finalises buffer mechanic after a trace session.
+ * @update_buffer:     update buffer pointers after a trace session.
   */
  struct coresight_ops_sink {
-       int (*enable)(struct coresight_device *csdev);
+       int (*enable)(struct coresight_device *csdev, u32 mode);
         void (*disable)(struct coresight_device *csdev);
+       void *(*alloc_buffer)(struct coresight_device *csdev, int cpu,
+                             void **pages, int nr_pages, bool overwrite);
+       void (*free_buffer)(void *config);
+       int (*set_buffer)(struct coresight_device *csdev,
+                         struct perf_output_handle *handle,
+                         void *sink_config);
+       unsigned long (*reset_buffer)(struct coresight_device *csdev,
+                                     struct perf_output_handle *handle,
+                                     void *sink_config, bool *lost);
+       void (*update_buffer)(struct coresight_device *csdev,
+                             struct perf_output_handle *handle,
+                             void *sink_config);
  };
  
  /**
@@ -205,14 +221,18 @@ struct coresight_ops_link {
  /**
   * struct coresight_ops_source - basic operations for a source
   * Operations available for sources.
+ * @cpu_id:    returns the value of the CPU number this component
+ *             is associated to.
   * @trace_id:  returns the value of the component's trace ID as known
-               to the HW.
+ *             to the HW.
   * @enable:    enables tracing for a source.
   * @disable:   disables tracing for a source.
   */
  struct coresight_ops_source {
+       int (*cpu_id)(struct coresight_device *csdev);
         int (*trace_id)(struct coresight_device *csdev);
-       int (*enable)(struct coresight_device *csdev);
+       int (*enable)(struct coresight_device *csdev,
+                     struct perf_event_attr *attr,  u32 mode);
         void (*disable)(struct coresight_device *csdev);
  };
  
diff --git a/include/linux/efi.h b/include/linux/efi.h

index 47be3ad7d3e5bad63b48a8fa344dbea65c0a97dd..333d0ca6940f0214f37b69377716d58852240a3c 100644 (file)
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -299,7 +299,7 @@ typedef struct {
         void *open_protocol_information;
         void *protocols_per_handle;
         void *locate_handle_buffer;
-       void *locate_protocol;
+       efi_status_t (*locate_protocol)(efi_guid_t *, void *, void **);
         void *install_multiple_protocol_interfaces;
         void *uninstall_multiple_protocol_interfaces;
         void *calculate_crc32;
@@ -599,6 +599,10 @@ void efi_native_runtime_setup(void);
  #define EFI_PROPERTIES_TABLE_GUID \
      EFI_GUID(  0x880aaca3, 0x4adc, 0x4a04, 0x90, 0x79, 0xb7, 0x47, 0x34, 0x08, 0x25, 0xe5 )
  
+#define EFI_RNG_PROTOCOL_GUID \
+       EFI_GUID(0x3152bca5, 0xeade, 0x433d, \
+                0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44)
+
  typedef struct {
         efi_guid_t guid;
         u64 table;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 685c262e0be848ca049ee041d00d389d3cc327fe..b0eb06423d5eccba6cb850078af6ffc60d97c382 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                                 struct address_space *mapping,
                                 pgoff_t idx, unsigned long address);
  
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
  pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
-#endif
  
  extern int hugepages_treat_as_movable;
  extern int sysctl_hugetlb_shm_group;
diff --git a/include/linux/init.h b/include/linux/init.h

index b449f378f995ae647077f521d9f7af3af9480a70..aedb254abc37204a091b790eedfde857dc22155f 100644 (file)
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -142,6 +142,10 @@ void prepare_namespace(void);
  void __init load_default_modules(void);
  int __init init_rootfs(void);
  
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
  extern void (*late_time_init)(void);
  
  extern bool initcall_debug;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index e23a9e704536278dad66bc5e5d1f9f798036b8be..71b61b0b9b459891091d0096e7d2a4b9465eb7fa 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -65,8 +65,10 @@ enum {
  
  #ifdef CONFIG_CMA
  #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
+#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
  #else
  #  define is_migrate_cma(migratetype) false
+#  define is_migrate_cma_page(_page) false
  #endif
  
  #define for_each_migratetype_order(order, type) \
@@ -361,10 +363,10 @@ struct zone {
         struct per_cpu_pageset __percpu *pageset;
  
         /*
-        * This is a per-zone reserve of pages that should not be
-        * considered dirtyable memory.
+        * This is a per-zone reserve of pages that are not available
+        * to userspace allocations.
          */
-       unsigned long           dirty_balance_reserve;
+       unsigned long           totalreserve_pages;
  
  #ifndef CONFIG_SPARSEMEM
         /*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 6cdd50f7f52d7dc1009bef3b39c30a4b86135308..a288010667dcf561f1019a6d28ade8ead8673787 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -379,7 +379,7 @@ struct pmu {
         /*
          * Set up pmu-private data structures for an AUX area
          */
-       void *(*setup_aux)              (int cpu, void **pages,
+       void *(*setup_aux)              (struct perf_event *event, void **pages,
                                          int nr_pages, bool overwrite);
                                         /* optional */
  
@@ -392,6 +392,14 @@ struct pmu {
          * Filter events for PMU-specific reasons.
          */
         int (*filter_match)             (struct perf_event *event); /* optional */
+
+       /*
+        * Initial, PMU driver specific configuration.
+        */
+       int (*get_drv_configs)          (struct perf_event *event,
+                                        void __user *arg); /* optional */
+       void (*free_drv_configs)        (struct perf_event *event);
+                                       /* optional */
  };
  
  /**
@@ -559,6 +567,7 @@ struct perf_event {
         struct irq_work                 pending;
  
         atomic_t                        event_limit;
+       struct list_head                drv_configs;
  
         void (*destroy)(struct perf_event *);
         struct rcu_head                 rcu_head;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h

index 9a2e50337af9fd233656b8fb6f06a91c10b2e7bf..cccaf4a29e9f02c9a60b65f73a523a69efa5af3a 100644 (file)
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -34,6 +34,8 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
  
  int dev_pm_opp_get_opp_count(struct device *dev);
  unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev);
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev);
  struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
  
  struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
@@ -55,6 +57,14 @@ int dev_pm_opp_enable(struct device *dev, unsigned long freq);
  int dev_pm_opp_disable(struct device *dev, unsigned long freq);
  
  struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev);
+int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
+                               unsigned int count);
+void dev_pm_opp_put_supported_hw(struct device *dev);
+int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
+void dev_pm_opp_put_prop_name(struct device *dev);
+int dev_pm_opp_set_regulator(struct device *dev, const char *name);
+void dev_pm_opp_put_regulator(struct device *dev);
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
  #else
  static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
  {
@@ -81,6 +91,16 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
         return 0;
  }
  
+static inline unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+       return 0;
+}
+
+static inline unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+       return 0;
+}
+
  static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
  {
         return NULL;
@@ -129,6 +149,35 @@ static inline struct srcu_notifier_head *dev_pm_opp_get_notifier(
  {
         return ERR_PTR(-EINVAL);
  }
+
+static inline int dev_pm_opp_set_supported_hw(struct device *dev,
+                                             const u32 *versions,
+                                             unsigned int count)
+{
+       return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
+
+static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+       return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
+
+static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+{
+       return -EINVAL;
+}
+
+static inline void dev_pm_opp_put_regulator(struct device *dev) {}
+
+static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+       return -EINVAL;
+}
+
  #endif         /* CONFIG_PM_OPP */
  
  #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF)
diff --git a/include/linux/psci.h b/include/linux/psci.h

index 12c4865457adc3d0412c573b710feec7d3d6fd81..393efe2edf9afb9d8a38d1a16201e78c64881026 100644 (file)
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -24,6 +24,9 @@ bool psci_tos_resident_on(int cpu);
  bool psci_power_state_loses_context(u32 state);
  bool psci_power_state_is_valid(u32 state);
  
+int psci_cpu_init_idle(unsigned int cpu);
+int psci_cpu_suspend_enter(unsigned long index);
+
  struct psci_operations {
         int (*cpu_suspend)(u32 state, unsigned long entry_point);
         int (*cpu_off)(u32 state);
diff --git a/include/linux/slab.h b/include/linux/slab.h

index 2037a861e3679910152a98ba98a667b395c6773c..4ef384b172e0e5f307edfd69abd5d100d339c913 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -144,6 +144,18 @@ void kfree(const void *);
  void kzfree(const void *);
  size_t ksize(const void *);
  
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+const char *__check_heap_object(const void *ptr, unsigned long n,
+                               struct page *page);
+#else
+static inline const char *__check_heap_object(const void *ptr,
+                                             unsigned long n,
+                                             struct page *page)
+{
+       return NULL;
+}
+#endif
+
  /*
   * Some archs want to perform DMA into kmalloc caches and need a guaranteed
   * alignment larger than the alignment of a 64-bit integer.
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h

index 33885118523c7ce695e2c901e42d76250a84a8da..f4e857e920cd5b9b1e8edc3d94d08704c6344ea4 100644 (file)
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -81,6 +81,7 @@ struct kmem_cache {
         int reserved;           /* Reserved bytes at the end of slabs */
         const char *name;       /* Name (only for display!) */
         struct list_head list;  /* List of slab caches */
+       int red_left_pad;       /* Left redzone padding size */
  #ifdef CONFIG_SYSFS
         struct kobject kobj;    /* For sysfs */
  #endif
diff --git a/include/linux/stm.h b/include/linux/stm.h

index 9d0083d364e642b6ac68ae56cdae3bc197ffebde..8369d8a8cabd7f1e43505ac404253974ce101db0 100644 (file)
--- a/include/linux/stm.h
+++ b/include/linux/stm.h
@@ -50,6 +50,8 @@ struct stm_device;
   * @sw_end:            last STP master available to software
   * @sw_nchannels:      number of STP channels per master
   * @sw_mmiosz:         size of one channel's IO space, for mmap, optional
+ * @hw_override:       masters in the STP stream will not match the ones
+ *                     assigned by software, but are up to the STM hardware
   * @packet:            callback that sends an STP packet
   * @mmio_addr:         mmap callback, optional
   * @link:              called when a new stm_source gets linked to us, optional
@@ -67,6 +69,16 @@ struct stm_device;
   * description. That is, the lowest master that can be allocated to software
   * writers is @sw_start and data from this writer will appear is @sw_start
   * master in the STP stream.
+ *
+ * The @packet callback should adhere to the following rules:
+ *   1) it must return the number of bytes it consumed from the payload;
+ *   2) therefore, if it sent a packet that does not have payload (like FLAG),
+ *      it must return zero;
+ *   3) if it does not support the requested packet type/flag combination,
+ *      it must return -ENOTSUPP.
+ *
+ * The @unlink callback is called when there are no more active writers so
+ * that the master/channel can be quiesced.
   */
  struct stm_data {
         const char              *name;
@@ -75,6 +87,7 @@ struct stm_data {
         unsigned int            sw_end;
         unsigned int            sw_nchannels;
         unsigned int            sw_mmiosz;
+       unsigned int            hw_override;
         ssize_t                 (*packet)(struct stm_data *, unsigned int,
                                           unsigned int, unsigned int,
                                           unsigned int, unsigned int,
diff --git a/include/linux/swap.h b/include/linux/swap.h

index d8ca2eaa3a8bff3b548ecf14560643761b55e9ca..f1a52c11de0edabf4e16e44e93a32e3103169ea9 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -289,7 +289,6 @@ static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
  /* linux/mm/page_alloc.c */
  extern unsigned long totalram_pages;
  extern unsigned long totalreserve_pages;
-extern unsigned long dirty_balance_reserve;
  extern unsigned long nr_free_buffer_pages(void);
  extern unsigned long nr_free_pagecache_pages(void);
  
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h

index ff307b548ed3c91a0f1cd05e486789305cefb43f..eded095fe81e5c3ddc04e2af91f5bdaf9e4d28a5 100644 (file)
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -145,6 +145,31 @@ static inline bool test_and_clear_restore_sigmask(void)
  #error "no set_restore_sigmask() provided and default one won't work"
  #endif
  
+#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
+static inline int arch_within_stack_frames(const void * const stack,
+                                          const void * const stackend,
+                                          const void *obj, unsigned long len)
+{
+       return 0;
+}
+#endif
+
+#ifdef CONFIG_HARDENED_USERCOPY
+extern void __check_object_size(const void *ptr, unsigned long n,
+                                       bool to_user);
+
+static inline void check_object_size(const void *ptr, unsigned long n,
+                                    bool to_user)
+{
+       if (!__builtin_constant_p(n))
+               __check_object_size(ptr, n, to_user);
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+                                    bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
  #endif /* __KERNEL__ */
  
  #endif /* _LINUX_THREAD_INFO_H */
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h

index 558129af828a7eb97ad64b1531ac2a4e3f71174d..f30c187ed785366231e318a7beab151e0bba64b6 100644 (file)
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -111,4 +111,11 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
  #define probe_kernel_address(addr, retval)             \
         probe_kernel_read(&retval, addr, sizeof(retval))
  
+#ifndef user_access_begin
+#define user_access_begin() do { } while (0)
+#define user_access_end() do { } while (0)
+#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
+#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
+#endif
+
  #endif         /* __LINUX_UACCESS_H__ */
diff --git a/include/uapi/linux/coresight-stm.h b/include/uapi/linux/coresight-stm.h

new file mode 100644 (file)

index 0000000..7e4272c
--- /dev/null
+++ b/include/uapi/linux/coresight-stm.h
@@ -0,0 +1,21 @@
+#ifndef __UAPI_CORESIGHT_STM_H_
+#define __UAPI_CORESIGHT_STM_H_
+
+#define STM_FLAG_TIMESTAMPED   (1UL << 3)
+#define STM_FLAG_GUARANTEED    (1UL << 7)
+
+/*
+ * The CoreSight STM supports guaranteed and invariant timing
+ * transactions.  Guaranteed transactions are guaranteed to be
+ * traced, this might involve stalling the bus or system to
+ * ensure the transaction is accepted by the STM.  While invariant
+ * timing transactions are not guaranteed to be traced, they
+ * will take an invariant amount of time regardless of the
+ * state of the STM.
+ */
+enum {
+       STM_OPTION_GUARANTEED = 0,
+       STM_OPTION_INVARIANT,
+};
+
+#endif
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h

index accb036bbc9c3621d9929dda225b6d0689348860..b283d56c1db97955f0558776fdc98e03294b7ce0 100644 (file)
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -54,6 +54,7 @@
  
  #define SMB_SUPER_MAGIC                0x517B
  #define CGROUP_SUPER_MAGIC     0x27e0eb
+#define CGROUP2_SUPER_MAGIC    0x63677270
  
  
  #define STACK_END_MAGIC                0x57AC6E9D
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h

index d801bb0d9f6d2b08b6be600565716d045d8b85ca..cd2e88450faaddd516a2ae6665744e5888d7bdfd 100644 (file)
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -395,6 +395,7 @@ struct perf_event_attr {
  #define PERF_EVENT_IOC_SET_FILTER      _IOW('$', 6, char *)
  #define PERF_EVENT_IOC_ID              _IOR('$', 7, __u64 *)
  #define PERF_EVENT_IOC_SET_BPF         _IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_SET_DRV_CONFIGS _IOW('$', 10, char *)
  
  enum perf_event_ioc_flags {
         PERF_IOC_FLAG_GROUP             = 1U << 0,
diff --git a/init/Kconfig b/init/Kconfig

index 235c7a2c0d2004f1121b7d98f4140683ab45ef53..e1d1d6936f9228ad1580144563c27a8b2d8d2dd1 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1719,6 +1719,7 @@ choice
  
  config SLAB
         bool "SLAB"
+       select HAVE_HARDENED_USERCOPY_ALLOCATOR
         help
           The regular slab allocator that is established and known to work
           well in all environments. It organizes cache hot objects in
@@ -1726,6 +1727,7 @@ config SLAB
  
  config SLUB
         bool "SLUB (Unqueued Allocator)"
+       select HAVE_HARDENED_USERCOPY_ALLOCATOR
         help
            SLUB is a slab allocator that minimizes cache line usage
            instead of managing queues of cached objects (SLAB approach).
diff --git a/init/main.c b/init/main.c

index 9e64d7097f1ad4d5744755c977cac583debbaf38..fbafa271531cb417a60bb1366baacc04f2e81880 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -93,9 +93,6 @@ static int kernel_init(void *);
  extern void init_IRQ(void);
  extern void fork_init(void);
  extern void radix_tree_init(void);
-#ifndef CONFIG_DEBUG_RODATA
-static inline void mark_rodata_ro(void) { }
-#endif
  
  /*
   * Debug helper: via this flag we know that we are in 'early bootup code'
@@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename)
  
  static noinline void __init kernel_init_freeable(void);
  
+#ifdef CONFIG_DEBUG_RODATA
+static bool rodata_enabled = true;
+static int __init set_debug_rodata(char *str)
+{
+       return strtobool(str, &rodata_enabled);
+}
+__setup("rodata=", set_debug_rodata);
+
+static void mark_readonly(void)
+{
+       if (rodata_enabled)
+               mark_rodata_ro();
+       else
+               pr_info("Kernel memory protection disabled.\n");
+}
+#else
+static inline void mark_readonly(void)
+{
+       pr_warn("This architecture does not have kernel memory protection.\n");
+}
+#endif
+
  static int __ref kernel_init(void *unused)
  {
         int ret;
@@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused)
         /* need to finish all async __init code before freeing the memory */
         async_synchronize_full();
         free_initmem();
-       mark_rodata_ro();
+       mark_readonly();
         system_state = SYSTEM_RUNNING;
         numa_default_policy();
  
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 127c63e02d52b99d6c566ba50af719bde32f49c7..b5946676f84ef0013f07bcce35b10534dd23de04 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
  /* Ditto for the can_fork callback. */
  static unsigned long have_canfork_callback __read_mostly;
  
+static struct file_system_type cgroup2_fs_type;
  static struct cftype cgroup_dfl_base_files[];
  static struct cftype cgroup_legacy_base_files[];
  
@@ -1650,10 +1651,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                         all_ss = true;
                         continue;
                 }
-               if (!strcmp(token, "__DEVEL__sane_behavior")) {
-                       opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
-                       continue;
-               }
                 if (!strcmp(token, "noprefix")) {
                         opts->flags |= CGRP_ROOT_NOPREFIX;
                         continue;
@@ -1720,15 +1717,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
                         return -ENOENT;
         }
  
-       if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-               if (nr_opts != 1) {
-                       pr_err("sane_behavior: no other mount options allowed\n");
-                       return -EINVAL;
-               }
-               return 0;
-       }
-
         /*
          * If the 'all' option was specified select all the subsystems,
          * otherwise if 'none', 'name=' and a subsystem name options were
@@ -2007,6 +1995,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                          int flags, const char *unused_dev_name,
                          void *data)
  {
+       bool is_v2 = fs_type == &cgroup2_fs_type;
         struct super_block *pinned_sb = NULL;
         struct cgroup_subsys *ss;
         struct cgroup_root *root;
@@ -2023,6 +2012,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         if (!use_task_css_set_links)
                 cgroup_enable_task_cg_lists();
  
+       if (is_v2) {
+               if (data) {
+                       pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+                       return ERR_PTR(-EINVAL);
+               }
+               cgrp_dfl_root_visible = true;
+               root = &cgrp_dfl_root;
+               cgroup_get(&root->cgrp);
+               goto out_mount;
+       }
+
         mutex_lock(&cgroup_mutex);
  
         /* First find the desired set of subsystems */
@@ -2030,15 +2030,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         if (ret)
                 goto out_unlock;
  
-       /* look for a matching existing root */
-       if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
-               cgrp_dfl_root_visible = true;
-               root = &cgrp_dfl_root;
-               cgroup_get(&root->cgrp);
-               ret = 0;
-               goto out_unlock;
-       }
-
         /*
          * Destruction of cgroup root is asynchronous, so subsystems may
          * still be dying after the previous unmount.  Let's drain the
@@ -2149,9 +2140,10 @@ out_free:
  
         if (ret)
                 return ERR_PTR(ret);
-
+out_mount:
         dentry = kernfs_mount(fs_type, flags, root->kf_root,
-                               CGROUP_SUPER_MAGIC, &new_sb);
+                             is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
+                             &new_sb);
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
  
@@ -2194,6 +2186,12 @@ static struct file_system_type cgroup_fs_type = {
         .kill_sb = cgroup_kill_sb,
  };
  
+static struct file_system_type cgroup2_fs_type = {
+       .name = "cgroup2",
+       .mount = cgroup_mount,
+       .kill_sb = cgroup_kill_sb,
+};
+
  /**
   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
   * @task: target task
@@ -5383,6 +5381,7 @@ int __init cgroup_init(void)
  
         WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
         WARN_ON(register_filesystem(&cgroup_fs_type));
+       WARN_ON(register_filesystem(&cgroup2_fs_type));
         WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
  
         return 0;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c

index e1dbf4a2c69e4ca9721c22184cb9f800325b9194..90ff129c88a27c50e33be234be695650e7210494 100644 (file)
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
         } else {
                 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
                            __func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
                 if (!bp->bp_type) {
                         kdb_printf("Software breakpoints are unavailable.\n"
-                                  "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+                                  "  Boot the kernel with rodata=off\n"
                                    "  OR use hw breaks: help bph\n");
                 }
-#endif
                 return 1;
         }
         return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c

index bc6371b0e4fb12b1fe4945b5027feceafc223447..b4998fe563dc31068bca3f620393e002228f6667 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1906,8 +1906,13 @@ event_sched_in(struct perf_event *event,
         if (event->state <= PERF_EVENT_STATE_OFF)
                 return 0;
  
-       event->state = PERF_EVENT_STATE_ACTIVE;
-       event->oncpu = smp_processor_id();
+       WRITE_ONCE(event->oncpu, smp_processor_id());
+       /*
+        * Order event::oncpu write to happen before the ACTIVE state
+        * is visible.
+        */
+       smp_wmb();
+       WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
  
         /*
          * Unthrottle events, since we scheduled we might have missed several
@@ -2388,6 +2393,29 @@ void perf_event_enable(struct perf_event *event)
  }
  EXPORT_SYMBOL_GPL(perf_event_enable);
  
+static int __perf_event_stop(void *info)
+{
+       struct perf_event *event = info;
+
+       /* for AUX events, our job is done if the event is already inactive */
+       if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+               return 0;
+
+       /* matches smp_wmb() in event_sched_in() */
+       smp_rmb();
+
+       /*
+        * There is a window with interrupts enabled before we get here,
+        * so we need to check again lest we try to stop another CPU's event.
+        */
+       if (READ_ONCE(event->oncpu) != smp_processor_id())
+               return -EAGAIN;
+
+       event->pmu->stop(event, PERF_EF_UPDATE);
+
+       return 0;
+}
+
  static int _perf_event_refresh(struct perf_event *event, int refresh)
  {
         /*
@@ -3713,6 +3741,9 @@ static void __free_event(struct perf_event *event)
         if (event->destroy)
                 event->destroy(event);
  
+       if (event->pmu->free_drv_configs)
+               event->pmu->free_drv_configs(event);
+
         if (event->ctx)
                 put_ctx(event->ctx);
  
@@ -4265,6 +4296,8 @@ static int perf_event_set_output(struct perf_event *event,
                                  struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_event_drv_configs(struct perf_event *event,
+                                 void __user *arg);
  
  static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
@@ -4321,6 +4354,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
         case PERF_EVENT_IOC_SET_BPF:
                 return perf_event_set_bpf_prog(event, arg);
  
+       case PERF_EVENT_IOC_SET_DRV_CONFIGS:
+               return perf_event_drv_configs(event, (void __user *)arg);
+
         default:
                 return -ENOTTY;
         }
@@ -4353,6 +4389,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
         switch (_IOC_NR(cmd)) {
         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
         case _IOC_NR(PERF_EVENT_IOC_ID):
+       case _IOC_NR(PERF_EVENT_IOC_SET_DRV_CONFIGS):
                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                         cmd &= ~IOCSIZE_MASK;
@@ -4637,6 +4674,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
                 event->pmu->event_mapped(event);
  }
  
+static void perf_pmu_output_stop(struct perf_event *event);
+
  /*
   * A buffer can be mmap()ed multiple times; either directly through the same
   * event, or through other events by use of perf_event_set_output().
@@ -4664,10 +4703,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
          */
         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+               /*
+                * Stop all AUX events that are writing to this buffer,
+                * so that we can free its AUX pages and corresponding PMU
+                * data. Note that after rb::aux_mmap_count dropped to zero,
+                * they won't start any more (see perf_aux_output_begin()).
+                */
+               perf_pmu_output_stop(event);
+
+               /* now it's safe to free the pages */
                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
  
+               /* this has to be the last one */
                 rb_free_aux(rb);
+               WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
                 mutex_unlock(&event->mmap_mutex);
         }
  
@@ -5738,6 +5789,80 @@ next:
         rcu_read_unlock();
  }
  
+struct remote_output {
+       struct ring_buffer      *rb;
+       int                     err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+       struct perf_event *parent = event->parent;
+       struct remote_output *ro = data;
+       struct ring_buffer *rb = ro->rb;
+
+       if (!has_aux(event))
+               return;
+
+       if (!parent)
+               parent = event;
+
+       /*
+        * In case of inheritance, it will be the parent that links to the
+        * ring-buffer, but it will be the child that's actually using it:
+        */
+       if (rcu_dereference(parent->rb) == rb)
+               ro->err = __perf_event_stop(event);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+       struct perf_event *event = info;
+       struct pmu *pmu = event->pmu;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct remote_output ro = {
+               .rb     = event->rb,
+       };
+
+       rcu_read_lock();
+       perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+       if (cpuctx->task_ctx)
+               perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+                                  &ro);
+       rcu_read_unlock();
+
+       return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+       struct perf_event *iter;
+       int err, cpu;
+
+restart:
+       rcu_read_lock();
+       list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+               /*
+                * For per-CPU events, we need to make sure that neither they
+                * nor their children are running; for cpu==-1 events it's
+                * sufficient to stop the event itself if it's active, since
+                * it can't have children.
+                */
+               cpu = iter->cpu;
+               if (cpu == -1)
+                       cpu = READ_ONCE(iter->oncpu);
+
+               if (cpu == -1)
+                       continue;
+
+               err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+               if (err == -EAGAIN) {
+                       rcu_read_unlock();
+                       goto restart;
+               }
+       }
+       rcu_read_unlock();
+}
+
  /*
   * task tracking -- fork/exit
   *
@@ -7164,6 +7289,15 @@ void perf_bp_event(struct perf_event *bp, void *data)
  }
  #endif
  
+static int perf_event_drv_configs(struct perf_event *event,
+                                 void __user *arg)
+{
+       if (!event->pmu->get_drv_configs)
+               return -EINVAL;
+
+       return event->pmu->get_drv_configs(event, arg);
+}
+
  /*
   * hrtimer based swevent callback
   */
@@ -7900,6 +8034,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         INIT_LIST_HEAD(&event->sibling_list);
         INIT_LIST_HEAD(&event->rb_entry);
         INIT_LIST_HEAD(&event->active_entry);
+       INIT_LIST_HEAD(&event->drv_configs);
         INIT_HLIST_NODE(&event->hlist_entry);
  
  
@@ -8482,6 +8617,7 @@ SYSCALL_DEFINE5(perf_event_open,
                                         f_flags);
         if (IS_ERR(event_file)) {
                 err = PTR_ERR(event_file);
+               event_file = NULL;
                 goto err_context;
         }
  
diff --git a/kernel/events/internal.h b/kernel/events/internal.h

index 2bbad9c1274c3199338e653bbb5c8bd640815fa7..2b229fdcfc099f608c601e83b43c2fcdb1eba12b 100644 (file)
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,7 +11,6 @@
  struct ring_buffer {
         atomic_t                        refcount;
         struct rcu_head                 rcu_head;
-       struct irq_work                 irq_work;
  #ifdef CONFIG_PERF_USE_VMALLOC
         struct work_struct              work;
         int                             page_order;     /* allocation order  */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c

index 014b6952819463e165a8f273839af53424c258a3..8c60a4eb408057f9143f2d5c01c8047f145a5463 100644 (file)
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -221,8 +221,6 @@ void perf_output_end(struct perf_output_handle *handle)
         rcu_read_unlock();
  }
  
-static void rb_irq_work(struct irq_work *work);
-
  static void
  ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
  {
@@ -243,16 +241,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
  
         INIT_LIST_HEAD(&rb->event_list);
         spin_lock_init(&rb->event_lock);
-       init_irq_work(&rb->irq_work, rb_irq_work);
-}
-
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
-       if (!atomic_dec_and_test(&rb->refcount))
-               return;
-
-       rb->rcu_head.next = (void *)rb;
-       irq_work_queue(&rb->irq_work);
  }
  
  /*
@@ -264,6 +252,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
   * The ordering is similar to that of perf_output_{begin,end}, with
   * the exception of (B), which should be taken care of by the pmu
   * driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
   */
  void *perf_aux_output_begin(struct perf_output_handle *handle,
                             struct perf_event *event)
@@ -287,6 +279,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
         if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
                 goto err;
  
+       /*
+        * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
+        * the aux buffer is in perf_mmap_close(), about to get freed.
+        */
+       if (!atomic_read(&rb->aux_mmap_count))
+               goto err_put;
+
         /*
          * Nesting is not supported for AUX area, make sure nested
          * writers are caught early
@@ -328,10 +327,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
         return handle->rb->aux_priv;
  
  err_put:
+       /* can't be last */
         rb_free_aux(rb);
  
  err:
-       ring_buffer_put_async(rb);
+       ring_buffer_put(rb);
         handle->event = NULL;
  
         return NULL;
@@ -342,6 +342,10 @@ err:
   * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
   * pmu driver's responsibility to observe ordering rules of the hardware,
   * so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
   */
  void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
                          bool truncated)
@@ -389,8 +393,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
         handle->event = NULL;
  
         local_set(&rb->aux_nest, 0);
+       /* can't be last */
         rb_free_aux(rb);
-       ring_buffer_put_async(rb);
+       ring_buffer_put(rb);
  }
  
  /*
@@ -467,6 +472,33 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
         __free_page(page);
  }
  
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+       int pg;
+
+       /*
+        * Should never happen, the last reference should be dropped from
+        * perf_mmap_close() path, which first stops aux transactions (which
+        * in turn are the atomic holders of aux_refcount) and then does the
+        * last rb_free_aux().
+        */
+       WARN_ON_ONCE(in_atomic());
+
+       if (rb->aux_priv) {
+               rb->free_aux(rb->aux_priv);
+               rb->free_aux = NULL;
+               rb->aux_priv = NULL;
+       }
+
+       if (rb->aux_nr_pages) {
+               for (pg = 0; pg < rb->aux_nr_pages; pg++)
+                       rb_free_aux_page(rb, pg);
+
+               kfree(rb->aux_pages);
+               rb->aux_nr_pages = 0;
+       }
+}
+
  int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
                  pgoff_t pgoff, int nr_pages, long watermark, int flags)
  {
@@ -530,7 +562,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
                         goto out;
         }
  
-       rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+       rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
                                              overwrite);
         if (!rb->aux_priv)
                 goto out;
@@ -555,45 +587,15 @@ out:
         if (!ret)
                 rb->aux_pgoff = pgoff;
         else
-               rb_free_aux(rb);
+               __rb_free_aux(rb);
  
         return ret;
  }
  
-static void __rb_free_aux(struct ring_buffer *rb)
-{
-       int pg;
-
-       if (rb->aux_priv) {
-               rb->free_aux(rb->aux_priv);
-               rb->free_aux = NULL;
-               rb->aux_priv = NULL;
-       }
-
-       if (rb->aux_nr_pages) {
-               for (pg = 0; pg < rb->aux_nr_pages; pg++)
-                       rb_free_aux_page(rb, pg);
-
-               kfree(rb->aux_pages);
-               rb->aux_nr_pages = 0;
-       }
-}
-
  void rb_free_aux(struct ring_buffer *rb)
  {
         if (atomic_dec_and_test(&rb->aux_refcount))
-               irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
-       struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
-       if (!atomic_read(&rb->aux_refcount))
                 __rb_free_aux(rb);
-
-       if (rb->rcu_head.next == (void *)rb)
-               call_rcu(&rb->rcu_head, rb_free_rcu);
  }
  
  #ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c

index da0c09ff6112badb3fa3d4da719f78cee299c1ab..7b1b772ab1ce4f381ce085c82e11005af3f5a2c6 100644 (file)
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1693,8 +1693,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
         int result;
  
         pagefault_disable();
-       result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
-                                                       sizeof(opcode));
+       result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
         pagefault_enable();
  
         if (likely(result == 0))
diff --git a/kernel/futex.c b/kernel/futex.c

index 9d8163afd87ca7605ef85d2ca64d3c4521838fae..e8af73cc51a7cd6bf453aaf03e5ccd5842aacca2 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -681,7 +681,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
         int ret;
  
         pagefault_disable();
-       ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+       ret = __get_user(*dest, from);
         pagefault_enable();
  
         return ret ? -EFAULT : 0;
diff --git a/kernel/power/main.c b/kernel/power/main.c

index b2dd4d999900a26edd9cd26fb952b84b98ee411f..27946975eff004f210d563e73218f1fe6d0b9a9e 100644 (file)
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
         return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
  }
  
-static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
-                                       struct kobj_attribute *attr,
-                                       const char *buf, size_t n)
-{
-       return -EINVAL;
-}
-power_attr(pm_wakeup_irq);
+power_attr_ro(pm_wakeup_irq);
  
  #else /* !CONFIG_PM_SLEEP_DEBUG */
  static inline void pm_print_times_init(void) {}
@@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
         return show_trace_dev_match(buf, PAGE_SIZE);
  }
  
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
-                        const char *buf, size_t n)
-{
-       return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
  
  #endif /* CONFIG_PM_TRACE */
  
diff --git a/kernel/power/power.h b/kernel/power/power.h

index caadb566e82bb51a5348d6ba67a73bda8c99f37d..efe1b3b17c88d0eb793fa84ceec00e5b31604af4 100644 (file)
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = {        \
         .store  = _name##_store,                \
  }
  
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = {  \
+       .attr   = {                             \
+               .name = __stringify(_name),     \
+               .mode = S_IRUGO,                \
+       },                                      \
+       .show   = _name##_show,                 \
+}
+
  /* Preferred image size in bytes (default 500 MB) */
  extern unsigned long image_size;
  /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c

index 12cd989dadf639c3276ca228fef1431284c862ec..160e1006640d585f417ae37ecab304e407971e67 100644 (file)
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -36,6 +36,14 @@
  
  #define HIBERNATE_SIG  "S1SUSPEND"
  
+/*
+ * When reading an {un,}compressed image, we may restore pages in place,
+ * in which case some architectures need these pages cleaning before they
+ * can be executed. We don't know which pages these may be, so clean the lot.
+ */
+static bool clean_pages_on_read;
+static bool clean_pages_on_decompress;
+
  /*
   *     The swap map is a data structure used for keeping track of each page
   *     written to a swap partition.  It consists of many swap_map_page
@@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio)
  
         if (bio_data_dir(bio) == WRITE)
                 put_page(page);
+       else if (clean_pages_on_read)
+               flush_icache_range((unsigned long)page_address(page),
+                                  (unsigned long)page_address(page) + PAGE_SIZE);
  
         if (bio->bi_error && !hb->error)
                 hb->error = bio->bi_error;
@@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle,
  
         hib_init_batch(&hb);
  
+       clean_pages_on_read = true;
         printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
                 nr_to_read);
         m = nr_to_read / 10;
@@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data)
                 d->unc_len = LZO_UNC_SIZE;
                 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
                                                d->unc, &d->unc_len);
+               if (clean_pages_on_decompress)
+                       flush_icache_range((unsigned long)d->unc,
+                                          (unsigned long)d->unc + d->unc_len);
+
                 atomic_set(&d->stop, 1);
                 wake_up(&d->done);
         }
@@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
         }
         memset(crc, 0, offsetof(struct crc_data, go));
  
+       clean_pages_on_decompress = true;
+
         /*
          * Start the decompression threads.
          */
diff --git a/lib/extable.c b/lib/extable.c

index 4cac81ec225e09af4cfd69c36d63a574a8418110..0be02ad561e9e346c01a65434873f9960472047a 100644 (file)
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -14,7 +14,37 @@
  #include <linux/sort.h>
  #include <asm/uaccess.h>
  
+#ifndef ARCH_HAS_RELATIVE_EXTABLE
+#define ex_to_insn(x)  ((x)->insn)
+#else
+static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
+{
+       return (unsigned long)&x->insn + x->insn;
+}
+#endif
+
  #ifndef ARCH_HAS_SORT_EXTABLE
+#ifndef ARCH_HAS_RELATIVE_EXTABLE
+#define swap_ex                NULL
+#else
+static void swap_ex(void *a, void *b, int size)
+{
+       struct exception_table_entry *x = a, *y = b, tmp;
+       int delta = b - a;
+
+       tmp = *x;
+       x->insn = y->insn + delta;
+       y->insn = tmp.insn - delta;
+
+#ifdef swap_ex_entry_fixup
+       swap_ex_entry_fixup(x, y, tmp, delta);
+#else
+       x->fixup = y->fixup + delta;
+       y->fixup = tmp.fixup - delta;
+#endif
+}
+#endif /* ARCH_HAS_RELATIVE_EXTABLE */
+
  /*
   * The exception table needs to be sorted so that the binary
   * search that we use to find entries in it works properly.
@@ -26,9 +56,9 @@ static int cmp_ex(const void *a, const void *b)
         const struct exception_table_entry *x = a, *y = b;
  
         /* avoid overflow */
-       if (x->insn > y->insn)
+       if (ex_to_insn(x) > ex_to_insn(y))
                 return 1;
-       if (x->insn < y->insn)
+       if (ex_to_insn(x) < ex_to_insn(y))
                 return -1;
         return 0;
  }
@@ -37,7 +67,7 @@ void sort_extable(struct exception_table_entry *start,
                   struct exception_table_entry *finish)
  {
         sort(start, finish - start, sizeof(struct exception_table_entry),
-            cmp_ex, NULL);
+            cmp_ex, swap_ex);
  }
  
  #ifdef CONFIG_MODULES
@@ -48,13 +78,15 @@ void sort_extable(struct exception_table_entry *start,
  void trim_init_extable(struct module *m)
  {
         /*trim the beginning*/
-       while (m->num_exentries && within_module_init(m->extable[0].insn, m)) {
+       while (m->num_exentries &&
+              within_module_init(ex_to_insn(&m->extable[0]), m)) {
                 m->extable++;
                 m->num_exentries--;
         }
         /*trim the end*/
         while (m->num_exentries &&
-               within_module_init(m->extable[m->num_exentries-1].insn, m))
+              within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]),
+                                 m))
                 m->num_exentries--;
  }
  #endif /* CONFIG_MODULES */
@@ -81,13 +113,13 @@ search_extable(const struct exception_table_entry *first,
                  * careful, the distance between value and insn
                  * can be larger than MAX_LONG:
                  */
-               if (mid->insn < value)
+               if (ex_to_insn(mid) < value)
                         first = mid + 1;
-               else if (mid->insn > value)
+               else if (ex_to_insn(mid) > value)
                         last = mid - 1;
                 else
                         return mid;
-        }
-        return NULL;
+       }
+       return NULL;
  }
  #endif
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c

index e0af6ff73d146cfa3356080ebcb1bbea36cd1f96..5a003a2ebd967cfe6a9e202ea9a1ae7f1ddf1aba 100644 (file)
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -39,8 +39,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
                 unsigned long c, data;
  
                 /* Fall back to byte-at-a-time if we get a page fault */
-               if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
-                       break;
+               unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);
+
                 *(unsigned long *)(dst+res) = c;
                 if (has_zero(c, &data, &constants)) {
                         data = prep_zero_mask(c, data, &constants);
@@ -55,8 +55,7 @@ byte_at_a_time:
         while (max) {
                 char c;
  
-               if (unlikely(__get_user(c,src+res)))
-                       return -EFAULT;
+               unsafe_get_user(c,src+res, efault);
                 dst[res] = c;
                 if (!c)
                         return res;
@@ -75,6 +74,7 @@ byte_at_a_time:
          * Nope: we hit the address space limit, and we still had more
          * characters the caller would have wanted. That's an EFAULT.
          */
+efault:
         return -EFAULT;
  }
  
@@ -107,7 +107,12 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
         src_addr = (unsigned long)src;
         if (likely(src_addr < max_addr)) {
                 unsigned long max = max_addr - src_addr;
-               return do_strncpy_from_user(dst, src, count, max);
+               long retval;
+
+               user_access_begin();
+               retval = do_strncpy_from_user(dst, src, count, max);
+               user_access_end();
+               return retval;
         }
         return -EFAULT;
  }
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c

index 3a5f2b366d84ed209a012cf62491ca30f6a8bca8..8e105ed4df12bb6bb0a170afff54d979c15d73c0 100644 (file)
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
         src -= align;
         max += align;
  
-       if (unlikely(__get_user(c,(unsigned long __user *)src)))
-               return 0;
+       unsafe_get_user(c, (unsigned long __user *)src, efault);
         c |= aligned_byte_mask(align);
  
         for (;;) {
@@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
                 if (unlikely(max <= sizeof(unsigned long)))
                         break;
                 max -= sizeof(unsigned long);
-               if (unlikely(__get_user(c,(unsigned long __user *)(src+res))))
-                       return 0;
+               unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
         }
         res -= align;
  
@@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
          * Nope: we hit the address space limit, and we still had more
          * characters the caller would have wanted. That's 0.
          */
+efault:
         return 0;
  }
  
@@ -112,7 +111,12 @@ long strnlen_user(const char __user *str, long count)
         src_addr = (unsigned long)str;
         if (likely(src_addr < max_addr)) {
                 unsigned long max = max_addr - src_addr;
-               return do_strnlen_user(str, count, max);
+               long retval;
+
+               user_access_begin();
+               retval = do_strnlen_user(str, count, max);
+               user_access_end();
+               return retval;
         }
         return 0;
  }
@@ -141,7 +145,12 @@ long strlen_user(const char __user *str)
         src_addr = (unsigned long)str;
         if (likely(src_addr < max_addr)) {
                 unsigned long max = max_addr - src_addr;
-               return do_strnlen_user(str, ~0ul, max);
+               long retval;
+
+               user_access_begin();
+               retval = do_strnlen_user(str, ~0ul, max);
+               user_access_end();
+               return retval;
         }
         return 0;
  }
diff --git a/mm/Makefile b/mm/Makefile

index 2ed43191fc3bf78f46f111e88fa9d5a01b8c661a..8b532c94008f2231b2d3d01fc583ff88bb3822ca 100644 (file)
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,9 @@
  KASAN_SANITIZE_slab_common.o := n
  KASAN_SANITIZE_slub.o := n
  
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
  mmu-y                  := nommu.o
  mmu-$(CONFIG_MMU)      := gup.o highmem.o memory.o mincore.o \
                            mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -81,3 +84,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
  obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
  obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
  obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/maccess.c b/mm/maccess.c

index d159b1c96e484d902f6edb34ebb5a83a3977bd57..78f9274dd49d06f11f87beae9520a0e1267dc3ff 100644 (file)
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
         pagefault_disable();
  
         do {
-               ret = __copy_from_user_inatomic(dst++,
-                                               (const void __user __force *)src++, 1);
+               ret = __get_user(*dst++, (const char __user __force *)src++);
         } while (dst[-1] && ret == 0 && src - unsafe_addr < count);
  
         dst[-1] = '\0';
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index fd51ebfc423fe69abf5ab3ed4e54bf020dc71568..1e6769449ac2ee3f4b49ac31882b1ae71dcd49a4 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
         unsigned long nr_pages;
  
         nr_pages = zone_page_state(zone, NR_FREE_PAGES);
-       nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+       /*
+        * Pages reserved for the kernel should not be considered
+        * dirtyable, to prevent a situation where reclaim has to
+        * clean pages in order to balance the zones.
+        */
+       nr_pages -= min(nr_pages, zone->totalreserve_pages);
  
         nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
         nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
         unsigned long x;
  
         x = global_page_state(NR_FREE_PAGES);
-       x -= min(x, dirty_balance_reserve);
+       /*
+        * Pages reserved for the kernel should not be considered
+        * dirtyable, to prevent a situation where reclaim has to
+        * clean pages in order to balance the zones.
+        */
+       x -= min(x, totalreserve_pages);
  
         x += global_page_state(NR_INACTIVE_FILE);
         x += global_page_state(NR_ACTIVE_FILE);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6a117213feb8727380610b71751b434956a6a9f2..ad092093e101d07a410f1bb9e05b5bb02a493ac9 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
  unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory.  This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
  
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -5981,20 +5974,12 @@ static void calculate_totalreserve_pages(void)
  
                         if (max > zone->managed_pages)
                                 max = zone->managed_pages;
+
+                       zone->totalreserve_pages = max;
+
                         reserve_pages += max;
-                       /*
-                        * Lowmem reserves are not available to
-                        * GFP_HIGHUSER page cache allocations and
-                        * kswapd tries to balance zones to their high
-                        * watermark.  As a result, neither should be
-                        * regarded as dirtyable memory, to prevent a
-                        * situation where reclaim has to clean pages
-                        * in order to balance the zones.
-                        */
-                       zone->dirty_balance_reserve = max;
                 }
         }
-       dirty_balance_reserve = reserve_pages;
         totalreserve_pages = reserve_pages;
  }
  
diff --git a/mm/slab.c b/mm/slab.c

index 4765c97ce6900d98b8ce2968cf9ff62a176f6e42..24a615d42d74f9bacd69e45372b871eeba89dbd1 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4228,6 +4228,36 @@ static int __init slab_proc_init(void)
  module_init(slab_proc_init);
  #endif
  
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+                               struct page *page)
+{
+       struct kmem_cache *cachep;
+       unsigned int objnr;
+       unsigned long offset;
+
+       /* Find and validate object. */
+       cachep = page->slab_cache;
+       objnr = obj_to_index(cachep, page, (void *)ptr);
+       BUG_ON(objnr >= cachep->num);
+
+       /* Find offset within object. */
+       offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+       /* Allow address range falling entirely within object size. */
+       if (offset <= cachep->object_size && n <= cachep->object_size - offset)
+               return NULL;
+
+       return cachep->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
  /**
   * ksize - get the actual amount of memory allocated for a given object
   * @objp: Pointer to the object
diff --git a/mm/slub.c b/mm/slub.c

index 65d5f92d51d27ec1e0993cbe194eaaaae9bcd503..41f7cae64a49bfef91d0f992df18fa4aae0cefb7 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
  #endif
  }
  
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+       if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+               p += s->red_left_pad;
+
+       return p;
+}
+
  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
  {
  #ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -224,24 +232,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
   *                     Core slab cache functions
   *******************************************************************/
  
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
-                               struct page *page, const void *object)
-{
-       void *base;
-
-       if (!object)
-               return 1;
-
-       base = page_address(page);
-       if (object < base || object >= base + page->objects * s->size ||
-               (object - base) % s->size) {
-               return 0;
-       }
-
-       return 1;
-}
-
  static inline void *get_freepointer(struct kmem_cache *s, void *object)
  {
         return *(void **)(object + s->offset);
@@ -271,12 +261,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
  
  /* Loop over all objects in a slab */
  #define for_each_object(__p, __s, __addr, __objects) \
-       for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
-                       __p += (__s)->size)
+       for (__p = fixup_red_left(__s, __addr); \
+               __p < (__addr) + (__objects) * (__s)->size; \
+               __p += (__s)->size)
  
  #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
-       for (__p = (__addr), __idx = 1; __idx <= __objects;\
-                       __p += (__s)->size, __idx++)
+       for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+               __idx <= __objects; \
+               __p += (__s)->size, __idx++)
  
  /* Determine object index from a given position */
  static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -456,6 +448,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
                 set_bit(slab_index(p, s, addr), map);
  }
  
+static inline int size_from_object(struct kmem_cache *s)
+{
+       if (s->flags & SLAB_RED_ZONE)
+               return s->size - s->red_left_pad;
+
+       return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+       if (s->flags & SLAB_RED_ZONE)
+               p -= s->red_left_pad;
+
+       return p;
+}
+
  /*
   * Debug settings:
   */
@@ -489,6 +497,26 @@ static inline void metadata_access_disable(void)
  /*
   * Object debugging
   */
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+                               struct page *page, void *object)
+{
+       void *base;
+
+       if (!object)
+               return 1;
+
+       base = page_address(page);
+       object = restore_red_left(s, object);
+       if (object < base || object >= base + page->objects * s->size ||
+               (object - base) % s->size) {
+               return 0;
+       }
+
+       return 1;
+}
+
  static void print_section(char *text, u8 *addr, unsigned int length)
  {
         metadata_access_enable();
@@ -628,7 +656,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
         pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
                p, p - addr, get_freepointer(s, p));
  
-       if (p > addr + 16)
+       if (s->flags & SLAB_RED_ZONE)
+               print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+       else if (p > addr + 16)
                 print_section("Bytes b4 ", p - 16, 16);
  
         print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -645,9 +675,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
         if (s->flags & SLAB_STORE_USER)
                 off += 2 * sizeof(struct track);
  
-       if (off != s->size)
+       if (off != size_from_object(s))
                 /* Beginning of the filler is the free pointer */
-               print_section("Padding ", p + off, s->size - off);
+               print_section("Padding ", p + off, size_from_object(s) - off);
  
         dump_stack();
  }
@@ -677,6 +707,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
  {
         u8 *p = object;
  
+       if (s->flags & SLAB_RED_ZONE)
+               memset(p - s->red_left_pad, val, s->red_left_pad);
+
         if (s->flags & __OBJECT_POISON) {
                 memset(p, POISON_FREE, s->object_size - 1);
                 p[s->object_size - 1] = POISON_END;
@@ -769,11 +802,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
                 /* We also have user information there */
                 off += 2 * sizeof(struct track);
  
-       if (s->size == off)
+       if (size_from_object(s) == off)
                 return 1;
  
         return check_bytes_and_report(s, page, p, "Object padding",
-                               p + off, POISON_INUSE, s->size - off);
+                       p + off, POISON_INUSE, size_from_object(s) - off);
  }
  
  /* Check the pad bytes at the end of a slab page */
@@ -817,6 +850,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
         u8 *endobject = object + s->object_size;
  
         if (s->flags & SLAB_RED_ZONE) {
+               if (!check_bytes_and_report(s, page, object, "Redzone",
+                       object - s->red_left_pad, val, s->red_left_pad))
+                       return 0;
+
                 if (!check_bytes_and_report(s, page, object, "Redzone",
                         endobject, val, s->inuse - s->object_size))
                         return 0;
@@ -1468,7 +1505,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
                         set_freepointer(s, p, NULL);
         }
  
-       page->freelist = start;
+       page->freelist = fixup_red_left(s, start);
         page->inuse = page->objects;
         page->frozen = 1;
  
@@ -3283,7 +3320,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
                  */
                 size += 2 * sizeof(struct track);
  
-       if (flags & SLAB_RED_ZONE)
+       if (flags & SLAB_RED_ZONE) {
                 /*
                  * Add some empty padding so that we can catch
                  * overwrites from earlier objects rather than let
@@ -3292,6 +3329,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
                  * of the object.
                  */
                 size += sizeof(void *);
+
+               s->red_left_pad = sizeof(void *);
+               s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+               size += s->red_left_pad;
+       }
  #endif
  
         /*
@@ -3585,6 +3627,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
  EXPORT_SYMBOL(__kmalloc_node);
  #endif
  
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+                               struct page *page)
+{
+       struct kmem_cache *s;
+       unsigned long offset;
+       size_t object_size;
+
+       /* Find object and usable object size. */
+       s = page->slab_cache;
+       object_size = slab_ksize(s);
+
+       /* Reject impossible pointers. */
+       if (ptr < page_address(page))
+               return s->name;
+
+       /* Find offset within object. */
+       offset = (ptr - page_address(page)) % s->size;
+
+       /* Adjust for redzone and reject if within the redzone. */
+       if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+               if (offset < s->red_left_pad)
+                       return s->name;
+               offset -= s->red_left_pad;
+       }
+
+       /* Allow address range falling entirely within object size. */
+       if (offset <= object_size && n <= object_size - offset)
+               return NULL;
+
+       return s->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
  static size_t __ksize(const void *object)
  {
         struct page *page;
diff --git a/mm/usercopy.c b/mm/usercopy.c

new file mode 100644 (file)

index 0000000..c56b97b
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,277 @@
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+enum {
+       BAD_STACK = -1,
+       NOT_STACK = 0,
+       GOOD_FRAME,
+       GOOD_STACK,
+};
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ *     NOT_STACK: not at all on the stack
+ *     GOOD_FRAME: fully within a valid stack frame
+ *     GOOD_STACK: fully on the stack (when can't do frame-checking)
+ *     BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+       const void * const stack = task_stack_page(current);
+       const void * const stackend = stack + THREAD_SIZE;
+       int ret;
+
+       /* Object is not on the stack at all. */
+       if (obj + len <= stack || stackend <= obj)
+               return NOT_STACK;
+
+       /*
+        * Reject: object partially overlaps the stack (passing the
+        * check above means at least one end is within the stack,
+        * so if this check fails, the other end is outside the stack).
+        */
+       if (obj < stack || stackend < obj + len)
+               return BAD_STACK;
+
+       /* Check if object is safely within a valid frame. */
+       ret = arch_within_stack_frames(stack, stackend, obj, len);
+       if (ret)
+               return ret;
+
+       return GOOD_STACK;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+                           bool to_user, const char *type)
+{
+       pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+               to_user ? "exposure" : "overwrite",
+               to_user ? "from" : "to", ptr, type ? : "unknown", len);
+       /*
+        * For greater effect, it would be nice to do do_group_exit(),
+        * but BUG() actually hooks all the lock-breaking and per-arch
+        * Oops code, so that is used here instead.
+        */
+       BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+                    unsigned long high)
+{
+       unsigned long check_low = (uintptr_t)ptr;
+       unsigned long check_high = check_low + n;
+
+       /* Does not overlap if entirely above or entirely below. */
+       if (check_low >= high || check_high <= low)
+               return false;
+
+       return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+                                                  unsigned long n)
+{
+       unsigned long textlow = (unsigned long)_stext;
+       unsigned long texthigh = (unsigned long)_etext;
+       unsigned long textlow_linear, texthigh_linear;
+
+       if (overlaps(ptr, n, textlow, texthigh))
+               return "<kernel text>";
+
+       /*
+        * Some architectures have virtual memory mappings with a secondary
+        * mapping of the kernel text, i.e. there is more than one virtual
+        * kernel address that points to the kernel image. It is usually
+        * when there is a separate linear physical memory mapping, in that
+        * __pa() is not just the reverse of __va(). This can be detected
+        * and checked:
+        */
+       textlow_linear = (unsigned long)__va(__pa(textlow));
+       /* No different mapping: we're done. */
+       if (textlow_linear == textlow)
+               return NULL;
+
+       /* Check the secondary mapping... */
+       texthigh_linear = (unsigned long)__va(__pa(texthigh));
+       if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+               return "<linear kernel text>";
+
+       return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+       /* Reject if object wraps past end of memory. */
+       if (ptr + n < ptr)
+               return "<wrapped address>";
+
+       /* Reject if NULL or ZERO-allocation. */
+       if (ZERO_OR_NULL_PTR(ptr))
+               return "<null>";
+
+       return NULL;
+}
+
+/* Checks for allocs that are marked in some way as spanning multiple pages. */
+static inline const char *check_page_span(const void *ptr, unsigned long n,
+                                         struct page *page, bool to_user)
+{
+#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
+       const void *end = ptr + n - 1;
+       struct page *endpage;
+       bool is_reserved, is_cma;
+
+       /*
+        * Sometimes the kernel data regions are not marked Reserved (see
+        * check below). And sometimes [_sdata,_edata) does not cover
+        * rodata and/or bss, so check each range explicitly.
+        */
+
+       /* Allow reads of kernel rodata region (if not marked as Reserved). */
+       if (ptr >= (const void *)__start_rodata &&
+           end <= (const void *)__end_rodata) {
+               if (!to_user)
+                       return "<rodata>";
+               return NULL;
+       }
+
+       /* Allow kernel data region (if not marked as Reserved). */
+       if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+               return NULL;
+
+       /* Allow kernel bss region (if not marked as Reserved). */
+       if (ptr >= (const void *)__bss_start &&
+           end <= (const void *)__bss_stop)
+               return NULL;
+
+       /* Is the object wholly within one base page? */
+       if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+                  ((unsigned long)end & (unsigned long)PAGE_MASK)))
+               return NULL;
+
+       /* Allow if fully inside the same compound (__GFP_COMP) page. */
+       endpage = virt_to_head_page(end);
+       if (likely(endpage == page))
+               return NULL;
+
+       /*
+        * Allow if range is entirely either Reserved (i.e. special or
+        * device memory), or CMA. Otherwise, reject since the object spans
+        * several independently allocated pages.
+        */
+       is_reserved = PageReserved(page);
+       is_cma = is_migrate_cma_page(page);
+       if (!is_reserved && !is_cma)
+               return "<spans multiple pages>";
+
+       for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+               page = virt_to_head_page(ptr);
+               if (is_reserved && !PageReserved(page))
+                       return "<spans Reserved and non-Reserved pages>";
+               if (is_cma && !is_migrate_cma_page(page))
+                       return "<spans CMA and non-CMA pages>";
+       }
+#endif
+
+       return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+                                           bool to_user)
+{
+       struct page *page;
+
+       /*
+        * Some architectures (arm64) return true for virt_addr_valid() on
+        * vmalloced addresses. Work around this by checking for vmalloc
+        * first.
+        */
+       if (is_vmalloc_addr(ptr))
+               return NULL;
+
+       if (!virt_addr_valid(ptr))
+               return NULL;
+
+       page = virt_to_head_page(ptr);
+
+       /* Check slab allocator for flags and size. */
+       if (PageSlab(page))
+               return __check_heap_object(ptr, n, page);
+
+       /* Verify object does not incorrectly span multiple pages. */
+       return check_page_span(ptr, n, page, to_user);
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+       const char *err;
+
+       /* Skip all tests if size is zero. */
+       if (!n)
+               return;
+
+       /* Check for invalid addresses. */
+       err = check_bogus_address(ptr, n);
+       if (err)
+               goto report;
+
+       /* Check for bad heap object. */
+       err = check_heap_object(ptr, n, to_user);
+       if (err)
+               goto report;
+
+       /* Check for bad stack object. */
+       switch (check_stack_object(ptr, n)) {
+       case NOT_STACK:
+               /* Object is not touching the current process stack. */
+               break;
+       case GOOD_FRAME:
+       case GOOD_STACK:
+               /*
+                * Object is either in the correct frame (when it
+                * is possible to check) or just generally on the
+                * process stack (when frame checking not available).
+                */
+               return;
+       default:
+               err = "<process stack>";
+               goto report;
+       }
+
+       /* Check for object in kernel to avoid text exposure. */
+       err = check_kernel_text_object(ptr, n);
+       if (!err)
+               return;
+
+report:
+       report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c

index 727eb21c9c5624f2998f321d59db1017339c69a3..83795435bd991b117324424d7a1a2cb2985786ad 100644 (file)
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -42,6 +42,11 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs)
                         " ex1 = 0x%lx\n",
                 p->addr, regs->pc, regs->ex1);
  #endif
+#ifdef CONFIG_ARM64
+       pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx,"
+                       " pstate = 0x%lx\n",
+               p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
+#endif
  
         /* A dump_stack() here will give a stack backtrace */
         return 0;
@@ -67,6 +72,10 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs,
         printk(KERN_INFO "post_handler: p->addr = 0x%p, ex1 = 0x%lx\n",
                 p->addr, regs->ex1);
  #endif
+#ifdef CONFIG_ARM64
+       pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
+               p->symbol_name, p->addr, (long)regs->pstate);
+#endif
  }
  
  /*
diff --git a/scripts/sortextable.c b/scripts/sortextable.c

index c2423d913b46bd0e659ea4d4c057a3af6119c2d4..a2c0d620ca80fcca79260a1899e8a8d38c354247 100644 (file)
--- a/scripts/sortextable.c
+++ b/scripts/sortextable.c
@@ -266,9 +266,9 @@ do_file(char const *const fname)
                 break;
         }  /* end switch */
         if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0
-       ||  r2(&ehdr->e_type) != ET_EXEC
+       ||  (r2(&ehdr->e_type) != ET_EXEC && r2(&ehdr->e_type) != ET_DYN)
         ||  ehdr->e_ident[EI_VERSION] != EV_CURRENT) {
-               fprintf(stderr, "unrecognized ET_EXEC file %s\n", fname);
+               fprintf(stderr, "unrecognized ET_EXEC/ET_DYN file %s\n", fname);
                 fail_file();
         }
  
@@ -282,12 +282,13 @@ do_file(char const *const fname)
         case EM_386:
         case EM_X86_64:
         case EM_S390:
+       case EM_AARCH64:
+       case EM_PARISC:
                 custom_sort = sort_relative_table;
                 break;
         case EM_ARCOMPACT:
         case EM_ARCV2:
         case EM_ARM:
-       case EM_AARCH64:
         case EM_MICROBLAZE:
         case EM_MIPS:
         case EM_XTENSA:
@@ -304,7 +305,7 @@ do_file(char const *const fname)
                 if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr)
                 ||  r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) {
                         fprintf(stderr,
-                               "unrecognized ET_EXEC file: %s\n", fname);
+                               "unrecognized ET_EXEC/ET_DYN file: %s\n", fname);
                         fail_file();
                 }
                 do32(ehdr, fname, custom_sort);
@@ -314,7 +315,7 @@ do_file(char const *const fname)
                 if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr)
                 ||  r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) {
                         fprintf(stderr,
-                               "unrecognized ET_EXEC file: %s\n", fname);
+                               "unrecognized ET_EXEC/ET_DYN file: %s\n", fname);
                         fail_file();
                 }
                 do64(ghdr, fname, custom_sort);
diff --git a/security/Kconfig b/security/Kconfig

index e45237897b435f8fbf64950f580b9730f79eb1f4..ddb3e8a8d9bd4cf0ae9c9e77f719c7009035ddda 100644 (file)
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -118,6 +118,45 @@ config LSM_MMAP_MIN_ADDR
           this low address space will need the permission specific to the
           systems running LSM.
  
+config HAVE_HARDENED_USERCOPY_ALLOCATOR
+       bool
+       help
+         The heap allocator implements __check_heap_object() for
+         validating memory ranges against heap object sizes in
+         support of CONFIG_HARDENED_USERCOPY.
+
+config HAVE_ARCH_HARDENED_USERCOPY
+       bool
+       help
+         The architecture supports CONFIG_HARDENED_USERCOPY by
+         calling check_object_size() just before performing the
+         userspace copies in the low level implementation of
+         copy_to_user() and copy_from_user().
+
+config HARDENED_USERCOPY
+       bool "Harden memory copies between kernel and userspace"
+       depends on HAVE_ARCH_HARDENED_USERCOPY
+       select BUG
+       help
+         This option checks for obviously wrong memory regions when
+         copying memory to/from the kernel (via copy_to_user() and
+         copy_from_user() functions) by rejecting memory ranges that
+         are larger than the specified heap object, span multiple
+         separately allocated pages, are not on the process stack,
+         or are part of the kernel text. This kills entire classes
+         of heap overflow exploits and similar kernel memory exposures.
+
+config HARDENED_USERCOPY_PAGESPAN
+       bool "Refuse to copy allocations that span multiple pages"
+       depends on HARDENED_USERCOPY
+       depends on !COMPILE_TEST
+       help
+         When a multi-page allocation is done without __GFP_COMP,
+         hardened usercopy will reject attempts to copy it. There are,
+         however, several cases of this in the kernel that have not all
+         been removed. This config is intended to be used only while
+         trying to find such users.
+
  source security/selinux/Kconfig
  source security/smack/Kconfig
  source security/tomoyo/Kconfig
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST

index 39c38cb45b00f8e3e478bfebab6a47a9bf4c3dd7..eeb21eb438989000d0e071ef140880b58b3a4497 100644 (file)
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -57,6 +57,7 @@ include/asm-generic/bitops/const_hweight.h
  include/asm-generic/bitops/fls64.h
  include/asm-generic/bitops/__fls.h
  include/asm-generic/bitops/fls.h
+include/linux/coresight-pmu.h
  include/linux/perf_event.h
  include/linux/list.h
  include/linux/hash.h
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf

index 929a32ba15f500eb5dd5210fd60c549f00187f1a..74c265e0ffa0f3088a88b07e353254694559c668 100644 (file)
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -77,6 +77,9 @@ include config/utilities.mak
  # Define NO_AUXTRACE if you do not want AUX area tracing support
  #
  # Define NO_LIBBPF if you do not want BPF support
+#
+# Define NO_CSTRACE if you do not want CoreSight trace decoding support
+#
  
  # As per kernel Makefile, avoid funny character set dependencies
  unexport LC_ALL
diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build

index d22e3d07de3d69b825155218ae0a46b9d257b47e..71de3fc405029b918f8b9131d712f3577c9b857f 100644 (file)
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -2,3 +2,5 @@ libperf-$(CONFIG_DWARF) += dwarf-regs.o
  
  libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
  libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+
+libperf-$(CONFIG_AUXTRACE) += pmu.o auxtrace.o cs-etm.o
diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c

new file mode 100644 (file)

index 0000000..95c38b6
--- /dev/null
+++ b/tools/perf/arch/arm/util/auxtrace.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdbool.h>
+#include <linux/coresight-pmu.h>
+
+#include "../../util/auxtrace.h"
+#include "../../util/evlist.h"
+#include "../../util/pmu.h"
+#include "cs-etm.h"
+
+struct auxtrace_record
+*auxtrace_record__init(struct perf_evlist *evlist, int *err)
+{
+       struct perf_pmu *cs_etm_pmu;
+       struct perf_evsel *evsel;
+       bool found_etm = false;
+
+       cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME);
+
+       if (evlist) {
+               evlist__for_each(evlist, evsel) {
+                       if (cs_etm_pmu &&
+                           evsel->attr.type == cs_etm_pmu->type)
+                               found_etm = true;
+               }
+       }
+
+       if (found_etm)
+               return cs_etm_record_init(err);
+
+       /*
+        * Clear 'err' even if we haven't found a cs_etm event - that way perf
+        * record can still be used even if tracers aren't present.  The NULL
+        * return value will take care of telling the infrastructure HW tracing
+        * isn't available.
+        */
+       *err = 0;
+       return NULL;
+}
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c

new file mode 100644 (file)

index 0000000..13a2188
--- /dev/null
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -0,0 +1,563 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <api/fs/fs.h>
+#include <linux/bitops.h>
+#include <linux/coresight-pmu.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/types.h>
+
+#include "cs-etm.h"
+#include "../../perf.h"
+#include "../../util/auxtrace.h"
+#include "../../util/cpumap.h"
+#include "../../util/evlist.h"
+#include "../../util/pmu.h"
+#include "../../util/thread_map.h"
+#include "../../util/cs-etm.h"
+
+#include <stdlib.h>
+
+struct cs_etm_recording {
+       struct auxtrace_record  itr;
+       struct perf_pmu         *cs_etm_pmu;
+       struct perf_evlist      *evlist;
+       bool                    snapshot_mode;
+       size_t                  snapshot_size;
+};
+
+static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu);
+
+static int cs_etm_parse_snapshot_options(struct auxtrace_record *itr,
+                                        struct record_opts *opts,
+                                        const char *str)
+{
+       struct cs_etm_recording *ptr =
+                               container_of(itr, struct cs_etm_recording, itr);
+       unsigned long long snapshot_size = 0;
+       char *endptr;
+
+       if (str) {
+               snapshot_size = strtoull(str, &endptr, 0);
+               if (*endptr || snapshot_size > SIZE_MAX)
+                       return -1;
+       }
+
+       opts->auxtrace_snapshot_mode = true;
+       opts->auxtrace_snapshot_size = snapshot_size;
+       ptr->snapshot_size = snapshot_size;
+
+       return 0;
+}
+
+static int cs_etm_recording_options(struct auxtrace_record *itr,
+                                   struct perf_evlist *evlist,
+                                   struct record_opts *opts)
+{
+       struct cs_etm_recording *ptr =
+                               container_of(itr, struct cs_etm_recording, itr);
+       struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+       struct perf_evsel *evsel, *cs_etm_evsel = NULL;
+       const struct cpu_map *cpus = evlist->cpus;
+       bool privileged = (geteuid() == 0 || perf_event_paranoid() < 0);
+
+       ptr->evlist = evlist;
+       ptr->snapshot_mode = opts->auxtrace_snapshot_mode;
+
+       evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type == cs_etm_pmu->type) {
+                       if (cs_etm_evsel) {
+                               pr_err("There may be only one %s event\n",
+                                      CORESIGHT_ETM_PMU_NAME);
+                               return -EINVAL;
+                       }
+                       evsel->attr.freq = 0;
+                       evsel->attr.sample_period = 1;
+                       cs_etm_evsel = evsel;
+                       opts->full_auxtrace = true;
+               }
+       }
+
+       /* no need to continue if no event of interest was found */
+       if (!cs_etm_evsel)
+               return 0;
+
+       if (opts->use_clockid) {
+               pr_err("Cannot use clockid (-k option) with %s\n",
+                      CORESIGHT_ETM_PMU_NAME);
+               return -EINVAL;
+       }
+
+       /* we are in snapshot mode */
+       if (opts->auxtrace_snapshot_mode) {
+               /*
+                * No size was given to '-S' or '-m,', so go with
+                * the default
+                */
+               if (!opts->auxtrace_snapshot_size &&
+                   !opts->auxtrace_mmap_pages) {
+                       if (privileged) {
+                               opts->auxtrace_mmap_pages = MiB(4) / page_size;
+                       } else {
+                               opts->auxtrace_mmap_pages =
+                                                       KiB(128) / page_size;
+                               if (opts->mmap_pages == UINT_MAX)
+                                       opts->mmap_pages = KiB(256) / page_size;
+                       }
+               } else if (!opts->auxtrace_mmap_pages && !privileged &&
+                                               opts->mmap_pages == UINT_MAX) {
+                       opts->mmap_pages = KiB(256) / page_size;
+               }
+
+               /*
+                * '-m,xyz' was specified but no snapshot size, so make the
+                * snapshot size as big as the auxtrace mmap area.
+                */
+               if (!opts->auxtrace_snapshot_size) {
+                       opts->auxtrace_snapshot_size =
+                               opts->auxtrace_mmap_pages * (size_t)page_size;
+               }
+
+               /*
+                * -Sxyz was specified but no auxtrace mmap area, so make the
+                * auxtrace mmap area big enough to fit the requested snapshot
+                * size.
+                */
+               if (!opts->auxtrace_mmap_pages) {
+                       size_t sz = opts->auxtrace_snapshot_size;
+
+                       sz = round_up(sz, page_size) / page_size;
+                       opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
+               }
+
+               /* Snapshot size can't be bigger than the auxtrace area */
+               if (opts->auxtrace_snapshot_size >
+                               opts->auxtrace_mmap_pages * (size_t)page_size) {
+                       pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
+                              opts->auxtrace_snapshot_size,
+                              opts->auxtrace_mmap_pages * (size_t)page_size);
+                       return -EINVAL;
+               }
+
+               /* Something went wrong somewhere - this shouldn't happen */
+               if (!opts->auxtrace_snapshot_size ||
+                   !opts->auxtrace_mmap_pages) {
+                       pr_err("Failed to calculate default snapshot size and/or AUX area tracing mmap pages\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* We are in full trace mode but '-m,xyz' wasn't specified */
+       if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) {
+               if (privileged) {
+                       opts->auxtrace_mmap_pages = MiB(4) / page_size;
+               } else {
+                       opts->auxtrace_mmap_pages = KiB(128) / page_size;
+                       if (opts->mmap_pages == UINT_MAX)
+                               opts->mmap_pages = KiB(256) / page_size;
+               }
+
+       }
+
+       /* Validate auxtrace_mmap_pages provided by user */
+       if (opts->auxtrace_mmap_pages) {
+               unsigned int max_page = (KiB(128) / page_size);
+               size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size;
+
+               if (!privileged &&
+                   opts->auxtrace_mmap_pages > max_page) {
+                       opts->auxtrace_mmap_pages = max_page;
+                       pr_err("auxtrace too big, truncating to %d\n",
+                              max_page);
+               }
+
+               if (!is_power_of_2(sz)) {
+                       pr_err("Invalid mmap size for %s: must be a power of 2\n",
+                              CORESIGHT_ETM_PMU_NAME);
+                       return -EINVAL;
+               }
+       }
+
+       if (opts->auxtrace_snapshot_mode)
+               pr_debug2("%s snapshot size: %zu\n", CORESIGHT_ETM_PMU_NAME,
+                         opts->auxtrace_snapshot_size);
+
+       if (cs_etm_evsel) {
+               /*
+                * To obtain the auxtrace buffer file descriptor, the auxtrace
+                * event must come first.
+                */
+               perf_evlist__to_front(evlist, cs_etm_evsel);
+               /*
+                * In the case of per-cpu mmaps, we need the CPU on the
+                * AUX event.
+                */
+               if (!cpu_map__empty(cpus))
+                       perf_evsel__set_sample_bit(cs_etm_evsel, CPU);
+       }
+
+       /* Add dummy event to keep tracking */
+       if (opts->full_auxtrace) {
+               struct perf_evsel *tracking_evsel;
+               int err;
+
+               err = parse_events(evlist, "dummy:u", NULL);
+               if (err)
+                       return err;
+
+               tracking_evsel = perf_evlist__last(evlist);
+               perf_evlist__set_tracking_event(evlist, tracking_evsel);
+
+               tracking_evsel->attr.freq = 0;
+               tracking_evsel->attr.sample_period = 1;
+
+               /* In per-cpu case, always need the time of mmap events etc */
+               if (!cpu_map__empty(cpus))
+                       perf_evsel__set_sample_bit(tracking_evsel, TIME);
+       }
+
+       return 0;
+}
+
+static u64 cs_etm_get_config(struct auxtrace_record *itr)
+{
+       u64 config = 0;
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+       struct perf_evlist *evlist = ptr->evlist;
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type == cs_etm_pmu->type) {
+                       /*
+                        * Variable perf_event_attr::config is assigned to
+                        * ETMv3/PTM.  The bit fields have been made to match
+                        * the ETMv3.5 ETMCR register specification.  See the
+                        * PMU_FORMAT_ATTR() declarations in
+                        * drivers/hwtracing/coresight/coresight-perf.c for
+                        * details.
+                        */
+                       config = evsel->attr.config;
+                       break;
+               }
+       }
+
+       return config;
+}
+
+static size_t
+cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                     struct perf_evlist *evlist __maybe_unused)
+{
+       int i;
+       int etmv3 = 0, etmv4 = 0;
+       const struct cpu_map *cpus = evlist->cpus;
+
+       /* cpu map is not empty, we have specific CPUs to work with */
+       if (!cpu_map__empty(cpus)) {
+               for (i = 0; i < cpu_map__nr(cpus); i++) {
+                       if (cs_etm_is_etmv4(itr, cpus->map[i]))
+                               etmv4++;
+                       else
+                               etmv3++;
+               }
+       } else {
+               /* get configuration for all CPUs in the system */
+               for (i = 0; i < cpu__max_cpu(); i++) {
+                       if (cs_etm_is_etmv4(itr, i))
+                               etmv4++;
+                       else
+                               etmv3++;
+               }
+       }
+
+       return (CS_ETM_HEADER_SIZE +
+              (etmv4 * CS_ETMV4_PRIV_SIZE) +
+              (etmv3 * CS_ETMV3_PRIV_SIZE));
+}
+
+static const char *metadata_etmv3_ro[CS_ETM_PRIV_MAX] = {
+       [CS_ETM_ETMCCER]        = "mgmt/etmccer",
+       [CS_ETM_ETMIDR]         = "mgmt/etmidr",
+};
+
+static const char *metadata_etmv4_ro[CS_ETMV4_PRIV_MAX] = {
+       [CS_ETMV4_TRCIDR0]              = "trcidr/trcidr0",
+       [CS_ETMV4_TRCIDR1]              = "trcidr/trcidr1",
+       [CS_ETMV4_TRCIDR2]              = "trcidr/trcidr2",
+       [CS_ETMV4_TRCIDR8]              = "trcidr/trcidr8",
+       [CS_ETMV4_TRCAUTHSTATUS]        = "mgmt/trcauthstatus",
+};
+
+static bool cs_etm_is_etmv4(struct auxtrace_record *itr, int cpu)
+{
+       bool ret = false;
+       char path[PATH_MAX];
+       int scan;
+       unsigned int val;
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+
+       /* Take any of the RO files for ETMv4 and see if it is present */
+       snprintf(path, PATH_MAX, "cpu%d/%s",
+                cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0]);
+       scan = perf_pmu__scan_file(cs_etm_pmu, path, "%x", &val);
+
+       /* The file was read successfully, we have a winner */
+       if (scan == 1)
+               ret = true;
+
+       return ret;
+}
+
+static int cs_etm_get_ro(struct perf_pmu *pmu, int cpu, const char *path)
+{
+       char pmu_path[PATH_MAX];
+       int scan;
+       unsigned int val = 0;
+
+       /* Get RO metadata from sysfs */
+       snprintf(pmu_path, PATH_MAX, "cpu%d/%s", cpu, path);
+
+       scan = perf_pmu__scan_file(pmu, pmu_path, "%x", &val);
+       if (scan != 1)
+               pr_err("%s: error reading: %s\n", __func__, pmu_path);
+
+       return val;
+}
+
+static void cs_etm_get_metadata(int cpu, u32 *offset,
+                               struct auxtrace_record *itr,
+                               struct auxtrace_info_event *info)
+{
+       u32 increment;
+       u64 magic;
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+
+       /* first see what kind of tracer this cpu is affined to */
+       if (cs_etm_is_etmv4(itr, cpu)) {
+               magic = __perf_cs_etmv4_magic;
+               /* Get trace configuration register */
+               info->priv[*offset + CS_ETMV4_TRCCONFIGR] =
+                                               cs_etm_get_config(itr);
+               /* Get traceID from the framework */
+               info->priv[*offset + CS_ETMV4_TRCTRACEIDR] =
+                                               coresight_get_trace_id(cpu);
+               /* Get read-only information from sysfs */
+               info->priv[*offset + CS_ETMV4_TRCIDR0] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv4_ro[CS_ETMV4_TRCIDR0]);
+               info->priv[*offset + CS_ETMV4_TRCIDR1] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv4_ro[CS_ETMV4_TRCIDR1]);
+               info->priv[*offset + CS_ETMV4_TRCIDR2] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv4_ro[CS_ETMV4_TRCIDR2]);
+               info->priv[*offset + CS_ETMV4_TRCIDR8] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv4_ro[CS_ETMV4_TRCIDR8]);
+               info->priv[*offset + CS_ETMV4_TRCAUTHSTATUS] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv4_ro
+                                     [CS_ETMV4_TRCAUTHSTATUS]);
+
+               /* How much space was used */
+               increment = CS_ETMV4_PRIV_MAX;
+       } else {
+               magic = __perf_cs_etmv3_magic;
+               /* Get configuration register */
+               info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr);
+               /* Get traceID from the framework */
+               info->priv[*offset + CS_ETM_ETMTRACEIDR] =
+                                               coresight_get_trace_id(cpu);
+               /* Get read-only information from sysfs */
+               info->priv[*offset + CS_ETM_ETMCCER] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv3_ro[CS_ETM_ETMCCER]);
+               info->priv[*offset + CS_ETM_ETMIDR] =
+                       cs_etm_get_ro(cs_etm_pmu, cpu,
+                                     metadata_etmv3_ro[CS_ETM_ETMIDR]);
+
+               /* How much space was used */
+               increment = CS_ETM_PRIV_MAX;
+       }
+
+       /* Build generic header portion */
+       info->priv[*offset + CS_ETM_MAGIC] = magic;
+       info->priv[*offset + CS_ETM_CPU] = cpu;
+       /* Where the next CPU entry should start from */
+       *offset += increment;
+}
+
+static int cs_etm_info_fill(struct auxtrace_record *itr,
+                           struct perf_session *session,
+                           struct auxtrace_info_event *info,
+                           size_t priv_size)
+{
+       int i;
+       u32 offset;
+       u64 nr_cpu, type;
+       const struct cpu_map *cpus = session->evlist->cpus;
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+
+       if (priv_size != cs_etm_info_priv_size(itr, session->evlist))
+               return -EINVAL;
+
+       if (!session->evlist->nr_mmaps)
+               return -EINVAL;
+
+       /* If the cpu_map is empty all CPUs are involved */
+       nr_cpu = cpu_map__empty(cpus) ? cpu__max_cpu() : cpu_map__nr(cpus);
+       /* Get PMU type as dynamically assigned by the core */
+       type = cs_etm_pmu->type;
+
+       /* First fill out the session header */
+       info->type = PERF_AUXTRACE_CS_ETM;
+       info->priv[CS_HEADER_VERSION_0] = 0;
+       info->priv[CS_PMU_TYPE_CPUS] = type << 32;
+       info->priv[CS_PMU_TYPE_CPUS] |= nr_cpu;
+       info->priv[CS_ETM_SNAPSHOT] = ptr->snapshot_mode;
+
+       offset = CS_ETM_SNAPSHOT + 1;
+
+       /* cpu map is not empty, we have specific CPUs to work with */
+       if (!cpu_map__empty(cpus)) {
+               for (i = 0; i < cpu_map__nr(cpus) && offset < priv_size; i++)
+                       cs_etm_get_metadata(cpus->map[i], &offset, itr, info);
+       } else {
+               /* get configuration for all CPUs in the system */
+               for (i = 0; i < cpu__max_cpu(); i++)
+                       cs_etm_get_metadata(i, &offset, itr, info);
+       }
+
+       return 0;
+}
+
+static int cs_etm_find_snapshot(struct auxtrace_record *itr __maybe_unused,
+                               int idx, struct auxtrace_mmap *mm,
+                               unsigned char *data __maybe_unused,
+                               u64 *head, u64 *old)
+{
+       pr_debug3("%s: mmap index %d old head %zu new head %zu size %zu\n",
+                 __func__, idx, (size_t)*old, (size_t)*head, mm->len);
+
+       *old = *head;
+       *head += mm->len;
+
+       return 0;
+}
+
+static int cs_etm_snapshot_start(struct auxtrace_record *itr)
+{
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               if (evsel->attr.type == ptr->cs_etm_pmu->type)
+                       return perf_evsel__disable(evsel);
+       }
+       return -EINVAL;
+}
+
+static int cs_etm_snapshot_finish(struct auxtrace_record *itr)
+{
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               int nthreads = thread_map__nr(evsel->threads);
+               int ncpus = cpu_map__nr(evsel->cpus);
+
+               if (evsel->attr.type == ptr->cs_etm_pmu->type) {
+                       return perf_evsel__enable(evsel, ncpus, nthreads);
+               }
+       }
+       return -EINVAL;
+}
+
+static u64 cs_etm_reference(struct auxtrace_record *itr __maybe_unused)
+{
+       return (((u64) rand() <<  0) & 0x00000000FFFFFFFFull) |
+               (((u64) rand() << 32) & 0xFFFFFFFF00000000ull);
+}
+
+static void cs_etm_recording_free(struct auxtrace_record *itr)
+{
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       free(ptr);
+}
+
+static int cs_etm_read_finish(struct auxtrace_record *itr, int idx)
+{
+       struct cs_etm_recording *ptr =
+                       container_of(itr, struct cs_etm_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               if (evsel->attr.type == ptr->cs_etm_pmu->type)
+                       return perf_evlist__enable_event_idx(ptr->evlist,
+                                                            evsel, idx);
+       }
+
+       return -EINVAL;
+}
+
+struct auxtrace_record *cs_etm_record_init(int *err)
+{
+       struct perf_pmu *cs_etm_pmu;
+       struct cs_etm_recording *ptr;
+
+       cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME);
+
+       if (!cs_etm_pmu) {
+               *err = -EINVAL;
+               goto out;
+       }
+
+       ptr = zalloc(sizeof(struct cs_etm_recording));
+       if (!ptr) {
+               *err = -ENOMEM;
+               goto out;
+       }
+
+       ptr->cs_etm_pmu                 = cs_etm_pmu;
+       ptr->itr.parse_snapshot_options = cs_etm_parse_snapshot_options;
+       ptr->itr.recording_options      = cs_etm_recording_options;
+       ptr->itr.info_priv_size         = cs_etm_info_priv_size;
+       ptr->itr.info_fill              = cs_etm_info_fill;
+       ptr->itr.find_snapshot          = cs_etm_find_snapshot;
+       ptr->itr.snapshot_start         = cs_etm_snapshot_start;
+       ptr->itr.snapshot_finish        = cs_etm_snapshot_finish;
+       ptr->itr.reference              = cs_etm_reference;
+       ptr->itr.free                   = cs_etm_recording_free;
+       ptr->itr.read_finish            = cs_etm_read_finish;
+
+       *err = 0;
+       return &ptr->itr;
+out:
+       return NULL;
+}
diff --git a/tools/perf/arch/arm/util/cs-etm.h b/tools/perf/arch/arm/util/cs-etm.h

new file mode 100644 (file)

index 0000000..909f486
--- /dev/null
+++ b/tools/perf/arch/arm/util/cs-etm.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef INCLUDE__PERF_CS_ETM_H__
+#define INCLUDE__PERF_CS_ETM_H__
+
+struct auxtrace_record *cs_etm_record_init(int *err);
+
+#endif
diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c

new file mode 100644 (file)

index 0000000..af9fb66
--- /dev/null
+++ b/tools/perf/arch/arm/util/pmu.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+#include <linux/coresight-pmu.h>
+#include <linux/perf_event.h>
+
+#include "../../util/pmu.h"
+
+struct perf_event_attr
+*perf_pmu__get_default_config(struct perf_pmu *pmu __maybe_unused)
+{
+#ifdef HAVE_AUXTRACE_SUPPORT
+       if (!strcmp(pmu->name, CORESIGHT_ETM_PMU_NAME)) {
+               /* add ETM default config here */
+               pmu->selectable = true;
+       }
+#endif
+       return NULL;
+}
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build

index e58123a8912b8a70eb96432aa4e6cc7444cc8e37..f92918154fec87157c50bf64998c93ea3193b824 100644 (file)
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,2 +1,6 @@
  libperf-$(CONFIG_DWARF)     += dwarf-regs.o
  libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
+
+libperf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \
+                             ../../arm/util/auxtrace.o \
+                             ../../arm/util/cs-etm.o
diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c

index 9b94ce5209170fcb6105723aea00bf1214fc4032..4685a40777cc7e2c4806f8af38d57abc077ccbd4 100644 (file)
--- a/tools/perf/arch/x86/util/intel-bts.c
+++ b/tools/perf/arch/x86/util/intel-bts.c
@@ -60,7 +60,9 @@ struct branch {
         u64 misc;
  };
  
-static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                        struct perf_evlist *evlist __maybe_unused)
  {
         return INTEL_BTS_AUXTRACE_PRIV_SIZE;
  }
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c

index c53f787675685a7d52cac525d7c79b854ed31e7e..de3965c4e4aabecb5508999eb5ca99619da8d0f9 100644 (file)
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -273,7 +273,9 @@ intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu)
         return attr;
  }
  
-static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+static size_t
+intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+                       struct perf_evlist *evlist __maybe_unused)
  {
         return INTEL_PT_AUXTRACE_PRIV_SIZE;
  }
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c

index 99d127fe9c35e500ca74ea871fd4373fcea15d68..ac369c494036e0c7a020679d5d8e3d395aec96db 100644 (file)
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -626,12 +626,16 @@ static int __cmd_inject(struct perf_inject *inject)
         ret = perf_session__process_events(session);
  
         if (!file_out->is_pipe) {
-               if (inject->build_ids) {
+               if (inject->build_ids)
                         perf_header__set_feat(&session->header,
                                               HEADER_BUILD_ID);
-                       if (inject->have_auxtrace)
-                               dsos__hit_all(session);
-               }
+               /*
+                * Keep all buildids when there is unprocessed AUX data because
+                * it is not known which ones the AUX trace hits.
+                */
+               if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) &&
+                   inject->have_auxtrace && !inject->itrace_synth_opts.set)
+                       dsos__hit_all(session);
                 /*
                  * The AUX areas have been removed and replaced with
                  * synthesized hardware events, so clear the feature flag and
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

index 199fc31e3919c5743ca305f9eef5ef8c8fc86a70..1b9decd5fbf120277bee0b8e85f12b7673205071 100644 (file)
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -276,6 +276,7 @@ static int record__open(struct record *rec)
         struct perf_evlist *evlist = rec->evlist;
         struct perf_session *session = rec->session;
         struct record_opts *opts = &rec->opts;
+       struct perf_evsel_config_term *err_term;
         int rc = 0;
  
         perf_evlist__config(evlist, opts);
@@ -305,6 +306,14 @@ try_again:
                 goto out;
         }
  
+       if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
+               error("failed to set config \"%s\" on event %s with %d (%s)\n",
+                       err_term->val.drv_cfg, perf_evsel__name(pos), errno,
+                       strerror_r(errno, msg, sizeof(msg)));
+               rc = -1;
+               goto out;
+       }
+
         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
                                  opts->auxtrace_mmap_pages,
                                  opts->auxtrace_snapshot_mode) < 0) {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c

index 72b5deb4bd7961bc4fdb86c50689253b23660b0e..368d1e1561f749644105d19dcf2b8d1989860b9c 100644 (file)
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -92,7 +92,8 @@ static struct {
  
                 .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
                               PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
-                             PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
+                             PERF_OUTPUT_EVNAME | PERF_OUTPUT_ADDR |
+                              PERF_OUTPUT_IP |
                               PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
                               PERF_OUTPUT_PERIOD,
  
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile

index de89ec57436171ef74e5aa48135db8c2e9b4c0a0..405c1c1e2975f5eec867542c4bd2e53f5f0fdf1e 100644 (file)
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -433,6 +433,24 @@ endif
  grep-libs  = $(filter -l%,$(1))
  strip-libs = $(filter-out -l%,$(1))
  
+ifdef CSTRACE_PATH
+  ifeq (${IS_64_BIT}, 1)
+    CSTRACE_LNX = linux64
+  else
+    CSTRACE_LNX = linux
+  endif
+  ifeq (${DEBUG}, 1)
+    LIBCSTRACE = -lcstraced_c_api -lcstraced
+    CSTRACE_LIB_PATH = $(CSTRACE_PATH)/lib/$(CSTRACE_LNX)/dbg
+  else
+    LIBCSTRACE = -lcstraced_c_api -lcstraced
+    CSTRACE_LIB_PATH = $(CSTRACE_PATH)/lib/$(CSTRACE_LNX)/rel
+  endif
+  $(call detected,CSTRACE)
+  $(call detected_var,CSTRACE_PATH)
+  EXTLIBS += -L$(CSTRACE_LIB_PATH) $(LIBCSTRACE) -lstdc++
+endif
+
  ifdef NO_LIBPERL
    CFLAGS += -DNO_LIBPERL
  else
@@ -647,9 +665,14 @@ ifdef LIBBABELTRACE
  endif
  
  ifndef NO_AUXTRACE
-  ifeq ($(feature-get_cpuid), 0)
-    msg := $(warning Your gcc lacks the __get_cpuid() builtin, disables support for auxtrace/Intel PT, please install a newer gcc);
-    NO_AUXTRACE := 1
+  ifeq ($(ARCH),x86)
+    ifeq ($(feature-get_cpuid), 0)
+      msg := $(warning Your gcc lacks the __get_cpuid() builtin, disables support for auxtrace/Intel PT, please install a newer gcc);
+      NO_AUXTRACE := 1
+    else
+      $(call detected,CONFIG_AUXTRACE)
+      CFLAGS += -DHAVE_AUXTRACE_SUPPORT
+    endif
    else
      $(call detected,CONFIG_AUXTRACE)
      CFLAGS += -DHAVE_AUXTRACE_SUPPORT
diff --git a/tools/perf/scripts/python/cs-trace-disasm.py b/tools/perf/scripts/python/cs-trace-disasm.py

new file mode 100644 (file)

index 0000000..429d0d2
--- /dev/null
+++ b/tools/perf/scripts/python/cs-trace-disasm.py
@@ -0,0 +1,124 @@
+# perf script event handlers, generated by perf script -g python
+# Licensed under the terms of the GNU GPL License version 2
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the format files.  Those fields not available as handler params can
+# be retrieved using Python functions of the form common_*(context).
+# See the perf-trace-python Documentation for the list of available functions.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+                '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from subprocess import *
+from Core import *
+import re;
+
+from optparse import OptionParser
+
+#
+# Add options to specify vmlinux file and the objdump executable
+#
+parser = OptionParser()
+parser.add_option("-k", "--vmlinux", dest="vmlinux_name",
+                  help="path to vmlinux file")
+parser.add_option("-d", "--objdump", dest="objdump_name",
+                  help="name of objdump executable (in path)")
+(options, args) = parser.parse_args()
+
+if (options.objdump_name == None):
+        sys.exit("No objdump executable specified - use -d or --objdump option")
+
+# initialize global dicts and regular expression
+
+build_ids = dict();
+mmaps = dict();
+disasm_cache = dict();
+disasm_re = re.compile("^\s*([0-9a-fA-F]+):")
+
+cache_size = 16*1024
+
+def trace_begin():
+        # Fill build_ids (dso name -> file to disassemble) and mmaps (.so path -> address).
+        cmd_output = check_output(["perf", "buildid-list"]).split('\n');
+        bid_re = re.compile("([a-fA-F0-9]+)[ \t]([^ \n]+)")
+        for line in cmd_output:
+                m = bid_re.search(line)
+                if (m != None) :
+                        build_ids[m.group(2)] = os.environ['PERF_BUILDID_DIR'] +  \
+                                                m.group(2) + "/" + m.group(1);
+        # Kernel samples: use the user-supplied vmlinux if listed, otherwise drop the entry.
+        if ((options.vmlinux_name != None) and ("[kernel.kallsyms]" in build_ids)):
+                build_ids['[kernel.kallsyms]'] = options.vmlinux_name;
+        else:
+                build_ids.pop('[kernel.kallsyms]', None)
+        # Record the first hex address printed on each matching PERF_RECORD_MMAP2 line.
+        mmap_re = re.compile("PERF_RECORD_MMAP2 -?[0-9]+/[0-9]+: \[(0x[0-9a-fA-F]+).*:\s.*\s(.*.so)")
+        cmd_output= check_output("perf script --show-mmap-events | fgrep PERF_RECORD_MMAP2",shell=True).split('\n')
+        for line in cmd_output:
+                m = mmap_re.search(line)
+                if (m != None) :
+                        mmaps[m.group(2)] = int(m.group(1),0)
+
+
+
+def trace_end():
+        pass
+
+def process_event(t):
+        global cache_size
+        global options 
+
+        sample = t['sample']
+        dso = t['dso']
+
+        # don't let the cache get too big, but don't bother with a fancy replacement policy
+        # just clear it when it hits max size
+
+        if (len(disasm_cache) > cache_size):
+                disasm_cache.clear();
+
+        cpu = format(sample['cpu'], "d");
+        addr_range = format(sample['ip'],"x")  + ":" + format(sample['addr'],"x");
+
+        try:
+                disasm_output = disasm_cache[addr_range];
+        except:
+                try:
+                        fname = build_ids[dso];
+                except KeyError:
+                        if (dso == '[kernel.kallsyms]'):
+                                return;
+                        fname = dso;
+
+                if (dso in mmaps):
+                        offset = mmaps[dso];
+                        disasm = [options.objdump_name,"-d","-z", "--adjust-vma="+format(offset,"#x"),"--start-address="+format(sample['ip'],"#x"),"--stop-address="+format(sample['addr'],"#x"), fname]
+                else:
+                        offset = 0
+                        disasm = [options.objdump_name,"-d","-z", "--start-address="+format(sample['ip'],"#x"),"--stop-address="+format(sample['addr'],"#x"),fname] 
+                disasm_output = check_output(disasm).split('\n')
+                disasm_cache[addr_range] = disasm_output;
+
+        print "FILE: %s\tCPU: %s" % (dso, cpu);
+        for line in disasm_output:
+                m = disasm_re.search(line)
+                if (m != None) :
+                        try:
+                                print "\t",line
+                        except:
+                                exit(1);
+                else:
+                        continue;
+
+def trace_unhandled(event_name, context, event_fields_dict):
+               print ' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())])
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+        print "print_header"
+       print "%-20s %5u %05u.%09u %8u %-20s " % \
+       (event_name, cpu, secs, nsecs, pid, comm),
diff --git a/tools/perf/scripts/python/cs-trace-ranges.py b/tools/perf/scripts/python/cs-trace-ranges.py

new file mode 100644 (file)

index 0000000..c8edacb
--- /dev/null
+++ b/tools/perf/scripts/python/cs-trace-ranges.py
@@ -0,0 +1,44 @@
+#
+# Copyright(C) 2016 Linaro Limited. All rights reserved.
+# Author: Tor Jeremiassen <tor.jeremiassen@linaro.org>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 as published by
+# the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+                '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+
+def trace_begin():
+        pass;
+
+def trace_end():
+        pass
+
+def process_event(t):
+
+        sample = t['sample']
+
+        print "range:",format(sample['ip'],"x"),"-",format(sample['addr'],"x")
+
+def trace_unhandled(event_name, context, event_fields_dict):
+               print ' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())])
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+        print "print_header"
+       print "%-20s %5u %05u.%09u %8u %-20s " % \
+       (event_name, cpu, secs, nsecs, pid, comm),
diff --git a/tools/perf/util/Build b/tools/perf/util/Build

index 591b3fe3ed49acd8b31f701a78b3302a84786546..a8d806503a45917c603569663596b83e506a031d 100644 (file)
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -84,6 +84,8 @@ libperf-$(CONFIG_AUXTRACE) += auxtrace.o
  libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
  libperf-$(CONFIG_AUXTRACE) += intel-pt.o
  libperf-$(CONFIG_AUXTRACE) += intel-bts.o
+libperf-$(CONFIG_AUXTRACE) += cs-etm.o
+libperf-$(CONFIG_AUXTRACE) += cs-etm-decoder/
  libperf-y += parse-branch-options.o
  libperf-y += parse-regs-options.o
  
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c

index 7f10430af39c3ac9e47f4da1aca93e37a8f9cf87..67551225764e7677031865e3bbffc7559808f63c 100644 (file)
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -49,6 +49,7 @@
  
  #include "intel-pt.h"
  #include "intel-bts.h"
+#include "cs-etm.h"
  
  int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
                         struct auxtrace_mmap_params *mp,
@@ -478,10 +479,11 @@ void auxtrace_heap__pop(struct auxtrace_heap *heap)
                          heap_array[last].ordinal);
  }
  
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist)
  {
         if (itr)
-               return itr->info_priv_size(itr);
+               return itr->info_priv_size(itr, evlist);
         return 0;
  }
  
@@ -852,7 +854,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
         int err;
  
         pr_debug2("Synthesizing auxtrace information\n");
-       priv_size = auxtrace_record__info_priv_size(itr);
+       priv_size = auxtrace_record__info_priv_size(itr, session->evlist);
         ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
         if (!ev)
                 return -ENOMEM;
@@ -891,6 +893,8 @@ int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused,
                 return intel_pt_process_auxtrace_info(event, session);
         case PERF_AUXTRACE_INTEL_BTS:
                 return intel_bts_process_auxtrace_info(event, session);
+       case PERF_AUXTRACE_CS_ETM:
+               return cs_etm__process_auxtrace_info(event, session);
         case PERF_AUXTRACE_UNKNOWN:
         default:
                 return -EINVAL;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h

index b86f90db1352a6c8635e3ea5d02aa3c21bccc323..adb53e7bcabf624679b06922593dd8bfb5263189 100644 (file)
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -41,6 +41,7 @@ enum auxtrace_type {
         PERF_AUXTRACE_UNKNOWN,
         PERF_AUXTRACE_INTEL_PT,
         PERF_AUXTRACE_INTEL_BTS,
+       PERF_AUXTRACE_CS_ETM,
  };
  
  enum itrace_period_type {
@@ -293,7 +294,8 @@ struct auxtrace_record {
         int (*recording_options)(struct auxtrace_record *itr,
                                  struct perf_evlist *evlist,
                                  struct record_opts *opts);
-       size_t (*info_priv_size)(struct auxtrace_record *itr);
+       size_t (*info_priv_size)(struct auxtrace_record *itr,
+                                struct perf_evlist *evlist);
         int (*info_fill)(struct auxtrace_record *itr,
                          struct perf_session *session,
                          struct auxtrace_info_event *auxtrace_info,
@@ -429,7 +431,8 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
  int auxtrace_record__options(struct auxtrace_record *itr,
                              struct perf_evlist *evlist,
                              struct record_opts *opts);
-size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr,
+                                      struct perf_evlist *evlist);
  int auxtrace_record__info_fill(struct auxtrace_record *itr,
                                struct perf_session *session,
                                struct auxtrace_info_event *auxtrace_info,
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c

index 6a7e273a514a642b30a477c3119696dc7fa09975..52d320e922e3535ec5d7b064d1f29e69d5bc41da 100644 (file)
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -145,7 +145,7 @@ static int asnprintf(char **strp, size_t size, const char *fmt, ...)
         return ret;
  }
  
-static char *build_id__filename(const char *sbuild_id, char *bf, size_t size)
+char *build_id__filename(const char *sbuild_id, char *bf, size_t size)
  {
         char *tmp = bf;
         int ret = asnprintf(&bf, size, "%s/.build-id/%.2s/%s", buildid_dir,
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h

index 27a14a8a945beb8eec9cc389ca7d6545b44a2ac4..eb2c2b6e1dab1f91f831d8e8d1d2b7da2a0851b9 100644 (file)
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -11,6 +11,7 @@
  extern struct perf_tool build_id__mark_dso_hit_ops;
  struct dso;
  
+char *build_id__filename(const char *sbuild_id, char *bf, size_t size);
  int build_id__sprintf(const u8 *build_id, int len, char *bf);
  int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
  int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c

index 10af1e7524fbd24de791c38fa23c7d730d54a193..6523e1a8eea5c7dfd952a5140c52501b4f645dd6 100644 (file)
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -7,6 +7,10 @@
  #include <stdlib.h>
  #include "asm/bug.h"
  
+static int max_cpu_num;
+static int max_node_num;
+static int *cpunode_map;
+
  static struct cpu_map *cpu_map__default_new(void)
  {
         struct cpu_map *cpus;
@@ -435,6 +439,32 @@ out:
                 pr_err("Failed to read max nodes, using default of %d\n", max_node_num);
  }
  
+int cpu__max_node(void)
+{
+       if (unlikely(!max_node_num))
+               set_max_node_num();
+
+       return max_node_num;
+}
+
+int cpu__max_cpu(void)
+{
+       if (unlikely(!max_cpu_num))
+               set_max_cpu_num();
+
+       return max_cpu_num;
+}
+
+int cpu__get_node(int cpu)
+{
+       if (unlikely(cpunode_map == NULL)) {
+               pr_debug("cpu_map not initialized\n");
+               return -1;
+       }
+
+       return cpunode_map[cpu];
+}
+
  static int init_cpunode_map(void)
  {
         int i;
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h

index 85f7772457fa091655d62212067f2edddf6e55ae..d6184ba929b65e9426ef095194d69419818fcea5 100644 (file)
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -56,37 +56,11 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
         return map ? map->map[0] == -1 : true;
  }
  
-int max_cpu_num;
-int max_node_num;
-int *cpunode_map;
-
  int cpu__setup_cpunode_map(void);
  
-static inline int cpu__max_node(void)
-{
-       if (unlikely(!max_node_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_node_num;
-}
-
-static inline int cpu__max_cpu(void)
-{
-       if (unlikely(!max_cpu_num))
-               pr_debug("cpu_map not initialized\n");
-
-       return max_cpu_num;
-}
-
-static inline int cpu__get_node(int cpu)
-{
-       if (unlikely(cpunode_map == NULL)) {
-               pr_debug("cpu_map not initialized\n");
-               return -1;
-       }
-
-       return cpunode_map[cpu];
-}
+int cpu__max_node(void);
+int cpu__max_cpu(void);
+int cpu__get_node(int cpu);
  
  int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
                        int (*f)(struct cpu_map *map, int cpu, void *data),
diff --git a/tools/perf/util/cs-etm-decoder/Build b/tools/perf/util/cs-etm-decoder/Build

new file mode 100644 (file)

index 0000000..d4896fe
--- /dev/null
+++ b/tools/perf/util/cs-etm-decoder/Build
@@ -0,0 +1,7 @@
+ifeq ($(CSTRACE_PATH),)
+libperf-$(CONFIG_AUXTRACE) += cs-etm-decoder-stub.o
+else
+CFLAGS_cs-etm-decoder.o += -I$(CSTRACE_PATH)/include
+libperf-$(CONFIG_AUXTRACE) += cs-etm-decoder.o
+endif
+
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder-stub.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder-stub.c

new file mode 100644 (file)

index 0000000..38f2b75
--- /dev/null
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder-stub.c
@@ -0,0 +1,91 @@
+/*
+ *
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Tor Jeremiassen <tor.jeremiassen@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+
+#include "cs-etm-decoder.h"
+#include "../util.h"
+
+
+struct cs_etm_decoder
+{
+        void *state;
+        int dummy;
+};
+
+int cs_etm_decoder__flush(struct cs_etm_decoder *decoder)
+{
+        (void) decoder;
+        return -1;
+}
+
+int cs_etm_decoder__add_bin_file(struct cs_etm_decoder *decoder, uint64_t offset, uint64_t address, uint64_t len, const char *fname)
+{
+        (void) decoder;
+        (void) offset;
+        (void) address;
+        (void) len;
+        (void) fname;
+        return -1;
+}
+
+const struct cs_etm_state *cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder,
+                                       uint64_t indx,
+                                       const uint8_t *buf,
+                                       size_t len,
+                                       size_t *consumed)
+{
+        (void) decoder;
+        (void) indx;
+        (void) buf;
+        (void) len;
+        (void) consumed;
+        return NULL;
+}
+
+int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder, uint64_t address, uint64_t len, cs_etm_mem_cb_type cb_func)
+{
+        (void) decoder;
+        (void) address;
+        (void) len;
+        (void) cb_func;
+        return -1;
+}
+
+int cs_etm_decoder__get_packet(struct cs_etm_decoder *decoder, 
+                               struct cs_etm_packet *packet)
+{
+        (void) decoder;
+        (void) packet;
+        return -1;
+}
+
+struct cs_etm_decoder *cs_etm_decoder__new(uint32_t num_cpu, struct cs_etm_decoder_params *d_params, struct cs_etm_trace_params t_params[])
+{
+        (void) num_cpu;
+        (void) d_params;
+        (void) t_params;
+        return NULL;
+}
+
+
+void cs_etm_decoder__free(struct cs_etm_decoder *decoder)
+{
+        (void) decoder;
+        return;
+}
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c

new file mode 100644 (file)

index 0000000..c6f23d6
--- /dev/null
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -0,0 +1,503 @@
+/*
+ *
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Tor Jeremiassen <tor.jeremiassen@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/err.h>
+#include <stdlib.h>
+
+#include "../cs-etm.h"
+#include "cs-etm-decoder.h"
+#include "../util.h"
+#include "../util/intlist.h"
+
+#include "c_api/opencsd_c_api.h"
+#include "ocsd_if_types.h"
+#include "etmv4/trc_pkt_types_etmv4.h"
+
+#define MAX_BUFFER 1024 
+
+
+
+struct cs_etm_decoder
+{
+        struct cs_etm_state     state;
+        dcd_tree_handle_t       dcd_tree;
+        void (*packet_printer)(const char *);
+        cs_etm_mem_cb_type      mem_access;
+        ocsd_datapath_resp_t   prev_return;
+        size_t                  prev_processed;
+        bool                    trace_on;
+        bool                    discontinuity;
+        struct cs_etm_packet    packet_buffer[MAX_BUFFER];
+        uint32_t                packet_count;
+        uint32_t                head;
+        uint32_t                tail;
+        uint32_t                end_tail;
+};
+
+static uint32_t cs_etm_decoder__mem_access(const void *context,
+                                           const ocsd_vaddr_t address,
+                                           const ocsd_mem_space_acc_t mem_space,
+                                           const uint32_t req_size,
+                                           uint8_t *buffer)
+{
+        struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
+        (void) mem_space;
+
+        return decoder->mem_access(decoder->state.data,address,req_size,buffer);
+}
+
+static int cs_etm_decoder__gen_etmv4_config(struct cs_etm_trace_params *params,
+                                           ocsd_etmv4_cfg *config)
+{
+        config->reg_configr = params->reg_configr;
+        config->reg_traceidr = params->reg_traceidr;
+        config->reg_idr0 = params->reg_idr0;
+        config->reg_idr1 = params->reg_idr1;
+        config->reg_idr2 = params->reg_idr2;
+        config->reg_idr8 = params->reg_idr8;
+
+        config->reg_idr9 = 0;
+        config->reg_idr10 = 0;
+        config->reg_idr11 = 0;
+        config->reg_idr12 = 0;
+        config->reg_idr13 = 0;
+        config->arch_ver = ARCH_V8;
+        config->core_prof = profile_CortexA;
+
+        return 0;
+}
+
+static int cs_etm_decoder__flush_packet(struct cs_etm_decoder *decoder)
+{
+        int err = 0;
+
+        if (decoder == NULL) return -1;
+
+        if (decoder->packet_count >= 31) return -1;
+
+        if (decoder->tail != decoder->end_tail) {
+                decoder->tail = (decoder->tail + 1) & (MAX_BUFFER - 1);
+                decoder->packet_count++;
+        }
+
+        return err;
+}
+
+int cs_etm_decoder__flush(struct cs_etm_decoder *decoder)
+{
+        return cs_etm_decoder__flush_packet(decoder);
+}
+
+static int cs_etm_decoder__buffer_packet(struct cs_etm_decoder *decoder,
+                                        const ocsd_generic_trace_elem *elem,
+                                        const uint8_t trace_chan_id,
+                                        enum cs_etm_sample_type sample_type)
+{
+        int err = 0;
+        uint32_t et = 0;
+        struct int_node *inode = NULL;
+        /* Store one element in the slot at end_tail; returns 0 on success, non-zero on error. */
+        if (decoder == NULL) return -1;
+
+        if (decoder->packet_count >= 31) return -1;
+
+        err = cs_etm_decoder__flush_packet(decoder);
+
+        if (err) return err;
+
+        et = decoder->end_tail;
+        /* Search the RB tree for the cpu associated with this traceID */
+        inode = intlist__find(traceid_list, trace_chan_id);
+        if (!inode)
+                return -1; /* PTR_ERR(NULL) is 0 and would report success */
+
+        decoder->packet_buffer[et].sample_type = sample_type;
+        decoder->packet_buffer[et].start_addr = elem->st_addr;
+        decoder->packet_buffer[et].end_addr   = elem->en_addr;
+        decoder->packet_buffer[et].exc        = false;
+        decoder->packet_buffer[et].exc_ret    = false;
+        decoder->packet_buffer[et].cpu        = *((int*)inode->priv);
+
+        et = (et + 1) & (MAX_BUFFER - 1);
+
+        decoder->end_tail = et;
+
+        return err;
+}
+
+static int cs_etm_decoder__mark_exception(struct cs_etm_decoder *decoder)
+{
+        int err = 0;
+
+        if (decoder == NULL) return -1;
+  
+        decoder->packet_buffer[decoder->end_tail].exc = true;
+
+        return err;
+}
+
+static int cs_etm_decoder__mark_exception_return(struct cs_etm_decoder *decoder)
+{
+        int err = 0;
+
+        if (decoder == NULL) return -1;
+  
+        decoder->packet_buffer[decoder->end_tail].exc_ret = true;
+        
+        return err;
+}
+
+/*
+ * Generic trace element callback; it is handed to
+ * ocsd_dt_set_gen_elem_outfn() with the decoder as its context.
+ *
+ * - NO_SYNC / TRACE_ON toggle decoder->trace_on.
+ * - INSTR_RANGE buffers a CS_ETM_RANGE packet and answers OCSD_RESP_WAIT.
+ * - EXCEPTION / EXCEPTION_RET flag the slot at decoder->end_tail.
+ * - Every other element type is ignored.
+ *
+ * decoder->state.err is cleared on every invocation.  The return value of
+ * cs_etm_decoder__buffer_packet() is not examined.
+ */
+static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer(
+                        const void *context,
+                        const ocsd_trc_index_t indx,
+                        const uint8_t trace_chan_id,
+                        const ocsd_generic_trace_elem *elem)
+{
+        struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
+        ocsd_datapath_resp_t resp = OCSD_RESP_CONT;
+
+        (void) indx;
+
+        switch (elem->elem_type) {
+        case OCSD_GEN_TRC_ELEM_NO_SYNC:
+                decoder->trace_on = false;
+                break;
+        case OCSD_GEN_TRC_ELEM_TRACE_ON:
+                decoder->trace_on = true;
+                break;
+        case OCSD_GEN_TRC_ELEM_INSTR_RANGE:
+                cs_etm_decoder__buffer_packet(decoder, elem,
+                                              trace_chan_id, CS_ETM_RANGE);
+                resp = OCSD_RESP_WAIT;
+                break;
+        case OCSD_GEN_TRC_ELEM_EXCEPTION:
+                cs_etm_decoder__mark_exception(decoder);
+                break;
+        case OCSD_GEN_TRC_ELEM_EXCEPTION_RET:
+                cs_etm_decoder__mark_exception_return(decoder);
+                break;
+        default:
+                /* unknown, context, end-of-trace, timestamp, ... : no action */
+                break;
+        }
+
+        decoder->state.err = 0;
+
+        return resp;
+}
+
+/*
+ * ETMv4 instruction packet callback used in CS_ETM_OPERATION_PRINT mode.
+ *
+ * Builds a text line for the packet and passes it to
+ * decoder->packet_printer:
+ *   - OCSD_OP_DATA: "<indx>: " followed by the ocsd_pkt_str() rendering;
+ *   - OCSD_OP_EOT:  the fixed end-of-trace banner (the index prefix is
+ *                   overwritten);
+ *   - other ops:    only the "<indx>: " prefix.
+ *
+ * Returns OCSD_RESP_CONT, or OCSD_RESP_FATAL_INVALID_PARAM when
+ * ocsd_pkt_str() fails.
+ */
+static ocsd_datapath_resp_t cs_etm_decoder__etmv4i_packet_printer(
+        const void *context,
+        const ocsd_datapath_op_t op,
+        const ocsd_trc_index_t indx,
+        const ocsd_etmv4_i_pkt *pkt)
+{
+        /* An enum constant keeps packet_str a fixed-size array, not a VLA. */
+        enum { PACKET_STR_LEN = 1024 };
+        ocsd_datapath_resp_t ret = OCSD_RESP_CONT;
+        char packet_str[PACKET_STR_LEN];
+        size_t offset;
+        struct cs_etm_decoder *decoder = (struct cs_etm_decoder *) context;
+
+        /* Bounded formatting: never write past the end of packet_str. */
+        snprintf(packet_str, sizeof(packet_str), "%ld: ", (long int) indx);
+        offset = strlen(packet_str);
+
+        switch (op) {
+        case OCSD_OP_DATA:
+                if (ocsd_pkt_str(OCSD_PROTOCOL_ETMV4I,
+                                  (void *)pkt,
+                                  packet_str + offset,
+                                  PACKET_STR_LEN - offset) != OCSD_OK)
+                        ret = OCSD_RESP_FATAL_INVALID_PARAM;
+                break;
+        case OCSD_OP_EOT:
+                snprintf(packet_str, sizeof(packet_str),
+                         "**** END OF TRACE ****\n");
+                break;
+        case OCSD_OP_FLUSH:
+        case OCSD_OP_RESET:
+        default:
+                break;
+        }
+
+        decoder->packet_printer(packet_str);
+
+        return ret;
+}
+                                            
+/*
+ * Attach an ETMv4 instruction packet processor to the decode tree, with
+ * cs_etm_decoder__etmv4i_packet_printer() as its packet callback.
+ *
+ * Returns -1 when no packet_printer was supplied or the trace configuration
+ * cannot be generated; otherwise the result of
+ * ocsd_dt_create_etmv4i_pkt_proc().
+ */
+static int cs_etm_decoder__create_etmv4i_packet_printer(struct cs_etm_decoder_params *d_params,
+                                                 struct cs_etm_trace_params *t_params,
+                                                 struct cs_etm_decoder *decoder)
+{
+        ocsd_etmv4_cfg trace_config;
+
+        if (!d_params->packet_printer)
+                return -1;
+
+        if (cs_etm_decoder__gen_etmv4_config(t_params, &trace_config) != 0)
+                return -1;
+
+        decoder->packet_printer = d_params->packet_printer;
+
+        return ocsd_dt_create_etmv4i_pkt_proc(decoder->dcd_tree,
+                                               &trace_config,
+                                               cs_etm_decoder__etmv4i_packet_printer,
+                                               decoder);
+}
+
+/*
+ * Attach a full ETMv4 instruction decoder to the decode tree and route its
+ * generic trace elements to cs_etm_decoder__gen_trace_elem_printer().
+ *
+ * Returns -1 when the trace configuration or the decoder cannot be created;
+ * otherwise the result of ocsd_dt_set_gen_elem_outfn().
+ */
+static int cs_etm_decoder__create_etmv4i_packet_decoder(struct cs_etm_decoder_params *d_params,
+                                                 struct cs_etm_trace_params *t_params,
+                                                 struct cs_etm_decoder *decoder)
+{
+        ocsd_etmv4_cfg trace_config;
+
+        /* Recorded unconditionally, even if a later step fails. */
+        decoder->packet_printer = d_params->packet_printer;
+
+        if (cs_etm_decoder__gen_etmv4_config(t_params, &trace_config) != 0)
+                return -1;
+
+        if (ocsd_dt_create_etmv4i_decoder(decoder->dcd_tree,
+                                          &trace_config) != OCSD_OK)
+                return -1;
+
+        return ocsd_dt_set_gen_elem_outfn(decoder->dcd_tree,
+                                          cs_etm_decoder__gen_trace_elem_printer,
+                                          decoder);
+}
+
+/*
+ * Remember cb_func in the decoder and register
+ * cs_etm_decoder__mem_access() with the decode tree for the inclusive
+ * address range [address, address + len - 1], for any memory space.
+ *
+ * Returns the result of ocsd_dt_add_callback_mem_acc().
+ */
+int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *decoder, uint64_t address, uint64_t len, cs_etm_mem_cb_type cb_func)
+{
+        const uint64_t last = address + len - 1;
+
+        decoder->mem_access = cb_func;
+
+        return ocsd_dt_add_callback_mem_acc(decoder->dcd_tree,
+                                             address,
+                                             last,
+                                             OCSD_MEM_SPACE_ANY,
+                                             cs_etm_decoder__mem_access,
+                                             decoder);
+}
+
+
+/*
+ * Register one region of the file fname as a memory accessor on the decode
+ * tree: len bytes starting at file offset `offset`, mapped at `address`,
+ * for any memory space.
+ *
+ * Returns -1 when decoder or its decode tree is NULL; otherwise the result
+ * of ocsd_dt_add_binfile_region_mem_acc().
+ */
+int cs_etm_decoder__add_bin_file(struct cs_etm_decoder *decoder, uint64_t offset, uint64_t address, uint64_t len, const char *fname)
+{
+        file_mem_region_t region;
+
+        if (!decoder || !decoder->dcd_tree)
+                return -1;
+
+        region.file_offset   = offset;
+        region.start_address = address;
+        region.region_size   = len;
+
+        return ocsd_dt_add_binfile_region_mem_acc(decoder->dcd_tree,
+                                                  &region,
+                                                  1,
+                                                  OCSD_MEM_SPACE_ANY,
+                                                  fname);
+}
+
+/*
+ * Feed a block of raw trace data to the decode tree.
+ *
+ * @decoder:  decoder instance
+ * @indx:     trace index of buf[0]
+ * @buf:      raw trace bytes
+ * @len:      number of bytes in buf
+ * @consumed: out, number of bytes the caller may advance by
+ *
+ * Nothing is processed while packets are still queued
+ * (decoder->packet_count > 0); *consumed is then 0 and state.err is 0.
+ *
+ * When the library answers WAIT, *consumed is reported as 0 and the amount
+ * actually processed is remembered in decoder->prev_processed.  A later
+ * call flushes the tree and, once it no longer answers WAIT, reports the
+ * remembered amount instead.
+ *
+ * Returns a pointer to decoder->state; state.err is 0 on success and -1
+ * when the datapath response is neither CONT nor WAIT.
+ */
+const struct cs_etm_state *cs_etm_decoder__process_data_block(struct cs_etm_decoder *decoder,
+                                       uint64_t indx,
+                                       const uint8_t *buf,
+                                       size_t len,
+                                       size_t *consumed)
+{
+        int ret = 0;
+        ocsd_datapath_resp_t dp_ret = decoder->prev_return;
+        size_t processed = 0;
+
+        if (decoder->packet_count > 0) {
+                decoder->state.err = ret;
+                *consumed = processed;
+                return &(decoder->state);
+        }
+
+        while ((processed < len) && (0 == ret)) {
+
+                if (OCSD_DATA_RESP_IS_CONT(dp_ret)) {
+                        /*
+                         * Start from zero so that a call which fails
+                         * without storing a count cannot add garbage to
+                         * processed.
+                         */
+                        uint32_t count = 0;
+
+                        dp_ret = ocsd_dt_process_data(decoder->dcd_tree,
+                                                       OCSD_OP_DATA,
+                                                       indx+processed,
+                                                       len - processed,
+                                                       &buf[processed],
+                                                       &count);
+                        processed += count;
+
+                } else if (OCSD_DATA_RESP_IS_WAIT(dp_ret)) {
+                        /* Flush, then let the caller drain the packets. */
+                        dp_ret = ocsd_dt_process_data(decoder->dcd_tree,
+                                                       OCSD_OP_FLUSH,
+                                                       0,
+                                                       0,
+                                                       NULL,
+                                                       NULL);
+                        break;
+                } else {
+                        ret = -1;
+                }
+        }
+        if (OCSD_DATA_RESP_IS_WAIT(dp_ret)) {
+                /* Hold back the byte count until the WAIT has cleared. */
+                if (OCSD_DATA_RESP_IS_CONT(decoder->prev_return)) {
+                        decoder->prev_processed = processed;
+                }
+                processed = 0;
+        } else if (OCSD_DATA_RESP_IS_WAIT(decoder->prev_return)) {
+                /* WAIT cleared: report the count that was held back. */
+                processed = decoder->prev_processed;
+                decoder->prev_processed = 0;
+        }
+        *consumed = processed;
+        decoder->prev_return = dp_ret;
+        decoder->state.err = ret;
+        return &(decoder->state);
+}
+
+/*
+ * Pop the oldest queued packet from the decoder's ring buffer into *packet.
+ *
+ * Returns 0 on success, -1 when the queue is empty or packet is NULL.
+ */
+int cs_etm_decoder__get_packet(struct cs_etm_decoder *decoder,
+                               struct cs_etm_packet *packet)
+{
+        if (decoder->packet_count == 0 || !packet)
+                return -1;
+
+        *packet = decoder->packet_buffer[decoder->head];
+
+        /* The index mask relies on MAX_BUFFER being a power of two. */
+        decoder->head = (decoder->head + 1) & (MAX_BUFFER - 1);
+        decoder->packet_count--;
+
+        return 0;
+}
+
+/*
+ * Reset the packet ring buffer: zero all indices and the packet count, and
+ * fill every slot with recognisable poison values.
+ */
+static void cs_etm_decoder__clear_buffer(struct cs_etm_decoder *decoder)
+{
+        unsigned slot;
+
+        decoder->packet_count = 0;
+        decoder->head = 0;
+        decoder->tail = 0;
+        decoder->end_tail = 0;
+
+        for (slot = 0; slot < MAX_BUFFER; slot++) {
+                struct cs_etm_packet *pkt = &decoder->packet_buffer[slot];
+
+                pkt->start_addr = 0xdeadbeefdeadbeefUL;
+                pkt->end_addr   = 0xdeadbeefdeadbeefUL;
+                pkt->exc        = false;
+                pkt->exc_ret    = false;
+                pkt->cpu        = INT_MIN;
+        }
+}
+
+/*
+ * Allocate a decoder and build its decode tree.
+ *
+ * @num_cpu:  number of entries in t_params
+ * @d_params: decoder-wide settings (operation, framing flags, private data)
+ * @t_params: per-trace-source register settings; one decoder element is
+ *            created per entry
+ *
+ * Only CS_ETM_PROTO_ETMV4i is accepted as protocol, and only
+ * CS_ETM_OPERATION_PRINT / CS_ETM_OPERATION_DECODE as operation.
+ *
+ * Returns the new decoder, or NULL on any failure (everything allocated so
+ * far is released).
+ */
+struct cs_etm_decoder *cs_etm_decoder__new(uint32_t num_cpu, struct cs_etm_decoder_params *d_params, struct cs_etm_trace_params t_params[])
+{
+        struct cs_etm_decoder *decoder;
+        ocsd_dcd_tree_src_t format;
+        uint32_t flags = 0;
+        size_t i;
+
+        if (!t_params || !d_params)
+                return NULL;
+
+        decoder = zalloc(sizeof(struct cs_etm_decoder));
+        if (!decoder)
+                return NULL;
+
+        decoder->state.data = d_params->data;
+        decoder->prev_return = OCSD_RESP_CONT;
+        cs_etm_decoder__clear_buffer(decoder);
+
+        if (d_params->formatted)
+                format = OCSD_TRC_SRC_FRAME_FORMATTED;
+        else
+                format = OCSD_TRC_SRC_SINGLE;
+
+        if (d_params->fsyncs)
+                flags |= OCSD_DFRMTR_HAS_FSYNCS;
+        if (d_params->hsyncs)
+                flags |= OCSD_DFRMTR_HAS_HSYNCS;
+        if (d_params->frame_aligned)
+                flags |= OCSD_DFRMTR_FRAME_MEM_ALIGN;
+
+        /* Create decode tree for the data source */
+        decoder->dcd_tree = ocsd_create_dcd_tree(format, flags);
+        if (decoder->dcd_tree == 0)
+                goto err_free_decoder;
+
+        for (i = 0; i < num_cpu; ++i) {
+                int ret;
+
+                if (t_params[i].protocol != CS_ETM_PROTO_ETMV4i)
+                        goto err_free_decoder_tree;
+
+                switch (d_params->operation) {
+                case CS_ETM_OPERATION_PRINT:
+                        ret = cs_etm_decoder__create_etmv4i_packet_printer(d_params, &t_params[i], decoder);
+                        break;
+                case CS_ETM_OPERATION_DECODE:
+                        ret = cs_etm_decoder__create_etmv4i_packet_decoder(d_params, &t_params[i], decoder);
+                        break;
+                default:
+                        ret = -CS_ETM_ERR_PARAM;
+                        break;
+                }
+
+                if (ret != 0)
+                        goto err_free_decoder_tree;
+        }
+
+        return decoder;
+
+err_free_decoder_tree:
+        ocsd_destroy_dcd_tree(decoder->dcd_tree);
+err_free_decoder:
+        free(decoder);
+        return NULL;
+}
+
+
+/*
+ * Destroy the decode tree owned by decoder and release the decoder itself.
+ * A NULL decoder is accepted and ignored.
+ */
+void cs_etm_decoder__free(struct cs_etm_decoder *decoder)
+{
+        if (!decoder)
+                return;
+
+        ocsd_destroy_dcd_tree(decoder->dcd_tree);
+        decoder->dcd_tree = NULL;
+        free(decoder);
+}
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h

new file mode 100644 (file)

index 0000000..38c5ae8
--- /dev/null
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Tor Jeremiassen <tor.jeremiassen@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef INCLUDE__CS_ETM_DECODER_H__
+#define INCLUDE__CS_ETM_DECODER_H__
+
+#include <linux/types.h>
+#include <stdio.h>
+
+struct cs_etm_decoder;
+
+/*
+ * Describes one block of raw trace data handed to the decoder.
+ * Filled in by cs_etm__get_trace() from an auxtrace buffer.
+ */
+struct cs_etm_buffer {
+        const unsigned char *buf;       /* start of the raw trace bytes */
+        size_t  len;                    /* number of bytes at buf; 0 means no data */
+        uint64_t offset;                /* copied from the auxtrace buffer's offset */
+        //bool    consecutive;
+        uint64_t        ref_timestamp;  /* copied from the auxtrace buffer's reference */
+        //uint64_t        trace_nr;
+};
+
+/* Kind of decoded packet; defined as bit flags. */
+enum cs_etm_sample_type {
+        CS_ETM_RANGE      = 1 << 0,     /* packet buffered for an instruction range element */
+};
+
+/*
+ * Decoder status returned by cs_etm_decoder__process_data_block().
+ */
+struct cs_etm_state {
+        int err;                /* 0 on success, -1 on a datapath error */
+        void *data;             /* caller's private pointer, copied from cs_etm_decoder_params.data */
+        unsigned isa;           /* NOTE(review): not written by the decoder code shown with this header */
+        uint64_t start;         /* NOTE(review): as above */
+        uint64_t end;           /* NOTE(review): as above */
+        uint64_t timestamp;     /* NOTE(review): as above */
+};
+
+/*
+ * One decoded packet, as returned by cs_etm_decoder__get_packet().
+ */
+struct cs_etm_packet {
+        enum cs_etm_sample_type sample_type;
+        uint64_t start_addr;    /* taken from the trace element's st_addr */
+        uint64_t end_addr;      /* taken from the trace element's en_addr */
+        bool     exc;           /* set when an exception element is seen */
+        bool     exc_ret;       /* set when an exception-return element is seen */
+        int cpu;                /* INT_MIN while the slot is unused */
+};
+
+
+struct cs_etm_queue;
+/*
+ * Memory access callback installed with cs_etm_decoder__add_mem_access_cb().
+ * Presumably (queue, address, size, destination buffer) returning the number
+ * of bytes read - TODO confirm against the implementation.
+ */
+typedef uint32_t (*cs_etm_mem_cb_type)(struct cs_etm_queue *, uint64_t, size_t, uint8_t *);
+
+/*
+ * Per-trace-source configuration passed to cs_etm_decoder__new(), one entry
+ * per CPU.  The reg_* values are the corresponding ETMv4 registers as
+ * recorded in the session metadata.
+ */
+struct cs_etm_trace_params {
+        void *etmv4i_packet_handler;
+        uint32_t reg_idr0;
+        uint32_t reg_idr1;
+        uint32_t reg_idr2;
+        uint32_t reg_idr8;
+        uint32_t reg_configr;
+        uint32_t reg_traceidr;
+        int  protocol;          /* one of CS_ETM_PROTO_* */
+};
+
+/*
+ * Decoder-wide settings passed to cs_etm_decoder__new().
+ */
+struct cs_etm_decoder_params {
+        int  operation;                         /* one of CS_ETM_OPERATION_* */
+        void (*packet_printer)(const char *);   /* required for CS_ETM_OPERATION_PRINT */
+        cs_etm_mem_cb_type  mem_acc_cb;
+        bool formatted;                         /* frame-formatted source rather than a single source */
+        bool fsyncs;                            /* selects OCSD_DFRMTR_HAS_FSYNCS */
+        bool hsyncs;                            /* selects OCSD_DFRMTR_HAS_HSYNCS */
+        bool frame_aligned;                     /* selects OCSD_DFRMTR_FRAME_MEM_ALIGN */
+        void *data;                             /* handed back through cs_etm_state.data */
+};
+
+/* Values for cs_etm_trace_params.protocol. */
+enum {
+        CS_ETM_PROTO_ETMV3 = 1,
+        CS_ETM_PROTO_ETMV4i,    /* the only protocol cs_etm_decoder__new() accepts */
+        CS_ETM_PROTO_ETMV4d,
+};
+
+/* Values for cs_etm_decoder_params.operation. */
+enum {
+        CS_ETM_OPERATION_PRINT = 1,     /* print raw packets through packet_printer */
+        CS_ETM_OPERATION_DECODE,        /* full decode into cs_etm_packet entries */
+};
+
+/* Decoder error codes; the values are positive and callers may negate them. */
+enum {
+        CS_ETM_ERR_NOMEM = 1,
+        CS_ETM_ERR_NODATA,
+        CS_ETM_ERR_PARAM,
+};
+
+
+/* Create a decoder with num_cpu trace sources; returns NULL on failure. */
+struct cs_etm_decoder *cs_etm_decoder__new(uint32_t num_cpu, struct cs_etm_decoder_params *,struct cs_etm_trace_params []);
+
+/* Register a memory access callback for (decoder, start address, length, callback). */
+int cs_etm_decoder__add_mem_access_cb(struct cs_etm_decoder *, uint64_t, uint64_t, cs_etm_mem_cb_type);
+
+/* NOTE(review): the definition of cs_etm_decoder__flush() was not visible when this comment was written. */
+int cs_etm_decoder__flush(struct cs_etm_decoder *);
+/* Destroy a decoder; NULL is accepted. */
+void cs_etm_decoder__free(struct cs_etm_decoder *);
+/* Pop the oldest queued packet; returns 0 on success, -1 if none is queued or the packet pointer is NULL. */
+int cs_etm_decoder__get_packet(struct cs_etm_decoder *, struct cs_etm_packet *);
+
+/* Register a file region as memory: (decoder, file offset, address, length, file name). */
+int cs_etm_decoder__add_bin_file(struct cs_etm_decoder *, uint64_t, uint64_t, uint64_t, const char *);
+
+/*
+ * Feed raw trace data: (decoder, trace index, buffer, length, out: bytes
+ * consumed).  Returns the decoder state; check its err field.
+ */
+const struct cs_etm_state *cs_etm_decoder__process_data_block(struct cs_etm_decoder *,
+                                       uint64_t,
+                                       const uint8_t *,
+                                       size_t,
+                                       size_t *);
+
+#endif /* INCLUDE__CS_ETM_DECODER_H__ */
+
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c

new file mode 100644 (file)

index 0000000..ca93257
--- /dev/null
+++ b/tools/perf/util/cs-etm.c
@@ -0,0 +1,1533 @@
+/*
+ * Copyright(C) 2016 Linaro Limited. All rights reserved.
+ * Author: Tor Jeremiassen <tor.jeremiassen@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "perf.h"
+#include "thread_map.h"
+#include "thread.h"
+#include "thread-stack.h"
+#include "callchain.h"
+#include "auxtrace.h"
+#include "evlist.h"
+#include "machine.h"
+#include "util.h"
+#include "util/intlist.h"
+#include "color.h"
+#include "cs-etm.h"
+#include "cs-etm-decoder/cs-etm-decoder.h"
+#include "debug.h"
+
+#include <stdlib.h>
+
+#define KiB(x) ((x) * 1024)
+#define MiB(x) ((x) * 1024 * 1024)
+#define MAX_TIMESTAMP (~0ULL)
+
+/*
+ * Per-session CoreSight ETM auxtrace state.  The embedded struct auxtrace
+ * is what session->auxtrace points at; the functions in this file recover
+ * the outer structure with container_of().
+ */
+struct cs_etm_auxtrace {
+        struct auxtrace         auxtrace;
+        struct auxtrace_queues  queues;
+        struct auxtrace_heap    heap;
+        u64                    **metadata;      /* indexed [cpu][CS_ETMV4_* register]; num_cpu rows */
+        u32                     auxtrace_type;
+        struct perf_session    *session;
+        struct machine         *machine;
+        struct perf_evsel      *switch_evsel;
+        struct thread          *unknown_thread;
+        uint32_t                num_cpu;        /* number of rows in metadata */
+        bool                    timeless_decoding;
+        bool                    sampling_mode;
+        bool                    snapshot_mode;
+        bool                    data_queued;
+        bool                    sync_switch;
+        bool                    synth_needs_swap;
+        int                     have_sched_switch;
+
+        bool                    sample_instructions;
+        u64                     instructions_sample_type;
+        u64                     instructions_sample_period;
+        u64                     instructions_id;
+        struct itrace_synth_opts synth_opts;
+        unsigned                pmu_type;
+};
+
+/*
+ * Per-auxtrace-queue decode state; stored in auxtrace_queue.priv and
+ * released by cs_etm__free_queue().
+ */
+struct cs_etm_queue {
+        struct cs_etm_auxtrace *etm;            /* owning session state */
+        unsigned                queue_nr;       /* index into etm->queues.queue_array */
+        struct auxtrace_buffer *buffer;         /* buffer most recently returned by cs_etm__get_trace() */
+        const struct           cs_etm_state *state;
+        struct ip_callchain    *chain;          /* allocated only when callchain synthesis is requested */
+        union perf_event       *event_buf;      /* PERF_SAMPLE_MAX_SIZE bytes */
+        bool                    on_heap;
+        bool                    step_through_buffers;   /* stop after each buffer */
+        bool                    use_buffer_pid_tid;     /* take pid/tid from each buffer */
+        pid_t                   pid, tid;       /* -1 when unknown */
+        int                     cpu;            /* -1 when unknown */
+        struct thread          *thread;
+        u64                     time;
+        u64                     timestamp;
+        bool                    stop;           /* makes cs_etm__get_trace() return no data */
+        struct cs_etm_decoder  *decoder;
+        u64                     offset;
+        bool                    eot;
+        bool                    kernel_mapped;
+};
+
+static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq);
+static int cs_etm__update_queues(struct cs_etm_auxtrace *);
+static int cs_etm__process_queues(struct cs_etm_auxtrace *, u64);
+static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *, pid_t, u64);
+static uint32_t cs_etm__mem_access(struct cs_etm_queue *, uint64_t , size_t , uint8_t *);
+
+/* Print one decoded packet string in blue on stdout and flush the stream. */
+static void cs_etm__packet_dump(const char *pkt_string)
+{
+        color_fprintf(stdout, PERF_COLOR_BLUE, "  %s\n", pkt_string);
+        fflush(stdout);
+}
+
+/*
+ * Dump the raw packets of one auxtrace buffer on stdout.
+ *
+ * A temporary decoder in CS_ETM_OPERATION_PRINT mode is created from the
+ * session metadata, the whole buffer is pushed through it, and the decoder
+ * is destroyed again.  Returns silently when memory or the decoder cannot
+ * be obtained; stops early when the decoder reports an error.
+ */
+static void cs_etm__dump_event(struct cs_etm_auxtrace *etm,
+                              struct auxtrace_buffer *buffer)
+{
+        const char *color = PERF_COLOR_BLUE;
+        /*
+         * Designated initialiser: members not listed (data, mem_acc_cb) are
+         * zeroed, so cs_etm_decoder__new() never reads indeterminate values.
+         */
+        struct cs_etm_decoder_params d_params = {
+                .operation      = CS_ETM_OPERATION_PRINT,
+                .packet_printer = cs_etm__packet_dump,
+                .formatted      = true,
+                .fsyncs         = false,
+                .hsyncs         = false,
+                .frame_aligned  = true,
+        };
+        struct cs_etm_trace_params *t_params;
+        struct cs_etm_decoder *decoder;
+        size_t buffer_used = 0;
+        size_t i;
+
+        fprintf(stdout,"\n");
+        color_fprintf(stdout, color,
+                     ". ... CoreSight ETM Trace data: size %zu bytes\n",
+                     buffer->size);
+
+        t_params = zalloc(sizeof(struct cs_etm_trace_params) * etm->num_cpu);
+        if (t_params == NULL)
+                return;
+
+        for (i = 0; i < etm->num_cpu; ++i) {
+                t_params[i].protocol = CS_ETM_PROTO_ETMV4i;
+                t_params[i].reg_idr0 = etm->metadata[i][CS_ETMV4_TRCIDR0];
+                t_params[i].reg_idr1 = etm->metadata[i][CS_ETMV4_TRCIDR1];
+                t_params[i].reg_idr2 = etm->metadata[i][CS_ETMV4_TRCIDR2];
+                t_params[i].reg_idr8 = etm->metadata[i][CS_ETMV4_TRCIDR8];
+                t_params[i].reg_configr = etm->metadata[i][CS_ETMV4_TRCCONFIGR];
+                t_params[i].reg_traceidr = etm->metadata[i][CS_ETMV4_TRCTRACEIDR];
+        }
+
+        decoder = cs_etm_decoder__new(etm->num_cpu,&d_params, t_params);
+
+        zfree(&t_params);
+
+        if (decoder == NULL) {
+                return;
+        }
+        do {
+                const struct cs_etm_state *state;
+                size_t consumed = 0;
+
+                state = cs_etm_decoder__process_data_block(decoder,
+                                buffer->offset,
+                                &(((uint8_t *)buffer->data)[buffer_used]),
+                                buffer->size - buffer_used, &consumed);
+                /*
+                 * Once the decoder is in an error state it consumes nothing
+                 * more; without this check the loop would never terminate.
+                 */
+                if (state->err)
+                        break;
+                buffer_used += consumed;
+        } while(buffer_used < buffer->size);
+        cs_etm_decoder__free(decoder);
+}
+                              
+/*
+ * auxtrace flush handler: bring the queues up to date and process all
+ * remaining trace.
+ *
+ * Does nothing (returns 0) in dump_trace mode.  Returns -EINVAL when the
+ * tool does not use ordered events, or a negative error from the queue
+ * update; otherwise the result of the queue processing.
+ */
+static int cs_etm__flush_events(struct perf_session *session,
+                                struct perf_tool *tool)
+{
+        struct cs_etm_auxtrace *etm;
+        int err;
+
+        if (dump_trace)
+                return 0;
+
+        if (!tool->ordered_events)
+                return -EINVAL;
+
+        etm = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace);
+
+        err = cs_etm__update_queues(etm);
+        if (err < 0)
+                return err;
+
+        if (!etm->timeless_decoding)
+                return cs_etm__process_queues(etm, MAX_TIMESTAMP);
+
+        return cs_etm__process_timeless_queues(etm,-1,MAX_TIMESTAMP - 1);
+}
+
+/*
+ * Refresh the tid, thread, pid and cpu cached in the queue's private data.
+ *
+ * The tid is re-read from the machine's current tid for etmq->cpu when the
+ * queue has no fixed tid or sched-switch events are in use; the cached
+ * thread reference is dropped at the same time.  A missing thread is then
+ * looked up by tid, and pid (and cpu, when the queue has none) are taken
+ * from it.
+ */
+static void  cs_etm__set_pid_tid_cpu(struct cs_etm_auxtrace *etm,
+                                    struct auxtrace_queue *queue)
+{
+        struct cs_etm_queue *etmq = queue->priv;
+        bool refresh_tid = (queue->tid == -1) || (etm->have_sched_switch);
+
+        if (refresh_tid) {
+                etmq->tid = machine__get_current_tid(etm->machine, etmq->cpu);
+                thread__zput(etmq->thread);
+        }
+
+        if ((etmq->tid != -1) && (!etmq->thread))
+                etmq->thread = machine__find_thread(etm->machine,-1,etmq->tid);
+
+        if (!etmq->thread)
+                return;
+
+        etmq->pid = etmq->thread->pid_;
+        if (queue->cpu == -1)
+                etmq->cpu = etmq->thread->cpu;
+}
+
+/*
+ * Release everything owned by a queue's private data: thread reference,
+ * decoder, event buffer, callchain buffer, and the structure itself.
+ * A NULL pointer is accepted and ignored.
+ */
+static void cs_etm__free_queue(void *priv)
+{
+        struct cs_etm_queue *etmq = priv;
+
+        if (etmq == NULL)
+                return;
+
+        thread__zput(etmq->thread);
+        cs_etm_decoder__free(etmq->decoder);
+
+        zfree(&etmq->event_buf);
+        zfree(&etmq->chain);
+
+        free(etmq);
+}
+
+/*
+ * Free the private data of every auxtrace queue of the session, then the
+ * queues themselves.
+ */
+static void cs_etm__free_events(struct perf_session *session)
+{
+        struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
+                                                   struct cs_etm_auxtrace,
+                                                   auxtrace);
+        struct auxtrace_queues *queues = &etm->queues;
+        unsigned idx;
+
+        for (idx = 0; idx < queues->nr_queues; idx++) {
+                struct auxtrace_queue *queue = &queues->queue_array[idx];
+
+                cs_etm__free_queue(queue->priv);
+                queue->priv = NULL;
+        }
+
+        auxtrace_queues__free(queues);
+}
+
+/*
+ * Tear down all CoreSight ETM state attached to the session: the heap, the
+ * queues, the traceID list, the per-CPU metadata and the auxtrace
+ * structure itself.  session->auxtrace is cleared.
+ */
+static void cs_etm__free(struct perf_session *session)
+{
+        struct cs_etm_auxtrace *aux = container_of(session->auxtrace,
+                                                   struct cs_etm_auxtrace,
+                                                   auxtrace);
+        struct int_node *inode, *tmp;
+        size_t cpu;
+
+        auxtrace_heap__free(&aux->heap);
+        /* Must run before session->auxtrace is cleared: it dereferences it. */
+        cs_etm__free_events(session);
+        session->auxtrace = NULL;
+
+        /* Empty the traceID/CPU# RB tree first, then delete the tree. */
+        intlist__for_each_safe(inode, tmp, traceid_list)
+                intlist__remove(traceid_list, inode);
+        intlist__delete(traceid_list);
+
+        for (cpu = 0; cpu < aux->num_cpu; cpu++)
+                zfree(&aux->metadata[cpu]);
+        zfree(&aux->metadata);
+
+        free(aux);
+}
+
+/*
+ * Adopt the pid/tid (and the cpu, when the queue has none) recorded in an
+ * auxtrace buffer, and replace the cached thread reference accordingly.
+ */
+static void cs_etm__use_buffer_pid_tid(struct cs_etm_queue *etmq,
+                                      struct auxtrace_queue *queue,
+                                      struct auxtrace_buffer *buffer)
+{
+        if ((buffer->cpu != -1) && (queue->cpu == -1))
+                etmq->cpu = buffer->cpu;
+
+        etmq->pid = buffer->pid;
+        etmq->tid = buffer->tid;
+
+        thread__zput(etmq->thread);
+
+        if (etmq->tid == -1)
+                return;
+
+        /* etmq->pid may itself be -1 here; it is passed through as is. */
+        etmq->thread = machine__findnew_thread(etmq->etm->machine,
+                                               etmq->pid,
+                                               etmq->tid);
+}
+
+
+/*
+ * Advance the queue to its next auxtrace buffer and describe it in *buff.
+ *
+ * The data of the previous buffer is dropped once the next buffer's data
+ * is available (or when there is no next buffer).  With
+ * use_buffer_pid_tid set, the queue's pid/tid follow the buffer; with
+ * step_through_buffers set, the queue is stopped after this buffer.
+ *
+ * Returns the buffer length, 0 when the queue is stopped or exhausted
+ * (buff->len is 0 as well), or -ENOMEM when the data cannot be loaded.
+ */
+static int cs_etm__get_trace(struct cs_etm_buffer *buff, struct cs_etm_queue *etmq)
+{
+        struct auxtrace_buffer *prev = etmq->buffer;
+        struct auxtrace_buffer *next;
+        struct auxtrace_queue *queue;
+
+        if (etmq->stop) {
+                buff->len = 0;
+                return 0;
+        }
+
+        queue = &etmq->etm->queues.queue_array[etmq->queue_nr];
+        next = auxtrace_buffer__next(queue, prev);
+
+        if (!next) {
+                if (prev)
+                        auxtrace_buffer__drop_data(prev);
+                buff->len = 0;
+                return 0;
+        }
+
+        etmq->buffer = next;
+
+        if (!next->data) {
+                int fd = perf_data_file__fd(etmq->etm->session->file);
+
+                next->data = auxtrace_buffer__get_data(next, fd);
+                if (!next->data)
+                        return -ENOMEM;
+        }
+
+        if (prev)
+                auxtrace_buffer__drop_data(prev);
+
+        buff->offset = next->offset;
+        if (next->use_data) {
+                buff->len = next->use_size;
+                buff->buf = next->use_data;
+        } else {
+                buff->len = next->size;
+                buff->buf = next->data;
+        }
+
+        buff->ref_timestamp = next->reference;
+
+        if (etmq->use_buffer_pid_tid &&
+            ((etmq->pid != next->pid) || (etmq->tid != next->tid)))
+                cs_etm__use_buffer_pid_tid(etmq, queue, next);
+
+        if (etmq->step_through_buffers)
+                etmq->stop = true;
+
+        return buff->len;
+}
+
+/*
+ * Allocate and initialise the private data for auxtrace queue queue_nr,
+ * including a decoder in CS_ETM_OPERATION_DECODE mode configured from the
+ * session metadata.
+ *
+ * Returns the new queue data, or NULL when any allocation or the decoder
+ * creation fails (nothing is leaked).
+ */
+static struct cs_etm_queue *cs_etm__alloc_queue(struct cs_etm_auxtrace *etm,
+                                               unsigned int queue_nr)
+{
+        /* Members not listed (mem_acc_cb) are zeroed; data is set below. */
+        struct cs_etm_decoder_params d_params = {
+                .operation      = CS_ETM_OPERATION_DECODE,
+                .packet_printer = cs_etm__packet_dump,
+                .formatted      = true,
+                .fsyncs         = false,
+                .hsyncs         = false,
+                .frame_aligned  = true,
+        };
+        struct cs_etm_trace_params   *t_params;
+        struct cs_etm_queue *etmq;
+        size_t i;
+
+        etmq = zalloc(sizeof(struct cs_etm_queue));
+        if (!etmq)
+                return NULL;
+
+        if (etm->synth_opts.callchain) {
+                size_t sz = sizeof(struct ip_callchain);
+
+                sz += etm->synth_opts.callchain_sz * sizeof(u64);
+                etmq->chain = zalloc(sz);
+                if (!etmq->chain)
+                        goto out_free;
+        } else {
+                etmq->chain = NULL;
+        }
+
+        etmq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
+        if (!etmq->event_buf)
+                goto out_free;
+
+        etmq->etm = etm;
+        etmq->queue_nr = queue_nr;
+        etmq->pid = -1;
+        etmq->tid = -1;
+        etmq->cpu = -1;
+        etmq->stop = false;
+        etmq->kernel_mapped = false;
+
+        t_params = zalloc(sizeof(struct cs_etm_trace_params)*etm->num_cpu);
+        /* Without this check the loop below would dereference NULL. */
+        if (!t_params)
+                goto out_free;
+
+        for (i = 0; i < etm->num_cpu; ++i) {
+                t_params[i].reg_idr0 = etm->metadata[i][CS_ETMV4_TRCIDR0];
+                t_params[i].reg_idr1 = etm->metadata[i][CS_ETMV4_TRCIDR1];
+                t_params[i].reg_idr2 = etm->metadata[i][CS_ETMV4_TRCIDR2];
+                t_params[i].reg_idr8 = etm->metadata[i][CS_ETMV4_TRCIDR8];
+                t_params[i].reg_configr = etm->metadata[i][CS_ETMV4_TRCCONFIGR];
+                t_params[i].reg_traceidr = etm->metadata[i][CS_ETMV4_TRCTRACEIDR];
+                t_params[i].protocol = CS_ETM_PROTO_ETMV4i;
+        }
+        d_params.data = etmq;
+
+        etmq->decoder = cs_etm_decoder__new(etm->num_cpu,&d_params,t_params);
+
+        zfree(&t_params);
+
+        if (!etmq->decoder)
+                goto out_free;
+
+        etmq->offset = 0;
+        etmq->eot = false;
+
+        return etmq;
+
+out_free:
+        zfree(&etmq->event_buf);
+        zfree(&etmq->chain);
+        free(etmq);
+        return NULL;
+}
+
+/*
+ * Make sure a non-empty auxtrace queue has its private decode state.
+ *
+ * On first use the state is allocated and seeded with the queue's cpu and
+ * tid; in sampling mode the buffer-stepping and buffer-pid/tid options are
+ * derived from the session flags.
+ *
+ * Placing the queue on the timestamp heap is not implemented yet: no
+ * timestamp is decoded here and etmq->on_heap is left untouched.
+ *
+ * Returns 0 on success (including an empty queue), -ENOMEM when the state
+ * cannot be allocated.
+ */
+static int cs_etm__setup_queue(struct cs_etm_auxtrace *etm,
+                              struct auxtrace_queue *queue,
+                              unsigned int queue_nr)
+{
+        struct cs_etm_queue *etmq = queue->priv;
+
+        if (list_empty(&(queue->head)))
+                return 0;
+
+        /* Already set up on an earlier call: nothing more to do. */
+        if (etmq != NULL)
+                return 0;
+
+        etmq = cs_etm__alloc_queue(etm, queue_nr);
+        if (etmq == NULL)
+                return -ENOMEM;
+
+        queue->priv = etmq;
+
+        if (queue->cpu != -1)
+                etmq->cpu = queue->cpu;
+
+        etmq->tid = queue->tid;
+
+        if (etm->sampling_mode) {
+                if (etm->timeless_decoding)
+                        etmq->step_through_buffers = true;
+                if (etm->timeless_decoding || !etm->have_sched_switch)
+                        etmq->use_buffer_pid_tid = true;
+        }
+
+        return 0;
+}
+
+
+/*
+ * Run cs_etm__setup_queue() on every entry of etm->queues.queue_array, in
+ * index order.  Stops at, and returns, the first non-zero result; returns 0
+ * when every queue has been set up.
+ */
+static int cs_etm__setup_queues(struct cs_etm_auxtrace *etm)
+{
+        unsigned int queue_nr = 0;
+
+        while (queue_nr < etm->queues.nr_queues) {
+                int err = cs_etm__setup_queue(etm,
+                                              &etm->queues.queue_array[queue_nr],
+                                              queue_nr);
+
+                if (err)
+                        return err;
+                queue_nr++;
+        }
+
+        return 0;
+}
+
+#if 0
+/*
+ * Record stored in a DSO's auxtrace cache (compiled out by the enclosing
+ * "#if 0").  NOTE(review): icount/bcount look like an instruction count and
+ * a byte count, judging by the values handed to cs_etm__cache_add() -- confirm.
+ */
+struct cs_etm_cache_entry {
+        struct auxtrace_cache_entry     entry;  /* generic cache linkage */
+        uint64_t                        icount;
+        uint64_t                        bcount;
+};
+
+/* Divisor applied to a DSO's data size when sizing its cache; always 64. */
+static size_t cs_etm__cache_divisor(void)
+{
+        const size_t divisor = 64;
+
+        return divisor;
+}
+
+/*
+ * Pick the "bits" argument used when creating the auxtrace cache of @dso.
+ *
+ * The DSO data size is divided by cs_etm__cache_divisor(); the result is
+ * clamped to 10 for scaled sizes below 1000 and to 21 for scaled sizes above
+ * 2^21, otherwise it is the number of significant bits of the scaled size.
+ */
+static size_t cs_etm__cache_size(struct dso *dso,
+                                struct machine *machine)
+{
+        off_t size;
+
+        size = dso__data_size(dso,machine);
+        size /= cs_etm__cache_divisor();
+
+        /* Also taken when dso__data_size() returned a negative value. */
+        if (size < 1000) 
+                return 10;
+
+        if (size > (1 << 21)) 
+                return 21;
+
+        /*
+         * __builtin_clz() takes an unsigned int; the two guards above keep
+         * size within [1000, 2^21], so the narrowing loses nothing.
+         */
+        return 32 - __builtin_clz(size);
+}
+
+/*
+ * Return the auxtrace cache attached to @dso, creating it on first use with
+ * the size chosen by cs_etm__cache_size().  The result of
+ * auxtrace_cache__new() is stored in dso->auxtrace_cache and returned as is,
+ * so the caller must be prepared for NULL.
+ */
+static struct auxtrace_cache *cs_etm__cache(struct dso *dso,
+                                           struct machine *machine)
+{
+        if (!dso->auxtrace_cache) {
+                size_t nr_bits = cs_etm__cache_size(dso, machine);
+
+                dso->auxtrace_cache =
+                        auxtrace_cache__new(nr_bits,
+                                            sizeof(struct cs_etm_cache_entry),
+                                            200);
+        }
+
+        return dso->auxtrace_cache;
+}
+
+/*
+ * Insert an (icount, bcount) record for @offset into the auxtrace cache of
+ * @dso, creating the cache if needed.
+ *
+ * Returns -ENOMEM when the cache or the entry cannot be obtained, otherwise
+ * the result of auxtrace_cache__add().
+ */
+static int cs_etm__cache_add(struct dso *dso, struct machine *machine,
+                            uint64_t offset, uint64_t icount, uint64_t bcount)
+{
+        struct auxtrace_cache *c = cs_etm__cache(dso, machine);
+        struct cs_etm_cache_entry *e;
+        int err;
+
+        if (!c)
+                return -ENOMEM;
+
+        e = auxtrace_cache__alloc_entry(c);
+        if (!e)
+                return -ENOMEM;
+
+        e->icount = icount;
+        e->bcount = bcount;
+
+        err = auxtrace_cache__add(c, offset, &e->entry);
+
+        /* The entry was not taken by the cache: hand it back. */
+        if (err)
+                auxtrace_cache__free_entry(c, e);
+
+        return err;
+}
+
+/*
+ * Look @offset up in the auxtrace cache of @dso (created on demand).
+ * Returns NULL when no cache is available, otherwise whatever
+ * auxtrace_cache__lookup() returns.
+ */
+static struct cs_etm_cache_entry *cs_etm__cache_lookup(struct dso *dso,
+                                                      struct machine *machine,
+                                                      uint64_t offset)
+{
+        if (cs_etm__cache(dso, machine) == NULL)
+                return NULL;
+
+        return auxtrace_cache__lookup(dso->auxtrace_cache, offset);
+}
+#endif
+
+/*
+ * Build one synthetic PERF_RECORD_SAMPLE from the address range carried by
+ * @packet and deliver it to the session of @etmq.
+ *
+ * The event header is written into etmq->event_buf; the sample fields are
+ * filled from @packet and @etmq.  Returns the result of
+ * perf_session__deliver_synth_event() (an error is also logged).
+ */
+static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
+                                           struct cs_etm_packet *packet)
+{
+        int ret = 0;
+        struct cs_etm_auxtrace *etm = etmq->etm;
+        union perf_event *event = etmq->event_buf;
+        struct perf_sample sample = {.ip = 0,};
+        uint64_t start_addr = packet->start_addr;
+        uint64_t end_addr = packet->end_addr;
+
+        /* The sample is always flagged as user space, whatever the address. */
+        event->sample.header.type = PERF_RECORD_SAMPLE;
+        event->sample.header.misc = PERF_RECORD_MISC_USER;
+        event->sample.header.size = sizeof(struct perf_event_header);
+
+
+        sample.ip = start_addr;
+        sample.pid = etmq->pid;
+        sample.tid = etmq->tid;
+        sample.addr = end_addr;
+        sample.id = etmq->etm->instructions_id;
+        sample.stream_id = etmq->etm->instructions_id;
+        /* Range length in bytes / 4 -- presumably assumes 4-byte instructions. */
+        sample.period = (end_addr - start_addr) >> 2; 
+        sample.cpu = packet->cpu;
+        sample.flags = 0; // etmq->flags;
+        sample.insn_len = 1; // etmq->insn_len;
+
+        //etmq->last_insn_cnt = etmq->state->tot_insn_cnt;
+
+        /* Disabled experiment: populate the per-DSO cache for this range. */
+#if 0
+        {
+                struct   addr_location al;
+                uint64_t offset;
+                struct   thread *thread;
+                struct   machine *machine = etmq->etm->machine;
+                uint8_t  cpumode;
+                struct   cs_etm_cache_entry *e;
+                uint8_t  buf[256];
+                size_t   bufsz;
+
+                thread = etmq->thread;
+
+                if (!thread) {
+                        thread = etmq->etm->unknown_thread;
+                }
+
+                if (start_addr > 0xffffffc000000000UL) {
+                        cpumode = PERF_RECORD_MISC_KERNEL;
+                } else {
+                        cpumode = PERF_RECORD_MISC_USER;
+                }
+
+                thread__find_addr_map(thread, cpumode, MAP__FUNCTION, start_addr,&al);
+                if (!al.map || !al.map->dso) {
+                        goto endTest;
+                }
+                if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
+                    dso__data_status_seen(al.map->dso,DSO_DATA_STATUS_SEEN_ITRACE)) {
+                        goto endTest;
+                }
+
+                offset = al.map->map_ip(al.map,start_addr);
+
+
+                e = cs_etm__cache_lookup(al.map->dso, machine, offset);
+
+                if (e) {
+                  (void) e;
+                } else {
+                        int len;
+                        map__load(al.map, machine->symbol_filter);
+
+                        bufsz = sizeof(buf);
+                        len = dso__data_read_offset(al.map->dso, machine,
+                                                    offset, buf, bufsz);
+
+                        if (len <= 0) {
+                                goto endTest;
+                        }
+
+                        cs_etm__cache_add(al.map->dso, machine, offset, (end_addr - start_addr) >> 2, end_addr - start_addr);
+
+                }
+endTest:
+                (void) offset;
+        }
+#endif
+
+        ret = perf_session__deliver_synth_event(etm->session,event, &sample);
+
+        if (ret) {
+                pr_err("CS ETM Trace: failed to deliver instruction event, error %d\n", ret);
+
+        }
+        return ret;
+}
+
+/*
+ * Context for attribute synthesis: cs_etm__event_synth() receives a pointer
+ * to dummy_tool and uses container_of() to get back to the session.
+ */
+struct cs_etm_synth {
+        struct perf_tool dummy_tool;    /* handle passed to the synthesize call */
+        struct perf_session *session;   /* where synthesized events are delivered */
+};
+
+
+/*
+ * Callback used while synthesizing attributes: forwards @event, without a
+ * sample, to the session stored in the cs_etm_synth that embeds @tool.
+ * @sample and @machine are unused.
+ */
+static int cs_etm__event_synth(struct perf_tool *tool,
+                              union perf_event *event,
+                              struct perf_sample *sample,
+                              struct machine *machine)
+{
+        struct cs_etm_synth *synth;
+
+        (void) sample;
+        (void) machine;
+
+        synth = container_of(tool, struct cs_etm_synth, dummy_tool);
+
+        return perf_session__deliver_synth_event(synth->session, event, NULL);
+}
+
+
+/*
+ * Synthesize the attribute event describing @attr with the single sample id
+ * @id and deliver it to @session through cs_etm__event_synth().
+ * Returns the result of perf_event__synthesize_attr().
+ */
+static int cs_etm__synth_event(struct perf_session *session,
+                              struct perf_event_attr *attr, u64 id)
+{
+        struct cs_etm_synth synth;
+
+        /* Zero everything, then fill in the only field the callback reads. */
+        memset(&synth, 0, sizeof(synth));
+        synth.session = session;
+
+        return perf_event__synthesize_attr(&synth.dummy_tool, attr, 1, &id,
+                                           cs_etm__event_synth);
+}
+
+/*
+ * Register the synthetic "instructions" event type for this trace session.
+ *
+ * Finds the first evsel whose attr.type equals etm->pmu_type, derives a
+ * perf_event_attr from it and, when etm->synth_opts.instructions is set,
+ * synthesizes the attribute and records its id, sample type and period in
+ * @etm.  Returns 0 when no matching evsel exists or on success, otherwise the
+ * error from cs_etm__synth_event().
+ */
+static int cs_etm__synth_events(struct cs_etm_auxtrace *etm, 
+                               struct perf_session *session)
+{
+        struct perf_evlist *evlist = session->evlist;
+        struct perf_evsel *evsel;
+        struct perf_event_attr attr;
+        bool found = false;
+        u64 id;
+        int err;
+
+        /* On "found", evsel is left pointing at the matching entry. */
+        evlist__for_each(evlist, evsel) {
+
+                if (evsel->attr.type == etm->pmu_type) {
+                        found = true;
+                        break;
+                }
+        }
+
+        if (!found) {
+                pr_debug("There are no selected events with Core Sight Trace data\n");
+                return 0;
+        }
+
+        memset(&attr, 0, sizeof(struct perf_event_attr));
+        attr.size = sizeof(struct perf_event_attr);
+        attr.type = PERF_TYPE_HARDWARE;
+        attr.sample_type = evsel->attr.sample_type & PERF_SAMPLE_MASK;
+        attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
+                            PERF_SAMPLE_PERIOD;
+        /* Timestamps are only advertised when decoding is not timeless. */
+        if (etm->timeless_decoding) 
+                attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
+        else
+                attr.sample_type |= PERF_SAMPLE_TIME;
+
+        attr.exclude_user = evsel->attr.exclude_user;
+        attr.exclude_kernel = evsel->attr.exclude_kernel;
+        attr.exclude_hv = evsel->attr.exclude_hv;
+        attr.exclude_host = evsel->attr.exclude_host;
+        attr.exclude_guest = evsel->attr.exclude_guest;
+        attr.sample_id_all = evsel->attr.sample_id_all;
+        attr.read_format = evsel->attr.read_format;
+
+        /*
+         * Presumably offset to keep the synthetic id clear of recorded ids;
+         * an id of 0 is never used.
+         */
+        id = evsel->id[0] + 1000000000;
+
+        if (!id)
+                id = 1;
+
+        if (etm->synth_opts.instructions) {
+                attr.config = PERF_COUNT_HW_INSTRUCTIONS;
+                attr.sample_period = etm->synth_opts.period;
+                etm->instructions_sample_period = attr.sample_period;
+                err = cs_etm__synth_event(session, &attr, id);
+
+                if (err) {
+                        pr_err("%s: failed to synthesize 'instructions' event type\n",
+                               __func__);
+                        return err;
+                }
+                etm->sample_instructions = true;
+                etm->instructions_sample_type = attr.sample_type;
+                etm->instructions_id = id;
+                /* id is not read again in this function after this point. */
+                id += 1;
+        }
+
+        etm->synth_needs_swap = evsel->needs_swap;
+        return 0;
+}
+
+/*
+ * Fetch the next decoded packet of @etmq and, when it describes an
+ * instruction range (CS_ETM_RANGE), turn it into a synthetic sample.
+ *
+ * A non-zero result from cs_etm_decoder__get_packet() only means that no
+ * packet is available and is not reported as an error.  Returns 0, or the
+ * error from cs_etm__synth_instruction_sample().
+ */
+static int cs_etm__sample(struct cs_etm_queue *etmq)
+{
+        struct cs_etm_packet pkt;
+
+        if (cs_etm_decoder__get_packet(etmq->decoder, &pkt) != 0)
+                return 0;
+
+        if (!(pkt.sample_type & CS_ETM_RANGE))
+                return 0;
+
+        return cs_etm__synth_instruction_sample(etmq, &pkt);
+}
+
+/*
+ * Decode every trace buffer queued on @etmq, one after the other.
+ *
+ * Each buffer obtained from cs_etm__get_trace() is fed to the decoder in as
+ * many steps as it takes to consume it (or until etmq->eot is set); after
+ * every error-free step a sample is synthesized.  The function returns only
+ * when cs_etm__get_trace() yields a value <= 0, and returns that value.
+ * @timestamp is currently unused.
+ */
+static int cs_etm__run_decoder(struct cs_etm_queue *etmq, u64 *timestamp)
+{
+        struct cs_etm_buffer buffer;
+        size_t consumed;
+        int err;
+
+        (void) timestamp;
+
+        /* Go through each buffer in the queue and decode them one by one */
+        for (;;) {
+                consumed = 0;
+                memset(&buffer, 0, sizeof(buffer));
+                err = cs_etm__get_trace(&buffer, etmq);
+                if (err <= 0)
+                        return err;
+
+                do {
+                        size_t processed = 0;
+
+                        etmq->state = cs_etm_decoder__process_data_block(
+                                                etmq->decoder,
+                                                etmq->offset,
+                                                &buffer.buf[consumed],
+                                                buffer.len - consumed,
+                                                &processed);
+                        err = etmq->state->err;
+                        etmq->offset += processed;
+                        consumed += processed;
+
+                        /* Decoder errors are not fatal; the step is skipped. */
+                        if (!err)
+                                cs_etm__sample(etmq);
+                } while (!etmq->eot && (buffer.len > consumed));
+        }
+}
+
+/*
+ * (Re)run queue setup when new data has been flagged on the queues.
+ * Clears the new_data flag first.  Returns 0 when there was nothing to do,
+ * otherwise the result of cs_etm__setup_queues().
+ */
+static int cs_etm__update_queues(struct cs_etm_auxtrace *etm)
+{
+        if (!etm->queues.new_data)
+                return 0;
+
+        etm->queues.new_data = false;
+
+        return cs_etm__setup_queues(etm);
+}
+
+/*
+ * Decode queued trace in heap order up to @timestamp.
+ *
+ * Repeatedly takes the queue at the top of etm->heap while its ordinal is
+ * below @timestamp, runs the decoder on it and puts it back on the heap
+ * unless the decoder returned a positive value.  Returns 0 when the heap is
+ * empty or its top entry is not older than @timestamp, or a negative error.
+ */
+static int cs_etm__process_queues(struct cs_etm_auxtrace *etm, u64 timestamp)
+{
+        unsigned int queue_nr;
+        u64 ts;
+        int ret;
+
+        while (1) {
+                struct auxtrace_queue *queue;
+                struct cs_etm_queue *etmq;
+        
+                if (!etm->heap.heap_cnt)
+                        return 0;
+        
+                if (etm->heap.heap_array[0].ordinal >= timestamp)
+                        return 0;
+        
+                queue_nr = etm->heap.heap_array[0].queue_nr;
+                queue = &etm->queues.queue_array[queue_nr];
+                etmq = queue->priv;
+        
+                //cs_etm__log("queue %u processing 0x%" PRIx64 " to 0x%" PRIx64 "\n",
+                           //queue_nr, etm->heap.heap_array[0].ordinal,
+                           //timestamp);
+
+                auxtrace_heap__pop(&etm->heap);
+
+                /*
+                 * Limit for this run: just past the next queue on the heap,
+                 * but never beyond the requested timestamp.
+                 */
+                if (etm->heap.heap_cnt) {
+                        ts = etm->heap.heap_array[0].ordinal + 1;
+                        if (ts > timestamp)
+                                ts = timestamp;
+                } else {
+                        ts = timestamp;
+                }
+
+                cs_etm__set_pid_tid_cpu(etm, queue);
+
+                ret = cs_etm__run_decoder(etmq, &ts);
+
+                /* On error the queue is put back; that call's result is ignored. */
+                if (ret < 0) {
+                        auxtrace_heap__add(&etm->heap, queue_nr, ts);
+                        return ret;
+                }
+
+                if (!ret) {
+                        ret = auxtrace_heap__add(&etm->heap, queue_nr, ts);
+                        if (ret < 0)
+                                return ret;
+                } else {
+                        etmq->on_heap = false;
+                }
+        }
+        /* Not reached: the loop above only exits through return. */
+        return 0;
+}
+
+/*
+ * Decode, without timestamp ordering, every set-up queue that belongs to
+ * @tid (or every set-up queue when @tid is -1).  @time_ is recorded in each
+ * selected queue before decoding.  The decoder result is ignored; always
+ * returns 0.
+ */
+static int cs_etm__process_timeless_queues(struct cs_etm_auxtrace *etm,
+                                          pid_t tid,
+                                          u64 time_)
+{
+        unsigned int nr;
+        u64 ts = 0;
+
+        for (nr = 0; nr < etm->queues.nr_queues; nr++) {
+                struct auxtrace_queue *queue = &etm->queues.queue_array[nr];
+                struct cs_etm_queue *etmq = queue->priv;
+
+                /* Queue not set up yet */
+                if (!etmq)
+                        continue;
+
+                if (tid != -1 && etmq->tid != tid)
+                        continue;
+
+                etmq->time = time_;
+                cs_etm__set_pid_tid_cpu(etm, queue);
+                cs_etm__run_decoder(etmq, &ts);
+        }
+
+        return 0;
+}
+
+/*
+ * Find the decoder queue whose auxtrace queue is bound to @cpu.
+ *
+ * The search starts at index @cpu (clamped into the queue array; index 0 for
+ * a negative @cpu), walks down to index 0 and then up from the starting index
+ * to the last queue.  Returns the priv pointer of the first match, or NULL
+ * when there are no queues or none matches.
+ */
+static struct cs_etm_queue *cs_etm__cpu_to_etmq(struct cs_etm_auxtrace *etm,
+                                               int cpu)
+{
+        struct auxtrace_queue *queues = etm->queues.queue_array;
+        unsigned int nr = etm->queues.nr_queues;
+        unsigned int start, idx;
+
+        if (nr == 0)
+                return NULL;
+
+        if (cpu < 0)
+                start = 0;
+        else if ((unsigned int) cpu >= nr)
+                start = nr - 1;
+        else
+                start = cpu;
+
+        /* start, start - 1, ..., 0 */
+        idx = start + 1;
+        while (idx-- > 0) {
+                if (queues[idx].cpu == cpu)
+                        return queues[idx].priv;
+        }
+
+        /* start, start + 1, ..., nr - 1 */
+        for (idx = start; idx < nr; idx++) {
+                if (queues[idx].cpu == cpu)
+                        return queues[idx].priv;
+        }
+
+        return NULL;
+}
+
+/*
+ * Memory-access callback registered with the decoder: copy up to @size bytes
+ * of program image found at @address into @buffer.
+ *
+ * The address is resolved through the maps of the queue's current thread
+ * (the session's "unknown" thread when the queue has none) and the bytes are
+ * read from the backing DSO.
+ *
+ * Returns the number of bytes read; 0 when @etmq is NULL, the address is not
+ * mapped, the DSO data is known to be unreadable, or the read fails.  The
+ * return type is unsigned, so 0 is the only way to report "nothing read".
+ */
+static uint32_t cs_etm__mem_access(struct cs_etm_queue *etmq, uint64_t address, size_t size, uint8_t *buffer)
+{
+        struct   addr_location al;
+        uint64_t offset;
+        struct   thread *thread;
+        struct   machine *machine;
+        uint8_t  cpumode;
+        int len;
+
+        /* -1 would be seen by the caller as a ~4GiB successful read */
+        if (etmq == NULL)
+                return 0;
+
+        machine = etmq->etm->machine;
+
+        /* thread__find_addr_map() needs a thread to look maps up in */
+        thread = etmq->thread;
+        if (!thread)
+                thread = etmq->etm->unknown_thread;
+        if (!thread)
+                return 0;
+
+        /* NOTE(review): presumably the arm64 kernel/user address split */
+        if (address > 0xffffffc000000000UL) {
+                cpumode = PERF_RECORD_MISC_KERNEL;
+        } else {
+                cpumode = PERF_RECORD_MISC_USER;
+        }
+
+        thread__find_addr_map(thread, cpumode, MAP__FUNCTION, address,&al);
+
+        if (!al.map || !al.map->dso) {
+                return 0;
+        }
+
+        /* Do not retry a DSO whose data already failed to load */
+        if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
+            dso__data_status_seen(al.map->dso,DSO_DATA_STATUS_SEEN_ITRACE)) {
+                return 0;
+        }
+
+        offset = al.map->map_ip(al.map,address);
+
+        map__load(al.map, machine->symbol_filter);
+
+        len = dso__data_read_offset(al.map->dso, machine,
+                                    offset, buffer, size);
+
+        if (len <= 0) {
+                return 0;
+        }
+
+        return len;
+}
+
+/*
+ * Tell whether data stored with the ELF byte order @file_endian
+ * (an EI_DATA value) must be byte-swapped on this host.  The host order is
+ * detected by looking at the first byte of the integer 1.
+ */
+static bool check_need_swap(int file_endian)
+{
+       const int probe = 1;
+       const unsigned char *first_byte = (const unsigned char *)&probe;
+       int host_endian = (*first_byte == 1) ? ELFDATA2LSB : ELFDATA2MSB;
+
+       return file_endian != host_endian;
+}
+
+/*
+ * Find the first PT_LOAD program header of the ELF file @fname and report
+ * where its segment lives.
+ *
+ * @foffset: set to the segment's offset in the file (p_offset)
+ * @fstart:  set to the segment's virtual address (p_vaddr)
+ * @fsize:   set to the segment's size in the file (p_filesz)
+ *
+ * Both ELF classes are handled, and header fields are byte-swapped when the
+ * file's byte order differs from the host's.  Returns 0 on success; -1 when
+ * the file cannot be read, is not a usable ELF file, or has no PT_LOAD
+ * segment (the output parameters are then left untouched).
+ */
+static int cs_etm__read_elf_info(const char *fname, uint64_t *foffset, uint64_t *fstart, uint64_t *fsize)
+{
+       FILE *fp;
+       u8 e_ident[EI_NIDENT];
+       int ret = -1;
+       bool need_swap = false;
+       size_t buf_size;
+       void *buf = NULL;
+       int i;
+
+       fp = fopen(fname, "r");
+       if (fp == NULL)
+               return -1;
+
+       if (fread(e_ident, sizeof(e_ident), 1, fp) != 1)
+               goto out;
+
+       if (memcmp(e_ident, ELFMAG, SELFMAG) ||
+           e_ident[EI_VERSION] != EV_CURRENT)
+               goto out;
+
+       need_swap = check_need_swap(e_ident[EI_DATA]);
+
+       /* for simplicity: re-read the whole ELF header from the start */
+       if (fseek(fp, 0, SEEK_SET))
+               goto out;
+
+       if (e_ident[EI_CLASS] == ELFCLASS32) {
+               Elf32_Ehdr ehdr;
+               Elf32_Phdr *phdr;
+
+               if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+                       goto out;
+
+               if (need_swap) {
+                       ehdr.e_phoff = bswap_32(ehdr.e_phoff);
+                       ehdr.e_phentsize = bswap_16(ehdr.e_phentsize);
+                       ehdr.e_phnum = bswap_16(ehdr.e_phnum);
+               }
+
+               /* The table is walked in sizeof(*phdr) steps below */
+               if (ehdr.e_phentsize != sizeof(*phdr))
+                       goto out;
+
+               buf_size = (size_t)ehdr.e_phentsize * ehdr.e_phnum;
+               buf = malloc(buf_size);
+               if (buf == NULL)
+                       goto out;
+
+               if (fseek(fp, ehdr.e_phoff, SEEK_SET))
+                       goto out_free;
+               if (fread(buf, buf_size, 1, fp) != 1)
+                       goto out_free;
+
+               for (i = 0, phdr = buf; i < ehdr.e_phnum; i++, phdr++) {
+
+                       /* Swap every field that is read below */
+                       if (need_swap) {
+                               phdr->p_type = bswap_32(phdr->p_type);
+                               phdr->p_offset = bswap_32(phdr->p_offset);
+                               phdr->p_vaddr = bswap_32(phdr->p_vaddr);
+                               phdr->p_filesz = bswap_32(phdr->p_filesz);
+                       }
+
+                       if (phdr->p_type != PT_LOAD)
+                               continue;
+
+                       *foffset = phdr->p_offset;
+                       *fstart = phdr->p_vaddr;
+                       *fsize = phdr->p_filesz;
+                       ret = 0;
+                       break;
+               }
+       } else {
+               Elf64_Ehdr ehdr;
+               Elf64_Phdr *phdr;
+
+               if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+                       goto out;
+
+               if (need_swap) {
+                       ehdr.e_phoff = bswap_64(ehdr.e_phoff);
+                       ehdr.e_phentsize = bswap_16(ehdr.e_phentsize);
+                       ehdr.e_phnum = bswap_16(ehdr.e_phnum);
+               }
+
+               /* The table is walked in sizeof(*phdr) steps below */
+               if (ehdr.e_phentsize != sizeof(*phdr))
+                       goto out;
+
+               buf_size = (size_t)ehdr.e_phentsize * ehdr.e_phnum;
+               buf = malloc(buf_size);
+               if (buf == NULL)
+                       goto out;
+
+               if (fseek(fp, ehdr.e_phoff, SEEK_SET))
+                       goto out_free;
+               if (fread(buf, buf_size, 1, fp) != 1)
+                       goto out_free;
+
+               for (i = 0, phdr = buf; i < ehdr.e_phnum; i++, phdr++) {
+
+                       /* Swap every field that is read below */
+                       if (need_swap) {
+                               phdr->p_type = bswap_32(phdr->p_type);
+                               phdr->p_offset = bswap_64(phdr->p_offset);
+                               phdr->p_vaddr = bswap_64(phdr->p_vaddr);
+                               phdr->p_filesz = bswap_64(phdr->p_filesz);
+                       }
+
+                       if (phdr->p_type != PT_LOAD)
+                               continue;
+
+                       *foffset = phdr->p_offset;
+                       *fstart = phdr->p_vaddr;
+                       *fsize = phdr->p_filesz;
+                       ret = 0;
+                       break;
+               }
+       }
+out_free:
+       free(buf);
+out:
+       fclose(fp);
+       return ret;
+}
+
+/*
+ * auxtrace process_event callback: react to one perf event of the session.
+ *
+ * Does nothing when only dumping the trace.  Otherwise it refreshes the
+ * queues, registers memory-access sources with the decoder on MMAP2 events
+ * and triggers decoding: on PERF_RECORD_EXIT in timeless mode, or up to the
+ * sample timestamp otherwise.  Returns 0 or a negative error.
+ */
+static int cs_etm__process_event(struct perf_session *session,
+                                union perf_event *event,
+                                struct perf_sample *sample,
+                                struct perf_tool *tool)
+{
+        struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
+                                                   struct cs_etm_auxtrace,
+                                                   auxtrace);
+
+        u64 timestamp;
+        int err = 0;
+
+        if (dump_trace) 
+                return 0;
+
+        if (!tool->ordered_events) {
+                pr_err("CoreSight ETM Trace requires ordered events\n");
+                return -EINVAL;
+        }
+
+        /* A sample time of 0 or (u64)-1 is treated as "no timestamp". */
+        if (sample->time && (sample->time != (u64)-1))
+                timestamp = sample->time;
+        else
+                timestamp = 0;
+
+        if (timestamp || etm->timeless_decoding) {
+                err = cs_etm__update_queues(etm);
+                if (err)
+                        return err;
+
+        }
+
+        if (event->header.type == PERF_RECORD_MMAP2) {
+                struct dso *dso;
+                int cpu;
+                struct cs_etm_queue *etmq;
+
+                cpu = sample->cpu;
+
+                etmq = cs_etm__cpu_to_etmq(etm,cpu);
+
+                if (!etmq) {
+                        return -1;
+                }
+
+                /* Let the decoder read the mapped range through the DSO. */
+                dso = dsos__find(&(etm->machine->dsos),event->mmap2.filename,false);
+                if (NULL != dso) {
+                        err = cs_etm_decoder__add_mem_access_cb(
+                            etmq->decoder,
+                            event->mmap2.start, 
+                            event->mmap2.len, 
+                            cs_etm__mem_access);
+                }
+
+                /* Map the kernel image once per queue when vmlinux is given. */
+                if ((symbol_conf.vmlinux_name != NULL) && (!etmq->kernel_mapped)) {
+                        uint64_t foffset;
+                        uint64_t fstart;
+                        uint64_t fsize;
+
+                        /*
+                         * NOTE(review): this overwrites any error from the
+                         * callback registration above, and a failure here is
+                         * what gets returned when no decoding runs below --
+                         * confirm that is intended.
+                         */
+                        err = cs_etm__read_elf_info(symbol_conf.vmlinux_name,
+                                                      &foffset,&fstart,&fsize);
+
+                        if (!err) {
+                                cs_etm_decoder__add_bin_file(
+                                        etmq->decoder,
+                                        foffset,
+                                        fstart,
+                                        fsize & ~0x1ULL,
+                                        symbol_conf.vmlinux_name);
+
+                                etmq->kernel_mapped = true;
+                        }
+                }
+
+        }
+
+        if (etm->timeless_decoding) {
+                if (event->header.type == PERF_RECORD_EXIT) {
+                        err = cs_etm__process_timeless_queues(etm,
+                                                             event->fork.tid,
+                                                             sample->time);
+                }
+        } else if (timestamp) {
+                err = cs_etm__process_queues(etm, timestamp);
+        }
+
+        //cs_etm__log("event %s (%u): cpu %d time%"PRIu64" tsc %#"PRIx64"\n",
+                   //perf_event__name(event->header.type), event->header.type,
+                   //sample->cpu, sample->time, timestamp);
+        return err;
+}
+
+/*
+ * auxtrace process_auxtrace_event callback: queue the trace data that
+ * follows @event, unless the data was already queued (etm->data_queued).
+ *
+ * For regular files the current file position is recorded as the location of
+ * the data; for pipes offset 0 is used.  When dumping the trace the buffer
+ * is printed right away.  Returns 0, -errno when lseek() fails, or the error
+ * from auxtrace_queues__add_event().  @tool is unused.
+ */
+static int cs_etm__process_auxtrace_event(struct perf_session *session,
+                                  union perf_event *event,
+                                  struct perf_tool *tool)
+{
+        struct cs_etm_auxtrace *etm = container_of(session->auxtrace,
+                                                   struct cs_etm_auxtrace,
+                                                   auxtrace);
+        struct auxtrace_buffer *buffer;
+        off_t data_offset = 0;
+        int fd;
+        int err;
+
+        (void) tool;
+
+        if (etm->data_queued)
+                return 0;
+
+        fd = perf_data_file__fd(session->file);
+
+        if (!perf_data_file__is_pipe(session->file)) {
+                data_offset = lseek(fd, 0, SEEK_CUR);
+                if (data_offset == -1)
+                        return -errno;
+        }
+
+        err = auxtrace_queues__add_event(&etm->queues, session, event,
+                                         data_offset, &buffer);
+        if (err)
+                return err;
+
+        if (dump_trace && auxtrace_buffer__get_data(buffer, fd)) {
+                cs_etm__dump_event(etm, buffer);
+                auxtrace_buffer__put_data(buffer);
+        }
+
+        return 0;
+}
+
+/*
+ * printf formats for the words of the global auxtrace-info header, indexed
+ * by the CS_HEADER_VERSION_0 .. CS_ETM_SNAPSHOT constants.  Each format
+ * takes one u64 and prints it in hexadecimal.
+ */
+static const char * const cs_etm_global_header_fmts[] = {
+  [CS_HEADER_VERSION_0]    = "   Header version                 %"PRIx64"\n",
+  [CS_PMU_TYPE_CPUS]       = "   PMU type/num cpus              %"PRIx64"\n",
+  [CS_ETM_SNAPSHOT]        = "   Snapshot                       %"PRIx64"\n",
+};
+
+/*
+ * printf formats for one per-CPU ETMv3 metadata record, indexed by the
+ * CS_ETM_MAGIC .. CS_ETM_ETMIDR constants.  Each format takes one u64 and
+ * prints it in hexadecimal.
+ */
+static const char * const cs_etm_priv_fmts[] = {
+  [CS_ETM_MAGIC]           = "   Magic number                   %"PRIx64"\n",
+  [CS_ETM_CPU]             = "   CPU                            %"PRIx64"\n",
+  [CS_ETM_ETMCR]           = "   ETMCR                          %"PRIx64"\n",
+  [CS_ETM_ETMTRACEIDR]     = "   ETMTRACEIDR                    %"PRIx64"\n",
+  [CS_ETM_ETMCCER]         = "   ETMCCER                        %"PRIx64"\n",
+  [CS_ETM_ETMIDR]          = "   ETMIDR                         %"PRIx64"\n",
+};
+
+/*
+ * printf formats for one per-CPU ETMv4 metadata record, indexed by the
+ * CS_ETM_MAGIC .. CS_ETMV4_TRCAUTHSTATUS constants.  Each format takes one
+ * u64 and prints it in hexadecimal.
+ */
+static const char * const cs_etmv4_priv_fmts[] = {
+  [CS_ETM_MAGIC]           = "   Magic number                   %"PRIx64"\n",
+  [CS_ETM_CPU]             = "   CPU                            %"PRIx64"\n",
+  [CS_ETMV4_TRCCONFIGR]    = "   TRCCONFIGR                     %"PRIx64"\n",
+  [CS_ETMV4_TRCTRACEIDR]   = "   TRCTRACEIDR                    %"PRIx64"\n",
+  [CS_ETMV4_TRCIDR0]       = "   TRCIDR0                        %"PRIx64"\n",
+  [CS_ETMV4_TRCIDR1]       = "   TRCIDR1                        %"PRIx64"\n",
+  [CS_ETMV4_TRCIDR2]       = "   TRCIDR2                        %"PRIx64"\n",
+  [CS_ETMV4_TRCIDR8]       = "   TRCIDR8                        %"PRIx64"\n",
+  [CS_ETMV4_TRCAUTHSTATUS] = "   TRCAUTHSTATUS                  %"PRIx64"\n",
+};
+
+/*
+ * Print the content of an auxtrace-info private area to stdout.
+ *
+ * @val: start of the private area: the global header
+ *       (CS_HEADER_VERSION_0_MAX words) followed by one metadata record per
+ *       CPU, each starting with an ETMv3 or ETMv4 magic number
+ * @num: number of per-CPU records
+ *
+ * The global header is printed first, then every record with the format
+ * table matching its magic.  Printing stops silently at the first record
+ * whose magic is not recognised.  The caller is responsible for @val being
+ * large enough for the header and @num records.
+ */
+static void cs_etm__print_auxtrace_info(u64 *val, size_t num)
+{
+        unsigned i,j,cpu;
+
+        /* The per-CPU records only start after the global header */
+        for (i = 0; i < CS_HEADER_VERSION_0_MAX; ++i) {
+                fprintf(stdout,cs_etm_global_header_fmts[i],val[i]);
+        }
+
+        for (cpu = 0; cpu < num; ++cpu) {
+
+                if (val[i] == __perf_cs_etmv3_magic) {
+                        for (j = 0; j < CS_ETM_PRIV_MAX; ++j, ++i) {
+                                fprintf(stdout,cs_etm_priv_fmts[j],val[i]);
+                        }
+                } else if (val[i] == __perf_cs_etmv4_magic) {
+                        for (j = 0; j < CS_ETMV4_PRIV_MAX; ++j, ++i) {
+                                fprintf(stdout,cs_etmv4_priv_fmts[j],val[i]);
+                        }
+                } else {
+                        // failure.. return
+                        return;
+                }
+        }
+}
+
+/*
+ * Parse a CoreSight PERF_RECORD_AUXTRACE_INFO event and attach a CoreSight
+ * ETM auxtrace decoder to @session.
+ *
+ * The private area of the event holds a version-0 global header
+ * (CS_HEADER_VERSION_0_MAX words: version, PMU type / number of CPUs,
+ * snapshot flag) followed by one metadata record per CPU.  Each record is
+ * copied, and its trace ID is entered in traceid_list so that a trace ID can
+ * later be converted to a CPU number.
+ *
+ * Returns 0 on success, -EINVAL when the event is malformed (too short,
+ * unknown header version or record magic, size mismatch, duplicate trace ID)
+ * and -ENOMEM or another negative error when setting up the decoder state
+ * fails.  Everything allocated here is released on failure.
+ */
+int cs_etm__process_auxtrace_info(union perf_event *event,
+                                 struct perf_session *session)
+{
+        struct auxtrace_info_event *auxtrace_info = &(event->auxtrace_info);
+        size_t event_header_size = sizeof(struct perf_event_header);
+        /*
+         * sizeof(auxtrace_info_event::type) +
+         * sizeof(auxtrace_info_event::reserved) == 8
+         */
+        size_t info_header_size = 8;
+        size_t total_size = auxtrace_info->header.size;
+        size_t priv_size;
+        size_t nr_words;
+        size_t num_cpu;
+        struct cs_etm_auxtrace *etm = NULL;
+        int err = 0, idx = -1;
+        u64 *ptr;
+        u64 **metadata = NULL;
+        size_t i,j,k;
+        unsigned pmu_type;
+        struct int_node *inode;
+
+        if (total_size < (event_header_size + info_header_size))
+                return -EINVAL;
+
+        priv_size = total_size - event_header_size - info_header_size;
+        nr_words = priv_size / sizeof(u64);
+
+        /* First the global part: it must be complete before it is read */
+        if (nr_words < CS_HEADER_VERSION_0_MAX)
+                return -EINVAL;
+
+        ptr = (u64 *) auxtrace_info->priv;
+
+        /* Only version 0 of the header is understood */
+        if (ptr[CS_HEADER_VERSION_0] != 0)
+                return -EINVAL;
+
+        num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff;
+        pmu_type = (unsigned) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff);
+
+        /* The per-CPU records start right after the global header */
+        i = CS_HEADER_VERSION_0_MAX;
+
+        /*
+         * Create an RB tree for traceID-CPU# tuple.  Since the conversion has
+         * to be made for each packet that gets decoded optimizing access in
+         * anything other than a sequential array is worth doing.
+         */
+        traceid_list = intlist__new(NULL);
+        if (!traceid_list)
+                return -ENOMEM;
+
+        metadata = zalloc(sizeof(u64 *) * num_cpu);
+        if (!metadata) {
+                err = -ENOMEM;
+                goto err_free_traceid_list;
+        }
+
+        for (j = 0; j < num_cpu; ++j) {
+                size_t nr_priv;
+                size_t traceid_word;
+
+                /* The magic number itself must lie inside the event */
+                if (i >= nr_words) {
+                        err = -EINVAL;
+                        goto err_free_metadata;
+                }
+
+                if (ptr[i] == __perf_cs_etmv3_magic) {
+                        nr_priv = CS_ETM_PRIV_MAX;
+                        /*
+                         * The trace ID register, not the ID register, is
+                         * the per-CPU handle (as in the ETMv4 case).
+                         */
+                        traceid_word = CS_ETM_ETMTRACEIDR;
+                } else if (ptr[i] == __perf_cs_etmv4_magic) {
+                        nr_priv = CS_ETMV4_PRIV_MAX;
+                        traceid_word = CS_ETMV4_TRCTRACEIDR;
+                } else {
+                        /* Unknown record type: its size cannot be known */
+                        err = -EINVAL;
+                        goto err_free_metadata;
+                }
+
+                /* The whole record must lie inside the event */
+                if (nr_priv > nr_words - i) {
+                        err = -EINVAL;
+                        goto err_free_metadata;
+                }
+
+                metadata[j] = zalloc(sizeof(u64) * nr_priv);
+                if (!metadata[j]) {
+                        err = -ENOMEM;
+                        goto err_free_metadata;
+                }
+
+                for (k = 0; k < nr_priv; k++)
+                        metadata[j][k] = ptr[i + k];
+                i += nr_priv;
+
+                /* The traceID is our handle */
+                idx = metadata[j][traceid_word];
+
+                /* Get an RB node for this CPU */
+                inode = intlist__findnew(traceid_list, idx);
+
+                /* Something went wrong, no need to continue */
+                if (!inode) {
+                        /* PTR_ERR(NULL) would be 0, i.e. success */
+                        err = -ENOMEM;
+                        goto err_free_metadata;
+                }
+
+                /*
+                 * The node for that CPU should not have been taken already.
+                 * Backout if that's the case.
+                 */
+                if (inode->priv) {
+                        err = -EINVAL;
+                        goto err_free_metadata;
+                }
+
+                /* All good, associate the traceID with the CPU# */
+                inode->priv = &metadata[j][CS_ETM_CPU];
+        }
+
+        /* The records must account for the private area exactly */
+        if (i*8 != priv_size) {
+                err = -EINVAL;
+                goto err_free_metadata;
+        }
+
+        if (dump_trace)
+                cs_etm__print_auxtrace_info(auxtrace_info->priv,num_cpu);
+
+        etm = zalloc(sizeof(struct cs_etm_auxtrace));
+        if (!etm) {
+                err = -ENOMEM;
+                goto err_free_metadata;
+        }
+
+        etm->num_cpu = num_cpu;
+        etm->pmu_type = pmu_type;
+        etm->snapshot_mode = (ptr[CS_ETM_SNAPSHOT] != 0);
+        /* Needed by thread__init_map_groups() below */
+        etm->session = session;
+        etm->machine = &session->machines.host;
+
+        err = auxtrace_queues__init(&etm->queues);
+        if (err)
+                goto err_free;
+
+        etm->unknown_thread = thread__new(999999999,999999999);
+        if (etm->unknown_thread == NULL) {
+                err = -ENOMEM;
+                goto err_free_queues;
+        }
+        err = thread__set_comm(etm->unknown_thread, "unknown", 0);
+        if (err) {
+                goto err_delete_thread;
+        }
+
+        if (thread__init_map_groups(etm->unknown_thread,
+                                    etm->machine)) {
+                err = -ENOMEM;
+                goto err_delete_thread;
+        }
+
+        etm->timeless_decoding = true;
+        etm->sampling_mode = false;
+        etm->metadata = metadata;
+        etm->auxtrace_type = auxtrace_info->type;
+
+        etm->auxtrace.process_event = cs_etm__process_event;
+        etm->auxtrace.process_auxtrace_event = cs_etm__process_auxtrace_event;
+        etm->auxtrace.flush_events = cs_etm__flush_events;
+        etm->auxtrace.free_events  = cs_etm__free_events;
+        etm->auxtrace.free         = cs_etm__free;
+        session->auxtrace = &(etm->auxtrace);
+
+        if (dump_trace)
+                return 0;
+
+        if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
+                etm->synth_opts = *session->itrace_synth_opts;
+        } else {
+                itrace_synth_opts__set_default(&etm->synth_opts);
+        }
+        /* Only instruction samples are synthesized */
+        etm->synth_opts.branches = false;
+        etm->synth_opts.callchain = false;
+        etm->synth_opts.calls = false;
+        etm->synth_opts.returns = false;
+
+        err = cs_etm__synth_events(etm, session);
+        if (err)
+                goto err_delete_thread;
+
+        err = auxtrace_queues__process_index(&etm->queues, session);
+        if (err)
+                goto err_delete_thread;
+
+        etm->data_queued = etm->queues.populated;
+
+        return 0;
+
+err_delete_thread:
+        thread__delete(etm->unknown_thread);
+err_free_queues:
+        auxtrace_queues__free(&etm->queues);
+        session->auxtrace = NULL;
+err_free:
+        free(etm);
+err_free_metadata:
+        /* No need to check @metadata[j], free(NULL) is supported */
+        for (j = 0; j < num_cpu; ++j)
+                free(metadata[j]);
+        free(metadata);
+err_free_traceid_list:
+        intlist__delete(traceid_list);
+        /* Do not leave a dangling pointer in the global */
+        traceid_list = NULL;
+
+        return err;
+}
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h

new file mode 100644 (file)

index 0000000..ec6ff78
--- /dev/null
+++ b/tools/perf/util/cs-etm.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright(C) 2015 Linaro Limited. All rights reserved.
+ * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef INCLUDE__UTIL_PERF_CS_ETM_H__
+#define INCLUDE__UTIL_PERF_CS_ETM_H__
+
+#include "util/event.h"
+#include "util/intlist.h"
+#include "util/session.h"
+
+/* Versioning header in case things need to change in the future.  That way
+ * decoding of old snapshots is still possible.
+ */
+enum {
+       /* Starting with 0x0 */
+       CS_HEADER_VERSION_0,
+       /* PMU->type (32 bit), total # of CPUs (32 bit) */
+       CS_PMU_TYPE_CPUS,
+       CS_ETM_SNAPSHOT,
+       CS_HEADER_VERSION_0_MAX,
+};
+
+/* Beginning of header common to both ETMv3 and V4 */
+enum {
+       CS_ETM_MAGIC,
+       CS_ETM_CPU,
+};
+
+/* ETMv3/PTM metadata */
+enum {
+       /* Dynamic, configurable parameters */
+       CS_ETM_ETMCR = CS_ETM_CPU + 1,
+       CS_ETM_ETMTRACEIDR,
+       /* RO, taken from sysFS */
+       CS_ETM_ETMCCER,
+       CS_ETM_ETMIDR,
+       CS_ETM_PRIV_MAX,
+};
+
+/* ETMv4 metadata */
+enum {
+       /* Dynamic, configurable parameters */
+       CS_ETMV4_TRCCONFIGR = CS_ETM_CPU + 1,
+       CS_ETMV4_TRCTRACEIDR,
+       /* RO, taken from sysFS */
+       CS_ETMV4_TRCIDR0,
+       CS_ETMV4_TRCIDR1,
+       CS_ETMV4_TRCIDR2,
+       CS_ETMV4_TRCIDR8,
+       CS_ETMV4_TRCAUTHSTATUS,
+       CS_ETMV4_PRIV_MAX,
+};
+
+/* RB tree for quick conversion between traceID and CPUs */
+struct intlist *traceid_list;
+
+#define KiB(x) ((x) * 1024)
+#define MiB(x) ((x) * 1024 * 1024)
+
+#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_0_MAX * sizeof(u64))
+
+static const u64 __perf_cs_etmv3_magic   = 0x3030303030303030ULL;
+static const u64 __perf_cs_etmv4_magic   = 0x4040404040404040ULL;
+#define CS_ETMV3_PRIV_SIZE (CS_ETM_PRIV_MAX * sizeof(u64))
+#define CS_ETMV4_PRIV_SIZE (CS_ETMV4_PRIV_MAX * sizeof(u64))
+
+int cs_etm__process_auxtrace_info(union perf_event *event,
+                                  struct perf_session *session);
+
+#endif
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c

index b4b96120fc3b4336749e97a7ede90725c1c4340e..b856cf0393ea96d652eb73b11c5e14af911d6f61 100644 (file)
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1247,6 +1247,30 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **e
         return err;
  }
  
+int perf_evlist__apply_drv_configs(struct perf_evlist *evlist,
+                                  struct perf_evsel **err_evsel,
+                                  struct perf_evsel_config_term **err_term)
+{
+       struct perf_evsel *evsel;
+       int err = 0;
+       const int ncpus = cpu_map__nr(evlist->cpus),
+                 nthreads = thread_map__nr(evlist->threads);
+
+       evlist__for_each(evlist, evsel) {
+               if (list_empty(&evsel->drv_config_terms))
+                       continue;
+
+               err = perf_evsel__apply_drv_configs(evsel, ncpus,
+                                                   nthreads, err_term);
+               if (err) {
+                       *err_evsel = evsel;
+                       break;
+               }
+       }
+
+       return err;
+}
+
  int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter)
  {
         struct perf_evsel *evsel;
@@ -1486,7 +1510,7 @@ int perf_evlist__open(struct perf_evlist *evlist)
         perf_evlist__update_id_pos(evlist);
  
         evlist__for_each(evlist, evsel) {
-               err = perf_evsel__open(evsel, evlist->cpus, evlist->threads);
+               err = perf_evsel__open(evsel, evsel->cpus, evsel->threads);
                 if (err < 0)
                         goto out_err;
         }
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h

index a459fe71b452e0b721d798cc73cc6287ae87dc1c..ae5c1eb1d08caa369a78217e27f491749a3ec891 100644 (file)
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -163,6 +163,9 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
                            struct thread_map *threads);
  int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target);
  int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel);
+int perf_evlist__apply_drv_configs(struct perf_evlist *evlist,
+                                  struct perf_evsel **err_evsel,
+                                  struct perf_evsel_config_term **term);
  
  void __perf_evlist__set_leader(struct list_head *list);
  void perf_evlist__set_leader(struct perf_evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c

index 397fb4ed3c97b6deffeffd8f69bbceb886ac58ea..39a8bd842d0df73b216c89e5b15d10e9be667252 100644 (file)
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -211,6 +211,7 @@ void perf_evsel__init(struct perf_evsel *evsel,
         evsel->bpf_fd      = -1;
         INIT_LIST_HEAD(&evsel->node);
         INIT_LIST_HEAD(&evsel->config_terms);
+       INIT_LIST_HEAD(&evsel->drv_config_terms);
         perf_evsel__object.init(evsel);
         evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
         perf_evsel__calc_id_pos(evsel);
@@ -981,6 +982,27 @@ int perf_evsel__append_filter(struct perf_evsel *evsel,
         return -1;
  }
  
+int perf_evsel__apply_drv_configs(struct perf_evsel *evsel,
+                                 int ncpus, int nthreads,
+                                 struct perf_evsel_config_term **err_term)
+{
+       int err = 0;
+       struct perf_evsel_config_term *term;
+
+       list_for_each_entry(term, &evsel->drv_config_terms, list) {
+               err = perf_evsel__run_ioctl(evsel, ncpus, nthreads,
+                                           PERF_EVENT_IOC_SET_DRV_CONFIGS,
+                                           (void *)term->val.drv_cfg);
+
+               if (err) {
+                       *err_term = term;
+                       break;
+               }
+       }
+
+       return err;
+}
+
  int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads)
  {
         return perf_evsel__run_ioctl(evsel, ncpus, nthreads,
@@ -988,6 +1010,16 @@ int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads)
                                      0);
  }
  
+int perf_evsel__disable(struct perf_evsel *evsel)
+{
+       int nthreads = thread_map__nr(evsel->threads);
+       int ncpus = cpu_map__nr(evsel->cpus);
+
+       return perf_evsel__run_ioctl(evsel, ncpus, nthreads,
+                                    PERF_EVENT_IOC_DISABLE,
+                                    0);
+}
+
  int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads)
  {
         if (ncpus == 0 || nthreads == 0)
@@ -1033,6 +1065,16 @@ static void perf_evsel__free_config_terms(struct perf_evsel *evsel)
         }
  }
  
+static void perf_evsel__free_drv_config_terms(struct perf_evsel *evsel)
+{
+       struct perf_evsel_config_term *term, *h;
+
+       list_for_each_entry_safe(term, h, &evsel->drv_config_terms, list) {
+               list_del(&term->list);
+               free(term);
+       }
+}
+
  void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
  {
         int cpu, thread;
@@ -1054,6 +1096,7 @@ void perf_evsel__exit(struct perf_evsel *evsel)
         perf_evsel__free_fd(evsel);
         perf_evsel__free_id(evsel);
         perf_evsel__free_config_terms(evsel);
+       perf_evsel__free_drv_config_terms(evsel);
         close_cgroup(evsel->cgrp);
         cpu_map__put(evsel->cpus);
         cpu_map__put(evsel->own_cpus);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h

index 0e49bd742c639c02d1aef18f421c0204faa95823..b649143ac16b0fc9e1b0753267cf6dbbbf4ca308 100644 (file)
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -44,6 +44,7 @@ enum {
         PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
         PERF_EVSEL__CONFIG_TERM_STACK_USER,
         PERF_EVSEL__CONFIG_TERM_INHERIT,
+       PERF_EVSEL__CONFIG_TERM_DRV_CFG,
         PERF_EVSEL__CONFIG_TERM_MAX,
  };
  
@@ -55,6 +56,7 @@ struct perf_evsel_config_term {
                 u64     freq;
                 bool    time;
                 char    *callgraph;
+               char    *drv_cfg;
                 u64     stack_user;
                 bool    inherit;
         } val;
@@ -75,6 +77,7 @@ struct perf_evsel_config_term {
   *          PERF_SAMPLE_IDENTIFIER) in a non-sample event i.e. if sample_id_all
   *          is used there is an id sample appended to non-sample events
   * @priv:   And what is in its containing unnamed union are tool specific
+ * @drv_config_terms: List of configurables sent directly to the PMU driver
   */
  struct perf_evsel {
         struct list_head        node;
@@ -123,6 +126,7 @@ struct perf_evsel {
         char                    *group_name;
         bool                    cmdline_group_boundary;
         struct list_head        config_terms;
+       struct list_head        drv_config_terms;
         int                     bpf_fd;
  };
  
@@ -227,7 +231,11 @@ int perf_evsel__append_filter(struct perf_evsel *evsel,
                               const char *op, const char *filter);
  int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads,
                              const char *filter);
+int perf_evsel__apply_drv_configs(struct perf_evsel *evsel,
+                                 int ncpus, int nthreads,
+                                 struct perf_evsel_config_term **err_term);
  int perf_evsel__enable(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evsel__disable(struct perf_evsel *evsel);
  
  int perf_evsel__open_per_cpu(struct perf_evsel *evsel,
                              struct cpu_map *cpus);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c

index 8b303ff20289a8ffb4baff9e29292dd47e9068eb..888640ffada5b19340a3670283d8f3732d85bc52 100644 (file)
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1,3 +1,4 @@
+#include "build-id.h"
  #include "callchain.h"
  #include "debug.h"
  #include "event.h"
@@ -685,8 +686,16 @@ static struct dso *machine__get_kernel(struct machine *machine)
                                                  DSO_TYPE_GUEST_KERNEL);
         }
  
-       if (kernel != NULL && (!kernel->has_build_id))
-               dso__read_running_kernel_build_id(kernel, machine);
+       if (kernel != NULL && (!kernel->has_build_id)) {
+                if (symbol_conf.vmlinux_name != NULL) {
+                        filename__read_build_id(symbol_conf.vmlinux_name,
+                                                kernel->build_id,
+                                                sizeof(kernel->build_id));
+                        kernel->has_build_id = 1;
+                } else {
+                       dso__read_running_kernel_build_id(kernel, machine);
+                }
+        }
  
         return kernel;
  }
@@ -700,8 +709,19 @@ static void machine__get_kallsyms_filename(struct machine *machine, char *buf,
  {
         if (machine__is_default_guest(machine))
                 scnprintf(buf, bufsz, "%s", symbol_conf.default_guest_kallsyms);
-       else
-               scnprintf(buf, bufsz, "%s/proc/kallsyms", machine->root_dir);
+       else {
+                if (symbol_conf.vmlinux_name != 0) {
+                        unsigned char build_id[BUILD_ID_SIZE];
+                        char build_id_hex[SBUILD_ID_SIZE];
+                        filename__read_build_id(symbol_conf.vmlinux_name,
+                                                build_id,
+                                                sizeof(build_id));
+                        build_id__sprintf(build_id,sizeof(build_id), build_id_hex);
+                        build_id__filename((char *)build_id_hex,buf,bufsz);
+                } else {
+                       scnprintf(buf, bufsz, "%s/proc/kallsyms", machine->root_dir);
+                }
+        }
  }
  
  const char *ref_reloc_sym_names[] = {"_text", "_stext", NULL};
@@ -710,7 +730,7 @@ const char *ref_reloc_sym_names[] = {"_text", "_stext", NULL};
   * Returns the name of the start symbol in *symbol_name. Pass in NULL as
   * symbol_name if it's not that important.
   */
-static u64 machine__get_running_kernel_start(struct machine *machine,
+static u64 machine__get_kallsyms_kernel_start(struct machine *machine,
                                              const char **symbol_name)
  {
         char filename[PATH_MAX];
@@ -738,7 +758,7 @@ static u64 machine__get_running_kernel_start(struct machine *machine,
  int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel)
  {
         enum map_type type;
-       u64 start = machine__get_running_kernel_start(machine, NULL);
+       u64 start = machine__get_kallsyms_kernel_start(machine, NULL);
  
         for (type = 0; type < MAP__NR_TYPES; ++type) {
                 struct kmap *kmap;
@@ -1083,7 +1103,8 @@ int machine__create_kernel_maps(struct machine *machine)
  {
         struct dso *kernel = machine__get_kernel(machine);
         const char *name;
-       u64 addr = machine__get_running_kernel_start(machine, &name);
+       u64 addr = machine__get_kallsyms_kernel_start(machine, &name);
+
         if (!addr)
                 return -1;
  
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index a35db828bd0d32d09068fed2e1c451e28db90dbf..854dd2105bd584786caec47344acf0636b0d990c 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -285,7 +285,8 @@ static struct perf_evsel *
  __add_event(struct list_head *list, int *idx,
             struct perf_event_attr *attr,
             char *name, struct cpu_map *cpus,
-           struct list_head *config_terms)
+           struct list_head *config_terms,
+           struct list_head *drv_config_terms)
  {
         struct perf_evsel *evsel;
  
@@ -304,6 +305,9 @@ __add_event(struct list_head *list, int *idx,
         if (config_terms)
                 list_splice(config_terms, &evsel->config_terms);
  
+       if (drv_config_terms)
+               list_splice(drv_config_terms, &evsel->drv_config_terms);
+
         list_add_tail(&evsel->node, list);
         return evsel;
  }
@@ -312,7 +316,8 @@ static int add_event(struct list_head *list, int *idx,
                      struct perf_event_attr *attr, char *name,
                      struct list_head *config_terms)
  {
-       return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM;
+       return __add_event(list, idx, attr, name,
+                          NULL, config_terms, NULL) ? 0 : -ENOMEM;
  }
  
  static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size)
@@ -823,7 +828,8 @@ static int config_term_pmu(struct perf_event_attr *attr,
                            struct parse_events_term *term,
                            struct parse_events_error *err)
  {
-       if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER)
+       if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER ||
+           term->type_term == PARSE_EVENTS__TERM_TYPE_DRV_CFG)
                 /*
                  * Always succeed for sysfs terms, as we dont know
                  * at this point what type they need to have.
@@ -869,10 +875,7 @@ static int config_attr(struct perf_event_attr *attr,
         return 0;
  }
  
-static int get_config_terms(struct list_head *head_config,
-                           struct list_head *head_terms __maybe_unused)
-{
-#define ADD_CONFIG_TERM(__type, __name, __val)                 \
+#define ADD_CONFIG_TERM(__type, __name, __val, __head_terms)   \
  do {                                                           \
         struct perf_evsel_config_term *__t;                     \
                                                                 \
@@ -883,33 +886,43 @@ do {                                                              \
         INIT_LIST_HEAD(&__t->list);                             \
         __t->type       = PERF_EVSEL__CONFIG_TERM_ ## __type;   \
         __t->val.__name = __val;                                \
-       list_add_tail(&__t->list, head_terms);                  \
+       list_add_tail(&__t->list, __head_terms);                \
  } while (0)
  
+static int get_config_terms(struct list_head *head_config,
+                           struct list_head *head_terms __maybe_unused)
+{
         struct parse_events_term *term;
  
         list_for_each_entry(term, head_config, list) {
                 switch (term->type_term) {
                 case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
-                       ADD_CONFIG_TERM(PERIOD, period, term->val.num);
+                       ADD_CONFIG_TERM(PERIOD, period,
+                                       term->val.num, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
-                       ADD_CONFIG_TERM(FREQ, freq, term->val.num);
+                       ADD_CONFIG_TERM(FREQ, freq,
+                                       term->val.num, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_TIME:
-                       ADD_CONFIG_TERM(TIME, time, term->val.num);
+                       ADD_CONFIG_TERM(TIME, time,
+                                       term->val.num, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
-                       ADD_CONFIG_TERM(CALLGRAPH, callgraph, term->val.str);
+                       ADD_CONFIG_TERM(CALLGRAPH, callgraph,
+                                       term->val.str, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
-                       ADD_CONFIG_TERM(STACK_USER, stack_user, term->val.num);
+                       ADD_CONFIG_TERM(STACK_USER, stack_user,
+                                       term->val.num, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_INHERIT:
-                       ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 1 : 0);
+                       ADD_CONFIG_TERM(INHERIT, inherit,
+                                       term->val.num ? 1 : 0, head_terms);
                         break;
                 case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
-                       ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1);
+                       ADD_CONFIG_TERM(INHERIT, inherit,
+                                       term->val.num ? 0 : 1, head_terms);
                         break;
                 default:
                         break;
@@ -919,6 +932,21 @@ do {                                                               \
         return 0;
  }
  
+static int get_drv_config_terms(struct list_head *head_config,
+                               struct list_head *head_terms)
+{
+       struct parse_events_term *term;
+
+       list_for_each_entry(term, head_config, list) {
+               if (term->type_term != PARSE_EVENTS__TERM_TYPE_DRV_CFG)
+                       continue;
+
+               ADD_CONFIG_TERM(DRV_CFG, drv_cfg, term->val.str, head_terms);
+       }
+
+       return 0;
+}
+
  int parse_events_add_tracepoint(struct list_head *list, int *idx,
                                 char *sys, char *event,
                                 struct parse_events_error *err,
@@ -989,6 +1017,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
         struct perf_pmu *pmu;
         struct perf_evsel *evsel;
         LIST_HEAD(config_terms);
+       LIST_HEAD(drv_config_terms);
  
         pmu = perf_pmu__find(name);
         if (!pmu)
@@ -1003,7 +1032,8 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
  
         if (!head_config) {
                 attr.type = pmu->type;
-               evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus, NULL);
+               evsel = __add_event(list, &data->idx, &attr,
+                                   NULL, pmu->cpus, NULL, NULL);
                 return evsel ? 0 : -ENOMEM;
         }
  
@@ -1020,12 +1050,15 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
         if (get_config_terms(head_config, &config_terms))
                 return -ENOMEM;
  
+       if (get_drv_config_terms(head_config, &drv_config_terms))
+               return -ENOMEM;
+
         if (perf_pmu__config(pmu, &attr, head_config, data->error))
                 return -EINVAL;
  
         evsel = __add_event(list, &data->idx, &attr,
                             pmu_event_name(head_config), pmu->cpus,
-                           &config_terms);
+                           &config_terms, &drv_config_terms);
         if (evsel) {
                 evsel->unit = info.unit;
                 evsel->scale = info.scale;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

index f1a6db107241b1c8ffaf03a3514ba1549df6fd1b..09c3ee2df45c71940e076444edefeaefe0cf2775 100644 (file)
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -68,7 +68,8 @@ enum {
         PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
         PARSE_EVENTS__TERM_TYPE_STACKSIZE,
         PARSE_EVENTS__TERM_TYPE_NOINHERIT,
-       PARSE_EVENTS__TERM_TYPE_INHERIT
+       PARSE_EVENTS__TERM_TYPE_INHERIT,
+       PARSE_EVENTS__TERM_TYPE_DRV_CFG,
  };
  
  struct parse_events_term {
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l

index 58c5831ffd5c22133f48a4c1a3a07721c71362fa..de260ed0dd542910850e7af4778ac808e46a42ff 100644 (file)
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -53,6 +53,16 @@ static int str(yyscan_t scanner, int token)
         return token;
  }
  
+static int drv_str(yyscan_t scanner, int token)
+{
+       YYSTYPE *yylval = parse_events_get_lval(scanner);
+       char *text = parse_events_get_text(scanner);
+
+       /* Strip off the '@' */
+       yylval->str = strdup(text + 1);
+       return token;
+}
+
  #define REWIND(__alloc)                                \
  do {                                                           \
         YYSTYPE *__yylval = parse_events_get_lval(yyscanner);   \
@@ -123,6 +133,7 @@ num_hex             0x[a-fA-F0-9]+
  num_raw_hex    [a-fA-F0-9]+
  name           [a-zA-Z_*?][a-zA-Z0-9_*?.]*
  name_minus     [a-zA-Z_*?][a-zA-Z0-9\-_*?.]*
+drv_cfg_term   [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
  /* If you add a modifier you need to update check_modifier() */
  modifier_event [ukhpPGHSDI]+
  modifier_bp    [rwx]{1,3}
@@ -196,6 +207,7 @@ no-inherit          { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
  ,                      { return ','; }
  "/"                    { BEGIN(INITIAL); return '/'; }
  {name_minus}           { return str(yyscanner, PE_NAME); }
+@{drv_cfg_term}                { return drv_str(yyscanner, PE_DRV_CFG_TERM); }
  }
  
  <mem>{
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y

index ad379968d4c10c0fb7bb2ddc3bacce3dc1166f43..d35c10275ba4e4722a021083885909bd635b9e6e 100644 (file)
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -48,6 +48,7 @@ static inc_group_count(struct list_head *list,
  %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP
  %token PE_ERROR
  %token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
+%token PE_DRV_CFG_TERM
  %type <num> PE_VALUE
  %type <num> PE_VALUE_SYM_HW
  %type <num> PE_VALUE_SYM_SW
@@ -62,6 +63,7 @@ static inc_group_count(struct list_head *list,
  %type <str> PE_MODIFIER_BP
  %type <str> PE_EVENT_NAME
  %type <str> PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT
+%type <str> PE_DRV_CFG_TERM
  %type <num> value_sym
  %type <head> event_config
  %type <term> event_term
@@ -573,6 +575,15 @@ PE_TERM
         ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
         $$ = term;
  }
+|
+PE_DRV_CFG_TERM
+{
+       struct parse_events_term *term;
+
+       ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_DRV_CFG,
+                                       $1, $1, &@1, NULL));
+       $$ = term;
+}
  
  sep_dc: ':' |
  
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c

index a8e825fca42af9aeab46de300855e5e22ea8ffc0..df49c0035170cc423f091e89bb34c8d2196886f9 100644 (file)
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -806,6 +806,8 @@ static void python_process_general_event(struct perf_sample *sample,
                         PyInt_FromLong(sample->cpu));
         pydict_set_item_string_decref(dict_sample, "ip",
                         PyLong_FromUnsignedLongLong(sample->ip));
+        pydict_set_item_string_decref(dict_sample, "addr",
+                       PyLong_FromUnsignedLongLong(sample->addr));
         pydict_set_item_string_decref(dict_sample, "time",
                         PyLong_FromUnsignedLongLong(sample->time));
         pydict_set_item_string_decref(dict_sample, "period",
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c

index 468de95bc8bb1a73a666fc6eb92275d0ec1cd4f9..010ff659b82fa667c70fffa85fd8e654c7d078d4 100644 (file)
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -224,14 +224,6 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused,
         return 0;
  }
  
-static int process_build_id_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
  static int process_finished_round_stub(struct perf_tool *tool __maybe_unused,
                                        union perf_event *event __maybe_unused,
                                        struct ordered_events *oe __maybe_unused)
@@ -244,23 +236,6 @@ static int process_finished_round(struct perf_tool *tool,
                                   union perf_event *event,
                                   struct ordered_events *oe);
  
-static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
-                                union perf_event *event __maybe_unused,
-                                struct perf_session *perf_session
-                                __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
-static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
-                               union perf_event *event __maybe_unused,
-                               struct perf_session *session __maybe_unused)
-{
-       dump_printf(": unhandled!\n");
-       return 0;
-}
-
  static int skipn(int fd, off_t n)
  {
         char buf[4096];
@@ -287,10 +262,9 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
         return event->auxtrace.size;
  }
  
-static
-int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
-                                     union perf_event *event __maybe_unused,
-                                     struct perf_session *session __maybe_unused)
+static int process_event_op2_stub(struct perf_tool *tool __maybe_unused,
+                                 union perf_event *event __maybe_unused,
+                                 struct perf_session *session __maybe_unused)
  {
         dump_printf(": unhandled!\n");
         return 0;
@@ -331,7 +305,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
         if (tool->tracing_data == NULL)
                 tool->tracing_data = process_event_synth_tracing_data_stub;
         if (tool->build_id == NULL)
-               tool->build_id = process_build_id_stub;
+               tool->build_id = process_event_op2_stub;
         if (tool->finished_round == NULL) {
                 if (tool->ordered_events)
                         tool->finished_round = process_finished_round;
@@ -339,13 +313,13 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
                         tool->finished_round = process_finished_round_stub;
         }
         if (tool->id_index == NULL)
-               tool->id_index = process_id_index_stub;
+               tool->id_index = process_event_op2_stub;
         if (tool->auxtrace_info == NULL)
-               tool->auxtrace_info = process_event_auxtrace_info_stub;
+               tool->auxtrace_info = process_event_op2_stub;
         if (tool->auxtrace == NULL)
                 tool->auxtrace = process_event_auxtrace_stub;
         if (tool->auxtrace_error == NULL)
-               tool->auxtrace_error = process_event_auxtrace_error_stub;
+               tool->auxtrace_error = process_event_op2_stub;
  }
  
  static void swap_sample_id_all(union perf_event *event, void *data)
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c

index 48906333a858c06b41991f33cb0f5f2cdd4f68d9..9be16712ce745bfefff9659a454266c16d281df1 100644 (file)
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -344,7 +344,7 @@ int dso__load_sym(struct dso *dso, struct map *map __maybe_unused,
         if (ret >= 0)
                 dso->is_64_bit = ret;
  
-       if (filename__read_build_id(ss->name, build_id, BUILD_ID_SIZE) > 0) {
+       if ((!dso->has_build_id) && (filename__read_build_id(ss->name, build_id, BUILD_ID_SIZE) > 0)) {
                 dso__set_build_id(dso, build_id);
         }
         return 0;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index 520a32a12f8aa4ceede1909b202c61b78dfb1fc4..754711be8b251b2c0e33d1a57053710b20dc8edf 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1468,7 +1468,8 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
          * Read the build id if possible. This is required for
          * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work
          */
-       if (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0)
+       if ((!dso->has_build_id) &&
+           (filename__read_build_id(dso->name, build_id, BUILD_ID_SIZE) > 0))
                 dso__set_build_id(dso, build_id);
  
         /*
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c

index 487d6357b7e75039ff914077eb11d890f938e51d..453eafd4dd6e5fa3f48b458e2fd62df0792c3554 100644 (file)
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -28,6 +28,7 @@
  
  #include <asm/kvm_emulate.h>
  #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
  #include <asm/kvm_mmu.h>
  
  /* These are for GICv2 emulation only */
@@ -36,18 +37,12 @@
  #define GICH_LR_PHYSID_CPUID           (7UL << GICH_LR_PHYSID_CPUID_SHIFT)
  #define ICH_LR_VIRTUALID_MASK          (BIT_ULL(32) - 1)
  
-/*
- * LRs are stored in reverse order in memory. make sure we index them
- * correctly.
- */
-#define LR_INDEX(lr)                   (VGIC_V3_MAX_LRS - 1 - lr)
-
  static u32 ich_vtr_el2;
  
  static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
  {
         struct vgic_lr lr_desc;
-       u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)];
+       u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[VGIC_V3_LR_INDEX(lr)];
  
         if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
                 lr_desc.irq = val & ICH_LR_VIRTUALID_MASK;
@@ -111,7 +106,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
                 lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
         }
  
-       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
+       vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[VGIC_V3_LR_INDEX(lr)] = lr_val;
  
         if (!(lr_desc.state & LR_STATE_MASK))
                 vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
author	Alex Shi <alex.shi@linaro.org>
	Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)
committer	Alex Shi <alex.shi@linaro.org>
	Sat, 4 Feb 2017 04:11:15 +0000 (12:11 +0800)